git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
RGW - Zipper - move rgw/store to rgw/driver
authorDaniel Gryniewicz <dang@redhat.com>
Fri, 2 Dec 2022 16:34:44 +0000 (11:34 -0500)
committerDaniel Gryniewicz <dang@redhat.com>
Thu, 8 Dec 2022 19:42:29 +0000 (14:42 -0500)
Signed-off-by: Daniel Gryniewicz <dang@redhat.com>
241 files changed:
src/rgw/CMakeLists.txt
src/rgw/driver/daos/README.md [new file with mode: 0644]
src/rgw/driver/dbstore/CMakeLists.txt [new file with mode: 0644]
src/rgw/driver/dbstore/README.md [new file with mode: 0644]
src/rgw/driver/dbstore/common/connection_pool.h [new file with mode: 0644]
src/rgw/driver/dbstore/common/dbstore.cc [new file with mode: 0644]
src/rgw/driver/dbstore/common/dbstore.h [new file with mode: 0644]
src/rgw/driver/dbstore/common/dbstore_log.h [new file with mode: 0644]
src/rgw/driver/dbstore/config/sqlite.cc [new file with mode: 0644]
src/rgw/driver/dbstore/config/sqlite.h [new file with mode: 0644]
src/rgw/driver/dbstore/config/sqlite_schema.h [new file with mode: 0644]
src/rgw/driver/dbstore/config/store.cc [new file with mode: 0644]
src/rgw/driver/dbstore/config/store.h [new file with mode: 0644]
src/rgw/driver/dbstore/dbstore_main.cc [new file with mode: 0644]
src/rgw/driver/dbstore/dbstore_mgr.cc [new file with mode: 0644]
src/rgw/driver/dbstore/dbstore_mgr.h [new file with mode: 0644]
src/rgw/driver/dbstore/sqlite/CMakeLists.txt [new file with mode: 0644]
src/rgw/driver/dbstore/sqlite/connection.cc [new file with mode: 0644]
src/rgw/driver/dbstore/sqlite/connection.h [new file with mode: 0644]
src/rgw/driver/dbstore/sqlite/error.cc [new file with mode: 0644]
src/rgw/driver/dbstore/sqlite/error.h [new file with mode: 0644]
src/rgw/driver/dbstore/sqlite/sqliteDB.cc [new file with mode: 0644]
src/rgw/driver/dbstore/sqlite/sqliteDB.h [new file with mode: 0644]
src/rgw/driver/dbstore/sqlite/statement.cc [new file with mode: 0644]
src/rgw/driver/dbstore/sqlite/statement.h [new file with mode: 0644]
src/rgw/driver/dbstore/tests/CMakeLists.txt [new file with mode: 0644]
src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc [new file with mode: 0644]
src/rgw/driver/dbstore/tests/dbstore_tests.cc [new file with mode: 0644]
src/rgw/driver/immutable_config/store.cc [new file with mode: 0644]
src/rgw/driver/immutable_config/store.h [new file with mode: 0644]
src/rgw/driver/json_config/store.cc [new file with mode: 0644]
src/rgw/driver/json_config/store.h [new file with mode: 0644]
src/rgw/driver/rados/cls_fifo_legacy.cc [new file with mode: 0644]
src/rgw/driver/rados/cls_fifo_legacy.h [new file with mode: 0644]
src/rgw/driver/rados/config/impl.cc [new file with mode: 0644]
src/rgw/driver/rados/config/impl.h [new file with mode: 0644]
src/rgw/driver/rados/config/period.cc [new file with mode: 0644]
src/rgw/driver/rados/config/period_config.cc [new file with mode: 0644]
src/rgw/driver/rados/config/realm.cc [new file with mode: 0644]
src/rgw/driver/rados/config/store.cc [new file with mode: 0644]
src/rgw/driver/rados/config/store.h [new file with mode: 0644]
src/rgw/driver/rados/config/zone.cc [new file with mode: 0644]
src/rgw/driver/rados/config/zonegroup.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_bucket.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_bucket.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_bucket_sync.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_bucket_sync.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_cr_rados.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_cr_rados.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_cr_tools.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_cr_tools.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_d3n_datacache.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_d3n_datacache.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_data_sync.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_data_sync.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_datalog.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_datalog.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_datalog_notify.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_datalog_notify.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_etag_verifier.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_etag_verifier.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_gc.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_gc.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_gc_log.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_lc_tier.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_lc_tier.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_log_backing.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_log_backing.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_metadata.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_metadata.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_notify.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_notify.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_obj_manifest.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_obj_manifest.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_object_expirer_core.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_object_expirer_core.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_otp.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_otp.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_period.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_rest_pubsub.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_rest_pubsub.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_rest_realm.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_rest_realm.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_rest_user.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_rest_user.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_sal_rados.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_sal_rados.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_service.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_service.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_counters.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_counters.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_error_repo.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_error_repo.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_module.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_module.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_module_aws.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_module_aws.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_module_es.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_module_es.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_module_es_rest.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_module_es_rest.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_module_log.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_module_log.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_trace.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_sync_trace.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_tools.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_tools.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_trim_bilog.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_trim_bilog.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_trim_datalog.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_trim_datalog.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_trim_mdlog.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_trim_mdlog.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_user.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_user.h [new file with mode: 0644]
src/rgw/driver/rados/rgw_zone.cc [new file with mode: 0644]
src/rgw/driver/rados/rgw_zone.h [new file with mode: 0644]
src/rgw/rgw_rados.cc
src/rgw/rgw_sal.cc
src/rgw/rgw_sal_dbstore.h
src/rgw/store/daos/README.md [deleted file]
src/rgw/store/dbstore/CMakeLists.txt [deleted file]
src/rgw/store/dbstore/README.md [deleted file]
src/rgw/store/dbstore/common/connection_pool.h [deleted file]
src/rgw/store/dbstore/common/dbstore.cc [deleted file]
src/rgw/store/dbstore/common/dbstore.h [deleted file]
src/rgw/store/dbstore/common/dbstore_log.h [deleted file]
src/rgw/store/dbstore/config/sqlite.cc [deleted file]
src/rgw/store/dbstore/config/sqlite.h [deleted file]
src/rgw/store/dbstore/config/sqlite_schema.h [deleted file]
src/rgw/store/dbstore/config/store.cc [deleted file]
src/rgw/store/dbstore/config/store.h [deleted file]
src/rgw/store/dbstore/dbstore_main.cc [deleted file]
src/rgw/store/dbstore/dbstore_mgr.cc [deleted file]
src/rgw/store/dbstore/dbstore_mgr.h [deleted file]
src/rgw/store/dbstore/sqlite/CMakeLists.txt [deleted file]
src/rgw/store/dbstore/sqlite/connection.cc [deleted file]
src/rgw/store/dbstore/sqlite/connection.h [deleted file]
src/rgw/store/dbstore/sqlite/error.cc [deleted file]
src/rgw/store/dbstore/sqlite/error.h [deleted file]
src/rgw/store/dbstore/sqlite/sqliteDB.cc [deleted file]
src/rgw/store/dbstore/sqlite/sqliteDB.h [deleted file]
src/rgw/store/dbstore/sqlite/statement.cc [deleted file]
src/rgw/store/dbstore/sqlite/statement.h [deleted file]
src/rgw/store/dbstore/tests/CMakeLists.txt [deleted file]
src/rgw/store/dbstore/tests/dbstore_mgr_tests.cc [deleted file]
src/rgw/store/dbstore/tests/dbstore_tests.cc [deleted file]
src/rgw/store/immutable_config/store.cc [deleted file]
src/rgw/store/immutable_config/store.h [deleted file]
src/rgw/store/json_config/store.cc [deleted file]
src/rgw/store/json_config/store.h [deleted file]
src/rgw/store/rados/cls_fifo_legacy.cc [deleted file]
src/rgw/store/rados/cls_fifo_legacy.h [deleted file]
src/rgw/store/rados/config/impl.cc [deleted file]
src/rgw/store/rados/config/impl.h [deleted file]
src/rgw/store/rados/config/period.cc [deleted file]
src/rgw/store/rados/config/period_config.cc [deleted file]
src/rgw/store/rados/config/realm.cc [deleted file]
src/rgw/store/rados/config/store.cc [deleted file]
src/rgw/store/rados/config/store.h [deleted file]
src/rgw/store/rados/config/zone.cc [deleted file]
src/rgw/store/rados/config/zonegroup.cc [deleted file]
src/rgw/store/rados/rgw_bucket.cc [deleted file]
src/rgw/store/rados/rgw_bucket.h [deleted file]
src/rgw/store/rados/rgw_bucket_sync.cc [deleted file]
src/rgw/store/rados/rgw_bucket_sync.h [deleted file]
src/rgw/store/rados/rgw_cr_rados.cc [deleted file]
src/rgw/store/rados/rgw_cr_rados.h [deleted file]
src/rgw/store/rados/rgw_cr_tools.cc [deleted file]
src/rgw/store/rados/rgw_cr_tools.h [deleted file]
src/rgw/store/rados/rgw_d3n_datacache.cc [deleted file]
src/rgw/store/rados/rgw_d3n_datacache.h [deleted file]
src/rgw/store/rados/rgw_data_sync.cc [deleted file]
src/rgw/store/rados/rgw_data_sync.h [deleted file]
src/rgw/store/rados/rgw_datalog.cc [deleted file]
src/rgw/store/rados/rgw_datalog.h [deleted file]
src/rgw/store/rados/rgw_datalog_notify.cc [deleted file]
src/rgw/store/rados/rgw_datalog_notify.h [deleted file]
src/rgw/store/rados/rgw_etag_verifier.cc [deleted file]
src/rgw/store/rados/rgw_etag_verifier.h [deleted file]
src/rgw/store/rados/rgw_gc.cc [deleted file]
src/rgw/store/rados/rgw_gc.h [deleted file]
src/rgw/store/rados/rgw_gc_log.cc [deleted file]
src/rgw/store/rados/rgw_lc_tier.cc [deleted file]
src/rgw/store/rados/rgw_lc_tier.h [deleted file]
src/rgw/store/rados/rgw_log_backing.cc [deleted file]
src/rgw/store/rados/rgw_log_backing.h [deleted file]
src/rgw/store/rados/rgw_metadata.cc [deleted file]
src/rgw/store/rados/rgw_metadata.h [deleted file]
src/rgw/store/rados/rgw_notify.cc [deleted file]
src/rgw/store/rados/rgw_notify.h [deleted file]
src/rgw/store/rados/rgw_obj_manifest.cc [deleted file]
src/rgw/store/rados/rgw_obj_manifest.h [deleted file]
src/rgw/store/rados/rgw_object_expirer_core.cc [deleted file]
src/rgw/store/rados/rgw_object_expirer_core.h [deleted file]
src/rgw/store/rados/rgw_otp.cc [deleted file]
src/rgw/store/rados/rgw_otp.h [deleted file]
src/rgw/store/rados/rgw_period.cc [deleted file]
src/rgw/store/rados/rgw_rest_pubsub.cc [deleted file]
src/rgw/store/rados/rgw_rest_pubsub.h [deleted file]
src/rgw/store/rados/rgw_rest_realm.cc [deleted file]
src/rgw/store/rados/rgw_rest_realm.h [deleted file]
src/rgw/store/rados/rgw_rest_user.cc [deleted file]
src/rgw/store/rados/rgw_rest_user.h [deleted file]
src/rgw/store/rados/rgw_sal_rados.cc [deleted file]
src/rgw/store/rados/rgw_sal_rados.h [deleted file]
src/rgw/store/rados/rgw_service.cc [deleted file]
src/rgw/store/rados/rgw_service.h [deleted file]
src/rgw/store/rados/rgw_sync.cc [deleted file]
src/rgw/store/rados/rgw_sync.h [deleted file]
src/rgw/store/rados/rgw_sync_counters.cc [deleted file]
src/rgw/store/rados/rgw_sync_counters.h [deleted file]
src/rgw/store/rados/rgw_sync_error_repo.cc [deleted file]
src/rgw/store/rados/rgw_sync_error_repo.h [deleted file]
src/rgw/store/rados/rgw_sync_module.cc [deleted file]
src/rgw/store/rados/rgw_sync_module.h [deleted file]
src/rgw/store/rados/rgw_sync_module_aws.cc [deleted file]
src/rgw/store/rados/rgw_sync_module_aws.h [deleted file]
src/rgw/store/rados/rgw_sync_module_es.cc [deleted file]
src/rgw/store/rados/rgw_sync_module_es.h [deleted file]
src/rgw/store/rados/rgw_sync_module_es_rest.cc [deleted file]
src/rgw/store/rados/rgw_sync_module_es_rest.h [deleted file]
src/rgw/store/rados/rgw_sync_module_log.cc [deleted file]
src/rgw/store/rados/rgw_sync_module_log.h [deleted file]
src/rgw/store/rados/rgw_sync_trace.cc [deleted file]
src/rgw/store/rados/rgw_sync_trace.h [deleted file]
src/rgw/store/rados/rgw_tools.cc [deleted file]
src/rgw/store/rados/rgw_tools.h [deleted file]
src/rgw/store/rados/rgw_trim_bilog.cc [deleted file]
src/rgw/store/rados/rgw_trim_bilog.h [deleted file]
src/rgw/store/rados/rgw_trim_datalog.cc [deleted file]
src/rgw/store/rados/rgw_trim_datalog.h [deleted file]
src/rgw/store/rados/rgw_trim_mdlog.cc [deleted file]
src/rgw/store/rados/rgw_trim_mdlog.h [deleted file]
src/rgw/store/rados/rgw_user.cc [deleted file]
src/rgw/store/rados/rgw_user.h [deleted file]
src/rgw/store/rados/rgw_zone.cc [deleted file]
src/rgw/store/rados/rgw_zone.h [deleted file]
src/test/CMakeLists.txt

index 7b6ac917f854565b040ff0737f7b12819cb8c8c2..8d663be01fb56af8eff591d38a19fcea08298c0c 100644 (file)
@@ -142,57 +142,57 @@ set(librgw_common_srcs
   rgw_bucket_encryption.cc
   rgw_tracer.cc
   rgw_lua_background.cc
-  store/rados/cls_fifo_legacy.cc
-  store/rados/rgw_bucket.cc
-  store/rados/rgw_bucket_sync.cc
-  store/rados/rgw_cr_rados.cc
-  store/rados/rgw_cr_tools.cc
-  store/rados/rgw_d3n_datacache.cc
-  store/rados/rgw_datalog.cc
-  store/rados/rgw_datalog_notify.cc
-  store/rados/rgw_data_sync.cc
-  store/rados/rgw_etag_verifier.cc
-  store/rados/rgw_gc.cc
-  store/rados/rgw_gc_log.cc
-  store/rados/rgw_lc_tier.cc
-  store/rados/rgw_log_backing.cc
-  store/rados/rgw_metadata.cc
-  store/rados/rgw_notify.cc
-  store/rados/rgw_obj_manifest.cc
-  store/rados/rgw_object_expirer_core.cc
-  store/rados/rgw_otp.cc
-  store/rados/rgw_period.cc
-  store/rados/rgw_rest_pubsub.cc
-  store/rados/rgw_rest_realm.cc
-  store/rados/rgw_rest_user.cc
-  store/rados/rgw_sal_rados.cc
-  store/rados/rgw_service.cc
-  store/rados/rgw_sync.cc
-  store/rados/rgw_sync_counters.cc
-  store/rados/rgw_sync_error_repo.cc
-  store/rados/rgw_sync_module.cc
-  store/rados/rgw_sync_module_aws.cc
-  store/rados/rgw_sync_module_es.cc
-  store/rados/rgw_sync_module_es_rest.cc
-  store/rados/rgw_sync_module_log.cc
-  store/rados/rgw_sync_trace.cc
-  store/rados/rgw_tools.cc
-  store/rados/rgw_trim_bilog.cc
-  store/rados/rgw_trim_datalog.cc
-  store/rados/rgw_trim_mdlog.cc
-  store/rados/rgw_user.cc
-  store/rados/rgw_zone.cc)
+  driver/rados/cls_fifo_legacy.cc
+  driver/rados/rgw_bucket.cc
+  driver/rados/rgw_bucket_sync.cc
+  driver/rados/rgw_cr_rados.cc
+  driver/rados/rgw_cr_tools.cc
+  driver/rados/rgw_d3n_datacache.cc
+  driver/rados/rgw_datalog.cc
+  driver/rados/rgw_datalog_notify.cc
+  driver/rados/rgw_data_sync.cc
+  driver/rados/rgw_etag_verifier.cc
+  driver/rados/rgw_gc.cc
+  driver/rados/rgw_gc_log.cc
+  driver/rados/rgw_lc_tier.cc
+  driver/rados/rgw_log_backing.cc
+  driver/rados/rgw_metadata.cc
+  driver/rados/rgw_notify.cc
+  driver/rados/rgw_obj_manifest.cc
+  driver/rados/rgw_object_expirer_core.cc
+  driver/rados/rgw_otp.cc
+  driver/rados/rgw_period.cc
+  driver/rados/rgw_rest_pubsub.cc
+  driver/rados/rgw_rest_realm.cc
+  driver/rados/rgw_rest_user.cc
+  driver/rados/rgw_sal_rados.cc
+  driver/rados/rgw_service.cc
+  driver/rados/rgw_sync.cc
+  driver/rados/rgw_sync_counters.cc
+  driver/rados/rgw_sync_error_repo.cc
+  driver/rados/rgw_sync_module.cc
+  driver/rados/rgw_sync_module_aws.cc
+  driver/rados/rgw_sync_module_es.cc
+  driver/rados/rgw_sync_module_es_rest.cc
+  driver/rados/rgw_sync_module_log.cc
+  driver/rados/rgw_sync_trace.cc
+  driver/rados/rgw_tools.cc
+  driver/rados/rgw_trim_bilog.cc
+  driver/rados/rgw_trim_datalog.cc
+  driver/rados/rgw_trim_mdlog.cc
+  driver/rados/rgw_user.cc
+  driver/rados/rgw_zone.cc)
 
 list(APPEND librgw_common_srcs
-  store/immutable_config/store.cc
-  store/json_config/store.cc
-  store/rados/config/impl.cc
-  store/rados/config/period.cc
-  store/rados/config/period_config.cc
-  store/rados/config/realm.cc
-  store/rados/config/store.cc
-  store/rados/config/zone.cc
-  store/rados/config/zonegroup.cc)
+  driver/immutable_config/store.cc
+  driver/json_config/store.cc
+  driver/rados/config/impl.cc
+  driver/rados/config/period.cc
+  driver/rados/config/period_config.cc
+  driver/rados/config/realm.cc
+  driver/rados/config/store.cc
+  driver/rados/config/zone.cc
+  driver/rados/config/zonegroup.cc)
 
 if(WITH_RADOSGW_AMQP_ENDPOINT)
   list(APPEND librgw_common_srcs rgw_amqp.cc)
@@ -201,7 +201,7 @@ if(WITH_RADOSGW_KAFKA_ENDPOINT)
   list(APPEND librgw_common_srcs rgw_kafka.cc)
 endif()
 if(WITH_RADOSGW_DBSTORE)
-  add_subdirectory(store/dbstore)
+  add_subdirectory(driver/dbstore)
   list(APPEND librgw_common_srcs rgw_sal_dbstore.cc)
 endif()
 if(WITH_RADOSGW_MOTR)
@@ -253,7 +253,7 @@ target_link_libraries(rgw_common
     spawn)
 target_include_directories(rgw_common
   PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/services"
-  PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/store/rados"
+  PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados"
   PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw"
   PUBLIC "${LUA_INCLUDE_DIR}")
 if(WITH_RADOSGW_KAFKA_ENDPOINT)
@@ -334,7 +334,7 @@ set(rgw_a_srcs
   rgw_rest_log.cc
   rgw_rest_metadata.cc
   rgw_rest_ratelimit.cc
-  store/rados/rgw_rest_realm.cc
+  driver/rados/rgw_rest_realm.cc
   rgw_rest_sts.cc
   rgw_rest_swift.cc
   rgw_rest_usage.cc
@@ -359,7 +359,7 @@ target_include_directories(rgw_a
   SYSTEM PUBLIC "../rapidjson/include"
   PUBLIC "${CMAKE_SOURCE_DIR}/src/dmclock/support/src"
   PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw"
-  PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/store/rados"
+  PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados"
   PRIVATE "${CMAKE_SOURCE_DIR}/src/libkmip")
 
 if(WITH_RADOSGW_AMQP_ENDPOINT)
diff --git a/src/rgw/driver/daos/README.md b/src/rgw/driver/daos/README.md
new file mode 100644 (file)
index 0000000..de6d215
--- /dev/null
@@ -0,0 +1,47 @@
+# DAOS
+
+Standalone RADOS Gateway (RGW) on [DAOS](http://daos.io/) (Experimental)
+
+## CMake Option
+
+Add below cmake option
+
+```bash
+    -DWITH_RADOSGW_DAOS=ON
+```
+
+## Build
+
+```bash
+    cd build
+    ninja [vstart]
+```
+
+## Running Test cluster
+
+Edit ceph.conf to add below option
+
+```conf
+    [client]
+        rgw backend store = daos
+```
+
+Restart vstart cluster or just RGW server
+
+```bash
+    [..] RGW=1 ../src/vstart.sh -d
+```
+
+The above configuration brings up an RGW server on DAOS.
+
+## Creating a test user
+
+To create a `testid` user to be used for s3 operations, use the following command:
+
+```bash
+akey='0555b35654ad1656d804'
+skey='h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q=='
+radosgw-admin user create --uid testid \
+    --access-key $akey --secret $skey \
+    --display-name 'M. Tester' --email tester@ceph.com --no-mon-config
+```
diff --git a/src/rgw/driver/dbstore/CMakeLists.txt b/src/rgw/driver/dbstore/CMakeLists.txt
new file mode 100644 (file)
index 0000000..0d34d32
--- /dev/null
@@ -0,0 +1,72 @@
+#need to update cmake version here
+cmake_minimum_required(VERSION 3.14.0)
+project(dbstore)
+
+option(USE_SQLITE "Enable SQLITE DB" ON)
+
+set (CMAKE_INCLUDE_DIR ${CMAKE_INCLUDE_DIR} "${CMAKE_CURRENT_SOURCE_DIR}/common")
+
+set(dbstore_srcs
+    common/dbstore_log.h
+    common/dbstore.h
+    common/dbstore.cc
+    config/store.cc)
+IF(USE_SQLITE)
+  list(APPEND dbstore_srcs
+      config/sqlite.cc
+      sqlite/connection.cc
+      sqlite/error.cc
+      sqlite/statement.cc)
+endif()
+
+set(dbstore_mgr_srcs
+    dbstore_mgr.h
+    dbstore_mgr.cc
+    )
+
+add_library(dbstore_lib ${dbstore_srcs})
+target_include_directories(dbstore_lib
+    PUBLIC "${CMAKE_SOURCE_DIR}/src/fmt/include"
+    PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw"
+    PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados"
+    PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
+set(link_targets spawn)
+if(WITH_JAEGER)
+  list(APPEND link_targets jaeger_base)
+endif()
+list(APPEND link_targets rgw_common)
+target_link_libraries(dbstore_lib PUBLIC ${link_targets})
+
+set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} dbstore_lib)
+
+IF(USE_SQLITE)
+  add_subdirectory(sqlite)
+  set(CMAKE_INCLUDE_DIR ${CMAKE_INCLUDE_DIR} "${CMAKE_CURRENT_SOURCE_DIR}/sqlite")
+  add_compile_definitions(SQLITE_ENABLED=1)
+  set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} rgw_common)
+  set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} sqlite_db)
+  add_dependencies(sqlite_db dbstore_lib)
+ENDIF()
+
+# add pthread library
+set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} pthread)
+
+find_package(gtest QUIET)
+if(WITH_TESTS)
+    add_subdirectory(tests)
+else()
+       message(WARNING "Gtest not enabled")
+endif()
+
+include_directories(${CMAKE_INCLUDE_DIR})
+add_library(dbstore STATIC ${dbstore_mgr_srcs})
+target_link_libraries(dbstore ${CMAKE_LINK_LIBRARIES})
+
+# testing purpose
+set(dbstore_main_srcs
+    dbstore_main.cc)
+
+set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} dbstore)
+add_executable(dbstore-bin ${dbstore_main_srcs})
+add_dependencies(dbstore-bin dbstore)
+target_link_libraries(dbstore-bin ${CMAKE_LINK_LIBRARIES})
diff --git a/src/rgw/driver/dbstore/README.md b/src/rgw/driver/dbstore/README.md
new file mode 100644 (file)
index 0000000..0867bc2
--- /dev/null
@@ -0,0 +1,53 @@
+# DBStore
+Standalone RADOS Gateway (RGW) on DBStore (Experimental)
+
+
+## CMake Option
+Add below cmake option (enabled by default)
+
+    -DWITH_RADOSGW_DBSTORE=ON 
+
+
+## Build
+
+    cd build
+    ninja [vstart]
+
+
+## Running Test cluster
+Edit ceph.conf to add below option
+
+    [client]
+        rgw backend store = dbstore
+
+Start vstart cluster
+
+    [..] RGW=1 ../src/vstart.sh -o rgw_backend_store=dbstore -n -d
+
+The above vstart command brings up an RGW server on dbstore and creates a few default users (e.g., testid) to be used for s3 operations.
+
+`radosgw-admin` can be used to create and remove other users.
+
+
+By default, dbstore creates a .db file *'/var/lib/ceph/radosgw/dbstore-default_ns.db'* to store the data. This can be configured using the below options in ceph.conf
+
+    [client]
+        dbstore db dir = <path for the directory for storing the db backend store data>
+        dbstore db name prefix = <prefix to the file names created by db backend store>
+
+
+## DBStore Unit Tests
+To execute DBStore unit test cases (using Gtest framework), from build directory
+
+    ninja unittest_dbstore_tests
+    ./bin/unittest_dbstore_tests [logfile] [loglevel]
+    (default logfile: rgw_dbstore_tests.log, loglevel: 20)
+    ninja unittest_dbstore_mgr_tests
+    ./bin/unittest_dbstore_mgr_tests
+
+To execute Sample test file
+
+    ninja src/rgw/driver/dbstore/install
+    ./bin/dbstore-bin [logfile] [loglevel]
+    (default logfile: rgw_dbstore_bin.log, loglevel: 20)
+
diff --git a/src/rgw/driver/dbstore/common/connection_pool.h b/src/rgw/driver/dbstore/common/connection_pool.h
new file mode 100644 (file)
index 0000000..07f3c81
--- /dev/null
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <concepts>
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <boost/circular_buffer.hpp>
+#include "common/dout.h"
+
+namespace rgw::dbstore {
+
+template <typename Connection>
+class ConnectionHandle;
+
+/// A thread-safe base class that manages a fixed-size pool of generic database
+/// connections and supports the reclamation of ConnectionHandles. This class
+/// is the subset of ConnectionPool which doesn't depend on the Factory type.
+template <typename Connection>
+class ConnectionPoolBase {
+ public:
+  ConnectionPoolBase(std::size_t max_connections)
+      : connections(max_connections)
+  {}
+ private:
+  friend class ConnectionHandle<Connection>;
+
+  // TODO: the caller may detect a connection error that prevents the connection
+  // from being reused. allow them to indicate these errors here
+  void put(std::unique_ptr<Connection> connection)
+  {
+    auto lock = std::scoped_lock{mutex};
+    connections.push_back(std::move(connection));
+
+    if (connections.size() == 1) { // was empty
+      cond.notify_one();
+    }
+  }
+ protected:
+  std::mutex mutex;
+  std::condition_variable cond;
+  boost::circular_buffer<std::unique_ptr<Connection>> connections;
+};
+
+/// Handle to a database connection borrowed from the pool. Automatically
+/// returns the connection to its pool on the handle's destruction.
+template <typename Connection>
+class ConnectionHandle {
+  ConnectionPoolBase<Connection>* pool = nullptr;
+  std::unique_ptr<Connection> conn;
+ public:
+  ConnectionHandle() noexcept = default;
+  ConnectionHandle(ConnectionPoolBase<Connection>* pool,
+                   std::unique_ptr<Connection> conn) noexcept
+    : pool(pool), conn(std::move(conn)) {}
+
+  ~ConnectionHandle() {
+    if (conn) {
+      pool->put(std::move(conn));
+    }
+  }
+
+  ConnectionHandle(ConnectionHandle&&) = default;
+  ConnectionHandle& operator=(ConnectionHandle&& o) noexcept {
+    if (conn) {
+      pool->put(std::move(conn));
+    }
+    conn = std::move(o.conn);
+    pool = o.pool;
+    return *this;
+  }
+
+  explicit operator bool() const noexcept { return static_cast<bool>(conn); }
+  Connection& operator*() const noexcept { return *conn; }
+  Connection* operator->() const noexcept { return conn.get(); }
+  Connection* get() const noexcept { return conn.get(); }
+};
+
+
+// factory_of concept requires the function signature:
+//   F(const DoutPrefixProvider*) -> std::unique_ptr<T>
+template <typename F, typename T>
+concept factory_of = requires (F factory, const DoutPrefixProvider* dpp) {
+  { factory(dpp) } -> std::same_as<std::unique_ptr<T>>;
+  requires std::move_constructible<F>;
+};
+
+
+/// Generic database connection pool that enforces a limit on open connections.
+template <typename Connection, factory_of<Connection> Factory>
+class ConnectionPool : public ConnectionPoolBase<Connection> {
+ public:
+  ConnectionPool(Factory factory, std::size_t max_connections)
+      : ConnectionPoolBase<Connection>(max_connections),
+        factory(std::move(factory))
+  {}
+
+  /// Borrow a connection from the pool. If all existing connections are in use,
+  /// use the connection factory to create another one. If we've reached the
+  /// limit on open connections, wait on a condition variable for the next one
+  /// returned to the pool.
+  auto get(const DoutPrefixProvider* dpp)
+      -> ConnectionHandle<Connection>
+  {
+    auto lock = std::unique_lock{this->mutex};
+    std::unique_ptr<Connection> conn;
+
+    if (!this->connections.empty()) {
+      // take an existing connection
+      conn = std::move(this->connections.front());
+      this->connections.pop_front();
+    } else if (total < this->connections.capacity()) {
+      // add another connection to the pool
+      conn = factory(dpp);
+      ++total;
+    } else {
+      // wait for the next put()
+      // TODO: support optional_yield
+      ldpp_dout(dpp, 4) << "ConnectionPool waiting on a connection" << dendl;
+      this->cond.wait(lock, [&] { return !this->connections.empty(); });
+      ldpp_dout(dpp, 4) << "ConnectionPool done waiting" << dendl;
+      conn = std::move(this->connections.front());
+      this->connections.pop_front();
+    }
+
+    return {this, std::move(conn)};
+  }
+ private:
+  Factory factory;
+  std::size_t total = 0;
+};
+
+} // namespace rgw::dbstore
diff --git a/src/rgw/driver/dbstore/common/dbstore.cc b/src/rgw/driver/dbstore/common/dbstore.cc
new file mode 100644 (file)
index 0000000..3936368
--- /dev/null
@@ -0,0 +1,2245 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "dbstore.h"
+
+using namespace std;
+
+namespace rgw { namespace store {
+
+// Process-wide registry mapping bucket name -> per-bucket ObjectOp table.
+// The mutating accessors below guard it with DB::mtx.
+map<string, class ObjectOp*> DB::objectmap = {};
+
+// Returns a copy of the registry.
+// NOTE(review): copies the whole map on each call and takes no lock —
+// confirm callers neither need a live view nor race with inserts/deletes.
+map<string, class ObjectOp*> DB::getObjectMap() {
+  return DB::objectmap;
+}
+
+// Open the backing database and prepare its prepared-op tables.
+// 'loglevel' > 0 overrides the rgw subsystem log level; a non-empty
+// 'logfile' redirects the ceph log. Returns 0 on success, negative on error.
+int DB::Initialize(string logfile, int loglevel)
+{
+  int ret = -1;
+  const DoutPrefixProvider *dpp = get_def_dpp();
+
+  if (!cct) {
+    // no ceph context yet, so plain stdout is the only channel available
+    cout << "Failed to Initialize. No ceph Context \n";
+    return -1;
+  }
+
+  if (loglevel > 0) {
+    cct->_conf->subsys.set_log_level(ceph_subsys_rgw, loglevel);
+  }
+  if (!logfile.empty()) {
+    cct->_log->set_log_file(logfile);
+    cct->_log->reopen_log_file();
+  }
+
+
+  db = openDB(dpp);
+
+  if (!db) {
+    ldpp_dout(dpp, 0) <<"Failed to open database " << dendl;
+    return ret;
+  }
+
+  ret = InitializeDBOps(dpp);
+
+  if (ret) {
+    // roll back the open handle so the object stays consistent on failure
+    ldpp_dout(dpp, 0) <<"InitializeDBOps failed " << dendl;
+    closeDB(dpp);
+    db = NULL;
+    return ret;
+  }
+
+  // NOTE(review): success logged at level 0 (error verbosity) — confirm
+  // whether a higher debug level was intended.
+  ldpp_dout(dpp, 0) << "DB successfully initialized - name:" \
+    << db_name << "" << dendl;
+
+  return ret;
+}
+
+int DB::createGC(const DoutPrefixProvider *dpp) {
+  /* Spawn the background garbage-collection worker for this DB instance. */
+  gc_worker = std::make_unique<DB::GC>(dpp, this);
+  gc_worker->create("db_gc");
+
+  return 0;
+}
+
+int DB::stopGC() {
+  /* Signal the GC worker to stop and wait for it; no-op if never started. */
+  if (!gc_worker) {
+    return 0;
+  }
+  gc_worker->signal_stop();
+  gc_worker->join();
+  return 0;
+}
+
+int DB::Destroy(const DoutPrefixProvider *dpp)
+{
+  /* Tear down: stop the GC worker, then close the underlying database.
+   * Safe to call when the database was never opened. */
+  if (!db) {
+    return 0;
+  }
+
+  stopGC();
+  closeDB(dpp);
+
+  ldpp_dout(dpp, 20)<<"DB successfully destroyed - name:" \
+    <<db_name << dendl;
+
+  return 0;
+}
+
+
+std::shared_ptr<class DBOp> DB::getDBOp(const DoutPrefixProvider *dpp, std::string_view Op,
+                  const DBOpParams *params)
+{
+  if (!Op.compare("InsertUser"))
+    return dbops.InsertUser;
+  if (!Op.compare("RemoveUser"))
+    return dbops.RemoveUser;
+  if (!Op.compare("GetUser"))
+    return dbops.GetUser;
+  if (!Op.compare("InsertBucket"))
+    return dbops.InsertBucket;
+  if (!Op.compare("UpdateBucket"))
+    return dbops.UpdateBucket;
+  if (!Op.compare("RemoveBucket"))
+    return dbops.RemoveBucket;
+  if (!Op.compare("GetBucket"))
+    return dbops.GetBucket;
+  if (!Op.compare("ListUserBuckets"))
+    return dbops.ListUserBuckets;
+  if (!Op.compare("InsertLCEntry"))
+    return dbops.InsertLCEntry;
+  if (!Op.compare("RemoveLCEntry"))
+    return dbops.RemoveLCEntry;
+  if (!Op.compare("GetLCEntry"))
+    return dbops.GetLCEntry;
+  if (!Op.compare("ListLCEntries"))
+    return dbops.ListLCEntries;
+  if (!Op.compare("InsertLCHead"))
+    return dbops.InsertLCHead;
+  if (!Op.compare("RemoveLCHead"))
+    return dbops.RemoveLCHead;
+  if (!Op.compare("GetLCHead"))
+    return dbops.GetLCHead;
+
+  /* Object Operations */
+  map<string, class ObjectOp*>::iterator iter;
+  class ObjectOp* Ob;
+
+  {
+    const std::lock_guard<std::mutex> lk(mtx);
+    iter = DB::objectmap.find(params->op.bucket.info.bucket.name);
+  }
+
+  if (iter == DB::objectmap.end()) {
+    ldpp_dout(dpp, 30)<<"No objectmap found for bucket: " \
+      <<params->op.bucket.info.bucket.name << dendl;
+    /* not found */
+    return nullptr;
+  }
+
+  Ob = iter->second;
+
+  if (!Op.compare("PutObject"))
+    return Ob->PutObject;
+  if (!Op.compare("DeleteObject"))
+    return Ob->DeleteObject;
+  if (!Op.compare("GetObject"))
+    return Ob->GetObject;
+  if (!Op.compare("UpdateObject"))
+    return Ob->UpdateObject;
+  if (!Op.compare("ListBucketObjects"))
+    return Ob->ListBucketObjects;
+  if (!Op.compare("ListVersionedObjects"))
+    return Ob->ListVersionedObjects;
+  if (!Op.compare("PutObjectData"))
+    return Ob->PutObjectData;
+  if (!Op.compare("UpdateObjectData"))
+    return Ob->UpdateObjectData;
+  if (!Op.compare("GetObjectData"))
+    return Ob->GetObjectData;
+  if (!Op.compare("DeleteObjectData"))
+    return Ob->DeleteObjectData;
+  if (!Op.compare("DeleteStaleObjectData"))
+    return Ob->DeleteStaleObjectData;
+
+  return nullptr;
+}
+
+int DB::objectmapInsert(const DoutPrefixProvider *dpp, string bucket, class ObjectOp* ptr)
+{
+  map<string, class ObjectOp*>::iterator iter;
+  class ObjectOp *Ob;
+
+  const std::lock_guard<std::mutex> lk(mtx);
+  iter = DB::objectmap.find(bucket);
+
+  if (iter != DB::objectmap.end()) {
+    // entry already exists
+    // return success or replace it or
+    // return error ?
+    //
+    // return success for now & delete the newly allocated ptr
+    ldpp_dout(dpp, 30)<<"Objectmap entry already exists for bucket("\
+      <<bucket<<"). Not inserted " << dendl;
+    delete ptr;
+    return 0;
+  }
+
+  Ob = (class ObjectOp*) ptr;
+  Ob->InitializeObjectOps(getDBname(), dpp);
+
+  DB::objectmap.insert(pair<string, class ObjectOp*>(bucket, Ob));
+
+  return 0;
+}
+
+int DB::objectmapDelete(const DoutPrefixProvider *dpp, string bucket)
+{
+  /* Drop the per-bucket ObjectOp registration; a missing entry is treated
+   * as success. */
+  const std::lock_guard<std::mutex> lk(mtx);
+
+  auto it = DB::objectmap.find(bucket);
+  if (it == DB::objectmap.end()) {
+    // entry doesn't exist
+    // return success or return error ?
+    // return success for now
+    ldpp_dout(dpp, 20)<<"Objectmap entry for bucket("<<bucket<<") "
+      <<"doesnt exist to delete " << dendl;
+    return 0;
+  }
+
+  DB::objectmap.erase(it);
+  return 0;
+}
+
+// Seed 'params' with the ceph context and the table names this DB instance
+// uses. Returns 0 on success, -1 when 'params' is null.
+// Fix: replaced the goto-to-a-label-that-only-returns with a direct return.
+int DB::InitializeParams(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  if (!params)
+    return -1;
+
+  params->cct = cct;
+
+  //reset params here
+  params->user_table = user_table;
+  params->bucket_table = bucket_table;
+  params->quota_table = quota_table;
+  params->lc_entry_table = lc_entry_table;
+  params->lc_head_table = lc_head_table;
+
+  return 0;
+}
+
+int DB::ProcessOp(const DoutPrefixProvider *dpp, std::string_view Op, DBOpParams *params) {
+  int ret = -1;
+  shared_ptr<class DBOp> db_op;
+
+  db_op = getDBOp(dpp, Op, params);
+
+  if (!db_op) {
+    ldpp_dout(dpp, 0)<<"No db_op found for Op("<<Op<<")" << dendl;
+    return ret;
+  }
+  ret = db_op->Execute(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In Process op Execute failed for fop(" << Op << ")" << dendl;
+  } else {
+    ldpp_dout(dpp, 20)<<"Successfully processed fop(" << Op << ")" << dendl;
+  }
+
+  return ret;
+}
+
+// Look up one user row by a single attribute. 'query_str' selects the key:
+// "username" (display name), "email", "access_key", or "user_id" (for which
+// the id is taken from uinfo.user_id rather than query_str_val). On success
+// the full record overwrites 'uinfo'; user attrs and the stored object
+// version are returned through the optional out-pointers.
+// Returns 0 on success, -ENOENT when no matching user exists, -1 on bad args.
+int DB::get_user(const DoutPrefixProvider *dpp,
+    const std::string& query_str, const std::string& query_str_val,
+    RGWUserInfo& uinfo, map<string, bufferlist> *pattrs,
+    RGWObjVersionTracker *pobjv_tracker) {
+  int ret = 0;
+
+  if (query_str.empty() || query_str_val.empty()) {
+    ldpp_dout(dpp, 0)<<"In GetUser - Invalid query(" << query_str <<"), query_str_val(" << query_str_val <<")" << dendl;
+    return -1;
+  }
+
+  DBOpParams params = {};
+  InitializeParams(dpp, &params);
+
+  params.op.query_str = query_str;
+
+  // validate query_str with UserTable entries names
+  if (query_str == "username") {
+    params.op.user.uinfo.display_name = query_str_val;
+  } else if (query_str == "email") {
+    params.op.user.uinfo.user_email = query_str_val;
+  } else if (query_str == "access_key") {
+    // seed a single-key map; only the key id matters for the lookup
+    RGWAccessKey k(query_str_val, "");
+    map<string, RGWAccessKey> keys;
+    keys[query_str_val] = k;
+    params.op.user.uinfo.access_keys = keys;
+  } else if (query_str == "user_id") {
+    params.op.user.uinfo.user_id = uinfo.user_id;
+  } else {
+    ldpp_dout(dpp, 0)<<"In GetUser Invalid query string :" <<query_str.c_str()<<") " << dendl;
+    return -1;
+  }
+
+  ret = ProcessOp(dpp, "GetUser", &params);
+
+  if (ret)
+    goto out;
+
+  /* Verify if its a valid user */
+  // an empty access-key set or user id means the query matched no row
+  if (params.op.user.uinfo.access_keys.empty() ||
+        params.op.user.uinfo.user_id.id.empty()) {
+    ldpp_dout(dpp, 0)<<"In GetUser - No user with query(" <<query_str.c_str()<<"), user_id(" << uinfo.user_id <<") found" << dendl;
+    return -ENOENT;
+  }
+
+  uinfo = params.op.user.uinfo;
+
+  if (pattrs) {
+    *pattrs = params.op.user.user_attrs;
+  }
+
+  if (pobjv_tracker) {
+    pobjv_tracker->read_version = params.op.user.user_version;
+  }
+
+out:
+  return ret;
+}
+
+// Insert or update a user record. When the user already exists: the prior
+// record is returned via 'pold_info'; a caller-supplied 'pobjv' whose
+// read_version differs from the stored version yields -ECANCELED; and with
+// 'exclusive' set the stored record is left untouched. On success the new
+// version is written back through 'pobjv' (both read and write versions).
+int DB::store_user(const DoutPrefixProvider *dpp,
+    RGWUserInfo& uinfo, bool exclusive, map<string, bufferlist> *pattrs,
+    RGWObjVersionTracker *pobjv, RGWUserInfo* pold_info)
+{
+  DBOpParams params = {};
+  InitializeParams(dpp, &params);
+  int ret = 0;
+
+  /* Check if the user already exists and return the old info, caller will have a use for it */
+  RGWUserInfo orig_info;
+  RGWObjVersionTracker objv_tracker = {};
+  obj_version& obj_ver = objv_tracker.read_version;
+
+  orig_info.user_id = uinfo.user_id;
+  ret = get_user(dpp, string("user_id"), uinfo.user_id.id, orig_info, nullptr, &objv_tracker);
+
+  if (!ret && obj_ver.ver) {
+    /* already exists. */
+
+    if (pold_info) {
+      *pold_info = orig_info;
+    }
+
+    if (pobjv && (pobjv->read_version.ver != obj_ver.ver)) {
+      /* Object version mismatch.. return ECANCELED */
+      ret = -ECANCELED;
+      ldpp_dout(dpp, 0)<<"User Read version mismatch err:(" <<ret<<") " << dendl;
+      return ret;
+    }
+
+    if (exclusive) {
+      // return
+      return ret;
+    }
+    // overwrite path: bump the stored version
+    obj_ver.ver++;
+  } else {
+    // brand-new user: start the version sequence at 1
+    obj_ver.ver = 1;
+    obj_ver.tag = "UserTAG";
+  }
+
+  params.op.user.user_version = obj_ver;
+  params.op.user.uinfo = uinfo;
+
+  if (pattrs) {
+    params.op.user.user_attrs = *pattrs;
+  }
+
+  // InsertUser performs an upsert keyed on user_id
+  ret = ProcessOp(dpp, "InsertUser", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"store_user failed with err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+  ldpp_dout(dpp, 20)<<"User creation successful - userid:(" <<uinfo.user_id<<") " << dendl;
+
+  if (pobjv) {
+    pobjv->read_version = obj_ver;
+    pobjv->write_version = obj_ver;
+  }
+
+out:
+  return ret;
+}
+
+int DB::remove_user(const DoutPrefixProvider *dpp,
+    RGWUserInfo& uinfo, RGWObjVersionTracker *pobjv)
+{
+  DBOpParams params = {};
+  InitializeParams(dpp, &params);
+  int ret = 0;
+
+  RGWUserInfo orig_info;
+  RGWObjVersionTracker objv_tracker = {};
+
+  orig_info.user_id = uinfo.user_id;
+  ret = get_user(dpp, string("user_id"), uinfo.user_id.id, orig_info, nullptr, &objv_tracker);
+
+  if (ret) {
+    return ret;
+  }
+
+  if (!ret && objv_tracker.read_version.ver) {
+    /* already exists. */
+
+    if (pobjv && (pobjv->read_version.ver != objv_tracker.read_version.ver)) {
+      /* Object version mismatch.. return ECANCELED */
+      ret = -ECANCELED;
+      ldpp_dout(dpp, 0)<<"User Read version mismatch err:(" <<ret<<") " << dendl;
+      return ret;
+    }
+  }
+
+  params.op.user.uinfo.user_id = uinfo.user_id;
+
+  ret = ProcessOp(dpp, "RemoveUser", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"remove_user failed with err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+out:
+  return ret;
+}
+
+int DB::get_bucket_info(const DoutPrefixProvider *dpp, const std::string& query_str,
+    const std::string& query_str_val,
+    RGWBucketInfo& info,
+    rgw::sal::Attrs* pattrs, ceph::real_time* pmtime,
+    obj_version* pbucket_version) {
+  int ret = 0;
+
+  if (query_str.empty()) {
+    // not checking for query_str_val as the query can be to fetch
+    // entries with null values
+    return -1;
+  }
+
+  DBOpParams params = {};
+  DBOpParams params2 = {};
+  InitializeParams(dpp, &params);
+
+  if (query_str == "name") {
+    params.op.bucket.info.bucket.name = info.bucket.name;
+  } else {
+    ldpp_dout(dpp, 0)<<"In GetBucket Invalid query string :" <<query_str.c_str()<<") " << dendl;
+    return -1;
+  }
+
+  ret = ProcessOp(dpp, "GetBucket", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In GetBucket failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  if (!ret && params.op.bucket.info.bucket.marker.empty()) {
+    return -ENOENT;
+  }
+  info = params.op.bucket.info;
+
+  if (pattrs) {
+    *pattrs = params.op.bucket.bucket_attrs;
+  }
+
+  if (pmtime) {
+    *pmtime = params.op.bucket.mtime;
+  }
+  if (pbucket_version) {
+    *pbucket_version = params.op.bucket.bucket_version;
+  }
+
+out:
+  return ret;
+}
+
+// Create a bucket row owned by 'owner'. When the bucket already exists and
+// 'exclusive' is set, the stored info is returned in 'info' instead of being
+// overwritten. A fresh bucket id/marker is minted from next_bucket_id().
+// The write version comes from 'pobjv' when given, else a new one is
+// generated. NOTE(review): pep_objv, pmaster_bucket, pmaster_num_shards and
+// 'y' are unused here — presumably kept for interface parity with RADOS.
+int DB::create_bucket(const DoutPrefixProvider *dpp,
+    const RGWUserInfo& owner, rgw_bucket& bucket,
+    const string& zonegroup_id,
+    const rgw_placement_rule& placement_rule,
+    const string& swift_ver_location,
+    const RGWQuotaInfo * pquota_info,
+    map<std::string, bufferlist>& attrs,
+    RGWBucketInfo& info,
+    obj_version *pobjv,
+    obj_version *pep_objv,
+    real_time creation_time,
+    rgw_bucket *pmaster_bucket,
+    uint32_t *pmaster_num_shards,
+    optional_yield y,
+    bool exclusive)
+{
+  /*
+   * XXX: Simple creation for now.
+   *
+   * Referring to RGWRados::create_bucket(), 
+   * Check if bucket already exists, select_bucket_placement,
+   * is explicit put/remove instance info needed? - should not be ideally
+   */
+
+  DBOpParams params = {};
+  InitializeParams(dpp, &params);
+  int ret = 0;
+
+  /* Check if the bucket already exists and return the old info, caller will have a use for it */
+  RGWBucketInfo orig_info;
+  orig_info.bucket.name = bucket.name;
+  ret = get_bucket_info(dpp, string("name"), "", orig_info, nullptr, nullptr, nullptr);
+
+  // a non-empty owner id signals a fully-populated existing row
+  if (!ret && !orig_info.owner.id.empty() && exclusive) {
+    /* already exists. Return the old info */
+
+    info = std::move(orig_info);
+    return ret;
+  }
+
+  RGWObjVersionTracker& objv_tracker = info.objv_tracker;
+
+  objv_tracker.read_version.clear();
+
+  if (pobjv) {
+    objv_tracker.write_version = *pobjv;
+  } else {
+    objv_tracker.generate_new_write_ver(cct);
+  }
+  params.op.bucket.bucket_version = objv_tracker.write_version;
+  objv_tracker.read_version = params.op.bucket.bucket_version;
+
+  // mint a unique marker/bucket_id of the form "<dbname>.<seq>"
+  uint64_t bid = next_bucket_id();
+  string s = getDBname() + "." + std::to_string(bid);
+  bucket.marker = bucket.bucket_id = s;
+
+  info.bucket = bucket;
+  info.owner = owner.user_id;
+  info.zonegroup = zonegroup_id;
+  info.placement_rule = placement_rule;
+  info.swift_ver_location = swift_ver_location;
+  info.swift_versioning = (!swift_ver_location.empty());
+
+  info.requester_pays = false;
+  if (real_clock::is_zero(creation_time)) {
+    info.creation_time = ceph::real_clock::now();
+  } else {
+    info.creation_time = creation_time;
+  }
+  if (pquota_info) {
+    info.quota = *pquota_info;
+  }
+
+  params.op.bucket.info = info;
+  params.op.bucket.bucket_attrs = attrs;
+  params.op.bucket.mtime = ceph::real_time();
+  params.op.user.uinfo.user_id.id = owner.user_id.id;
+
+  ret = ProcessOp(dpp, "InsertBucket", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"create_bucket failed with err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+out:
+  return ret;
+}
+
+int DB::remove_bucket(const DoutPrefixProvider *dpp, const RGWBucketInfo info) {
+  /* Delete the bucket row keyed by info.bucket.name. */
+  DBOpParams params = {};
+  InitializeParams(dpp, &params);
+
+  params.op.bucket.info.bucket.name = info.bucket.name;
+
+  const int ret = ProcessOp(dpp, "RemoveBucket", &params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In RemoveBucket failed err:(" <<ret<<") " << dendl;
+  }
+
+  return ret;
+}
+
+// List buckets owned by 'user' between 'marker' and 'end_marker', up to
+// 'max' entries; matches are appended to 'buckets'.
+// NOTE(review): '*is_truncated' is only written when the result count hits
+// 'max' or an end_marker cut-off fires — callers should pre-initialize it.
+// 'need_stats' is unused here: stats ride along with each list entry.
+int DB::list_buckets(const DoutPrefixProvider *dpp, const std::string& query_str,
+    rgw_user& user,
+    const string& marker,
+    const string& end_marker,
+    uint64_t max,
+    bool need_stats,
+    RGWUserBuckets *buckets,
+    bool *is_truncated)
+{
+  int ret = 0;
+
+  DBOpParams params = {};
+  InitializeParams(dpp, &params);
+
+  params.op.user.uinfo.user_id = user;
+  params.op.bucket.min_marker = marker;
+  params.op.bucket.max_marker = end_marker;
+  params.op.list_max_count = max;
+  params.op.query_str = query_str;
+
+  ret = ProcessOp(dpp, "ListUserBuckets", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In ListUserBuckets failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  /* need_stats: stats are already part of entries... In case they are maintained in
+   * separate table , maybe use "Inner Join" with stats table for the query.
+   */
+  if (params.op.bucket.list_entries.size() == max)
+    *is_truncated = true;
+
+  for (auto& entry : params.op.bucket.list_entries) {
+    // entries at or past end_marker are excluded (end_marker itself too)
+    if (!end_marker.empty() &&
+        end_marker.compare(entry.bucket.marker) <= 0) {
+      *is_truncated = false;
+      break;
+    }
+    buckets->add(std::move(entry));
+  }
+
+  if (query_str == "all") {
+    // userID/OwnerID may have changed. Update it.
+    user.id = params.op.bucket.info.owner.id;
+  }
+
+out:
+  return ret;
+}
+
+int DB::update_bucket(const DoutPrefixProvider *dpp, const std::string& query_str,
+    RGWBucketInfo& info,
+    bool exclusive,
+    const rgw_user* powner_id,
+    map<std::string, bufferlist>* pattrs,
+    ceph::real_time* pmtime,
+    RGWObjVersionTracker* pobjv)
+{
+  int ret = 0;
+  DBOpParams params = {};
+  obj_version bucket_version;
+  RGWBucketInfo orig_info;
+
+  /* Check if the bucket already exists and return the old info, caller will have a use for it */
+  orig_info.bucket.name = info.bucket.name;
+  params.op.bucket.info.bucket.name = info.bucket.name;
+  ret = get_bucket_info(dpp, string("name"), "", orig_info, nullptr, nullptr,
+      &bucket_version);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"Failed to read bucket info err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  if (!orig_info.owner.id.empty() && exclusive) {
+    /* already exists. Return the old info */
+
+    info = std::move(orig_info);
+    return ret;
+  }
+
+  /* Verify if the objv read_ver matches current bucket version */
+  if (pobjv) {
+    if (pobjv->read_version.ver != bucket_version.ver) {
+      ldpp_dout(dpp, 0)<<"Read version mismatch err:(" <<ret<<") " << dendl;
+      ret = -ECANCELED;
+      goto out;
+    }
+  } else {
+    pobjv = &info.objv_tracker;
+  }
+
+  InitializeParams(dpp, &params);
+
+  params.op.bucket.info.bucket.name = info.bucket.name;
+
+  if (powner_id) {
+    params.op.user.uinfo.user_id.id = powner_id->id;
+  } else {
+    params.op.user.uinfo.user_id.id = orig_info.owner.id;
+  }
+
+  /* Update version & mtime */
+  params.op.bucket.bucket_version.ver = ++(bucket_version.ver);
+
+  if (pmtime) {
+    params.op.bucket.mtime = *pmtime;;
+  } else {
+    params.op.bucket.mtime = ceph::real_time();
+  }
+
+  if (query_str == "attrs") {
+    params.op.query_str = "attrs";
+    params.op.bucket.bucket_attrs = *pattrs;
+  } else if (query_str == "owner") {
+    /* Update only owner i.e, chown. 
+     * Update creation_time too */
+    params.op.query_str = "owner";
+    params.op.bucket.info.creation_time = params.op.bucket.mtime;
+  } else if (query_str == "info") {
+    params.op.query_str = "info";
+    params.op.bucket.info = info;
+  } else {
+    ret = -1;
+    ldpp_dout(dpp, 0)<<"In UpdateBucket Invalid query_str : " << query_str << dendl;
+    goto out;
+  }
+
+  ret = ProcessOp(dpp, "UpdateBucket", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In UpdateBucket failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  if (pobjv) {
+    pobjv->read_version = params.op.bucket.bucket_version;
+    pobjv->write_version = params.op.bucket.bucket_version;
+  }
+
+out:
+  return ret;
+}
+
+/**
+ * Get ordered listing of the objects in a bucket.
+ *
+ * max_p: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: do not include results that match this string.
+ *     Any skipped results will have the matching portion of their name
+ *     inserted in common_prefixes with a "true" mark.
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: if delim is filled in, any matching prefixes are
+ * placed here.
+ * is_truncated: if number of objects in the bucket is bigger than
+ * max, then truncated.
+ */
+int DB::Bucket::List::list_objects(const DoutPrefixProvider *dpp, int64_t max,
+                          vector<rgw_bucket_dir_entry> *result,
+                          map<string, bool> *common_prefixes, bool *is_truncated)
+{
+  int ret = 0;
+  DB *store = target->get_store();
+  int64_t count = 0;
+  std::string prev_obj;
+
+  DBOpParams db_params = {};
+  store->InitializeParams(dpp, &db_params);
+
+  db_params.op.bucket.info = target->get_bucket_info(); 
+  /* XXX: Handle whole marker? key -> name, instance, ns? */
+  db_params.op.obj.min_marker = params.marker.name;
+  db_params.op.obj.max_marker = params.end_marker.name;
+  db_params.op.obj.prefix = params.prefix + "%";
+  db_params.op.list_max_count = max + 1; /* +1 for next_marker */
+
+  ret = store->ProcessOp(dpp, "ListBucketObjects", &db_params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In ListBucketObjects failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  for (auto& entry : db_params.op.obj.list_entries) {
+
+    if (!params.list_versions) {
+      if (entry.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER) {
+        prev_obj = entry.key.name;
+        // skip all non-current entries and delete_marker
+        continue;
+      }
+      if (entry.key.name == prev_obj) {
+        // non current versions..skip the entry
+        continue;
+      }
+      entry.flags |= rgw_bucket_dir_entry::FLAG_CURRENT;
+    } else {
+      if (entry.key.name != prev_obj) {
+        // current version
+        entry.flags |= rgw_bucket_dir_entry::FLAG_CURRENT;
+      } else {
+        entry.flags &= ~(rgw_bucket_dir_entry::FLAG_CURRENT);
+        entry.flags |= rgw_bucket_dir_entry::FLAG_VER;
+      }
+    }
+
+    prev_obj = entry.key.name;
+
+    if (count >= max) {
+      *is_truncated = true;
+      next_marker.name = entry.key.name;
+      next_marker.instance = entry.key.instance;
+      break;
+    }
+
+    if (!params.delim.empty()) {
+    const std::string& objname = entry.key.name;
+       const int delim_pos = objname.find(params.delim, params.prefix.size());
+         if (delim_pos >= 0) {
+           /* extract key -with trailing delimiter- for CommonPrefix */
+           const std::string& prefix_key =
+             objname.substr(0, delim_pos + params.delim.length());
+
+           if (common_prefixes &&
+               common_prefixes->find(prefix_key) == common_prefixes->end()) {
+          next_marker = prefix_key;
+          (*common_prefixes)[prefix_key] = true;
+          count++;
+        }
+        continue;
+      }
+    }
+
+    if (!params.end_marker.name.empty() &&
+        params.end_marker.name.compare(entry.key.name) <= 0) {
+      // should not include end_marker
+      *is_truncated = false;
+      break;
+    }
+    count++;
+    result->push_back(std::move(entry));
+  }
+out:
+  return ret;
+}
+
+int DB::raw_obj::InitializeParamsfromRawObj(const DoutPrefixProvider *dpp,
+                                            DBOpParams* params) {
+  int ret = 0;
+
+  if (!params)
+    return -1;
+
+  params->op.bucket.info.bucket.name = bucket_name;
+  params->op.obj.state.obj.key.name = obj_name;
+  params->op.obj.state.obj.key.instance = obj_instance;
+  params->op.obj.state.obj.key.ns = obj_ns;
+  params->op.obj.obj_id = obj_id;
+
+  if (multipart_part_str != "0.0") {
+    params->op.obj.is_multipart = true;
+  } else {
+    params->op.obj.is_multipart = false;
+  }
+
+  params->op.obj_data.multipart_part_str = multipart_part_str;
+  params->op.obj_data.part_num = part_num;
+
+  return ret;
+}
+
+int DB::Object::InitializeParamsfromObject(const DoutPrefixProvider *dpp,
+                                           DBOpParams* params) {
+  /* Copy this object's bucket name, key and id into 'params'. */
+  if (!params)
+    return -1;
+
+  params->op.bucket.info.bucket.name = bucket_info.bucket.name;
+  params->op.obj.state.obj = obj;
+  params->op.obj.obj_id = obj_id;
+
+  return 0;
+}
+
+int DB::Object::get_object_impl(const DoutPrefixProvider *dpp, DBOpParams& params) {
+  int ret = 0;
+
+  if (params.op.obj.state.obj.key.name.empty()) {
+    /* Initialize */
+    store->InitializeParams(dpp, &params);
+    InitializeParamsfromObject(dpp, &params);
+  }
+
+  ret = store->ProcessOp(dpp, "GetObject", &params);
+
+  /* pick one field check if object exists */
+  if (!ret && !params.op.obj.state.exists) {
+    ldpp_dout(dpp, 0)<<"Object(bucket:" << bucket_info.bucket.name << ", Object:"<< obj.key.name << ") doesn't exist" << dendl;
+    ret = -ENOENT;
+  }
+
+  return ret;
+}
+
+int DB::Object::obj_omap_set_val_by_key(const DoutPrefixProvider *dpp,
+                                        const std::string& key, bufferlist& val,
+                                        bool must_exist) {
+  int ret = 0;
+
+  DBOpParams params = {};
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+  params.op.obj.omap[key] = val;
+  params.op.query_str = "omap";
+  params.op.obj.state.mtime = real_clock::now();
+
+  ret = store->ProcessOp(dpp, "UpdateObject", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+out:
+  return ret;
+}
+
+int DB::Object::obj_omap_get_vals_by_keys(const DoutPrefixProvider *dpp,
+                                          const std::string& oid,
+                                          const std::set<std::string>& keys,
+                                          std::map<std::string, bufferlist>* vals)
+{
+  int ret = 0;
+  DBOpParams params = {};
+  std::map<std::string, bufferlist> omap;
+
+  if (!vals)
+    return -1;
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+  omap = params.op.obj.omap;
+
+  for (const auto& k :  keys) {
+    (*vals)[k] = omap[k];
+  }
+
+out:
+  return ret;
+}
+
+int DB::Object::add_mp_part(const DoutPrefixProvider *dpp,
+                            RGWUploadPartInfo info) {
+  int ret = 0;
+
+  DBOpParams params = {};
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+  params.op.obj.mp_parts.push_back(info);
+  params.op.query_str = "mp";
+  params.op.obj.state.mtime = real_clock::now();
+
+  ret = store->ProcessOp(dpp, "UpdateObject", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+out:
+  return ret;
+}
+
+int DB::Object::get_mp_parts_list(const DoutPrefixProvider *dpp,
+                                  std::list<RGWUploadPartInfo>& info)
+{
+  int ret = 0;
+  DBOpParams params = {};
+  std::map<std::string, bufferlist> omap;
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+  info = params.op.obj.mp_parts;
+
+out:
+  return ret;
+}
+
+/* Taken from rgw_rados.cc */
+void DB::gen_rand_obj_instance_name(rgw_obj_key *target_key)
+{
+#define OBJ_INSTANCE_LEN 32
+  char buf[OBJ_INSTANCE_LEN + 1];
+
+  gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
+                                                                      no underscore for instance name due to the way we encode the raw keys */
+
+  target_key->set_instance(buf);
+}
+
+int DB::Object::obj_omap_get_all(const DoutPrefixProvider *dpp,
+                                 std::map<std::string, bufferlist> *m)
+{
+  int ret = 0;
+  DBOpParams params = {};
+  std::map<std::string, bufferlist> omap;
+
+  if (!m)
+    return -1;
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+  (*m) = params.op.obj.omap;
+
+out:
+  return ret;
+}
+
+// Return up to 'max_count' omap entries with keys >= 'marker' in '*m'.
+// NOTE(review): '*pmore' is only written when the count limit is exceeded —
+// callers should pre-initialize it. The marker key itself is included
+// (comparison is '<', not '<='); confirm the intended semantics.
+int DB::Object::obj_omap_get_vals(const DoutPrefixProvider *dpp,
+                                  const std::string& marker,
+                                  uint64_t max_count,
+                                  std::map<std::string, bufferlist> *m, bool* pmore)
+{
+  int ret = 0;
+  DBOpParams params = {};
+  std::map<std::string, bufferlist> omap;
+  map<string, bufferlist>::iterator iter;
+  uint64_t count = 0;
+
+  if (!m)
+    return -1;
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+  omap = params.op.obj.omap;
+
+  for (iter = omap.begin(); iter != omap.end(); ++iter) {
+
+    // skip keys strictly below the marker (std::map iterates in key order)
+    if (iter->first < marker)
+      continue;
+
+    if ((++count) > max_count) {
+      *pmore = true;
+      break;
+    }
+
+    (*m)[iter->first] = iter->second;
+  }
+
+out:
+  return ret;
+}
+
+int DB::Object::set_attrs(const DoutPrefixProvider *dpp,
+                          map<string, bufferlist>& setattrs,
+                          map<string, bufferlist>* rmattrs)
+{
+  int ret = 0;
+
+  DBOpParams params = {};
+  rgw::sal::Attrs *attrs;
+  map<string, bufferlist>::iterator iter;
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+  /* For now lets keep it simple..rmattrs & setattrs ..
+   * XXX: Check rgw_rados::set_attrs
+   */
+  attrs = &params.op.obj.state.attrset;
+  if (rmattrs) {
+    for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+      (*attrs).erase(iter->first);
+    }
+  }
+  for (iter = setattrs.begin(); iter != setattrs.end(); ++iter) {
+    (*attrs)[iter->first] = iter->second;
+  }
+
+  params.op.query_str = "attrs";
+  params.op.obj.state.mtime = real_clock::now();
+
+  ret = store->ProcessOp(dpp, "UpdateObject", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+out:
+  return ret;
+}
+
+// Transition the object's placement per lifecycle 'rule': records the new
+// storage class (both as object field and RGW_ATTR_STORAGE_CLASS attr) and
+// sets head and tail placement to the same rule, since dbstore keeps them
+// identical. NOTE(review): 'mtime' parameter is unused — the stored mtime is
+// set to now(); confirm whether the caller's mtime was meant to be honored.
+int DB::Object::transition(const DoutPrefixProvider *dpp,
+                           const rgw_placement_rule& rule,
+                           const real_time& mtime,
+                           uint64_t olh_epoch)
+{
+  int ret = 0;
+
+  DBOpParams params = {};
+  map<string, bufferlist> *attrset;
+
+  store->InitializeParams(dpp, &params);
+  InitializeParamsfromObject(dpp, &params);
+
+  ret = store->ProcessOp(dpp, "GetObject", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"In GetObject failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+  /* pick one field check if object exists */
+  if (!params.op.obj.state.exists) {
+    ldpp_dout(dpp, 0)<<"Object(bucket:" << bucket_info.bucket.name << ", Object:"<< obj.key.name << ") doesn't exist" << dendl;
+    return -1;
+  }
+
+  params.op.query_str = "meta";
+  params.op.obj.state.mtime = real_clock::now();
+  params.op.obj.storage_class = rule.storage_class;
+  attrset = &params.op.obj.state.attrset;
+  if (!rule.storage_class.empty()) {
+    // mirror the storage class into the object's xattr set
+    bufferlist bl;
+    bl.append(rule.storage_class);
+    (*attrset)[RGW_ATTR_STORAGE_CLASS] = bl;
+  }
+  params.op.obj.versioned_epoch = olh_epoch; // XXX: not sure if needed
+
+  /* Unlike Rados, in dbstore for now, both head and tail objects
+   * refer to same storage class
+   */
+  params.op.obj.head_placement_rule = rule;
+  params.op.obj.tail_placement.placement_rule = rule;
+
+  ret = store->ProcessOp(dpp, "UpdateObject", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+out:
+  return ret;
+}
+
+int DB::raw_obj::read(const DoutPrefixProvider *dpp, int64_t ofs,
+                      uint64_t len, bufferlist& bl)
+{
+  int ret = 0;
+  DBOpParams params = {};
+
+  db->InitializeParams(dpp, &params);
+  InitializeParamsfromRawObj(dpp, &params);
+
+  ret = db->ProcessOp(dpp, "GetObjectData", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In GetObjectData failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  /* Verify if its valid obj */
+  if (!params.op.obj_data.size) {
+    ret = -ENOENT;
+    ldpp_dout(dpp, 0)<<"In GetObjectData failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  bufferlist& read_bl = params.op.obj_data.data;
+
+  unsigned copy_len;
+  copy_len = std::min((uint64_t)read_bl.length() - ofs, len);
+  read_bl.begin(ofs).copy(copy_len, bl);
+  return bl.length();
+}
+
+/* Write `len` bytes of `bl` (starting at bl offset `write_ofs`) into this
+ * raw (tail) object at object offset `ofs`.  Returns the number of bytes
+ * written, or a negative error from the "PutObjectData" op. */
+int DB::raw_obj::write(const DoutPrefixProvider *dpp, int64_t ofs, int64_t write_ofs,
+                       uint64_t len, bufferlist& bl)
+{
+  int ret = 0;
+  DBOpParams params = {};
+
+  db->InitializeParams(dpp, &params);
+  InitializeParamsfromRawObj(dpp, &params);
+
+  /* XXX: Check for chunk_size ?? */
+  params.op.obj_data.offset = ofs;
+  /* NOTE(review): assumes write_ofs <= bl.length(); the unsigned
+   * subtraction would otherwise wrap around — confirm all callers
+   * guarantee this. */
+  unsigned write_len = std::min((uint64_t)bl.length() - write_ofs, len);
+  bl.begin(write_ofs).copy(write_len, params.op.obj_data.data);
+  params.op.obj_data.size = params.op.obj_data.data.length();
+  /* refresh the tail mtime so the GC thread does not treat this part as
+   * stale (see delete_obj_impl for the GC/mtime protocol) */
+  params.op.obj.state.mtime = real_clock::now();
+
+  ret = db->ProcessOp(dpp, "PutObjectData", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In PutObjectData failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  return write_len;
+}
+
+int DB::Object::list_versioned_objects(const DoutPrefixProvider *dpp,
+                                       std::list<rgw_bucket_dir_entry>& list_entries) {
+  int ret = 0;
+  store = get_store();
+  DBOpParams db_params = {};
+
+  store->InitializeParams(dpp, &db_params);
+  InitializeParamsfromObject(dpp, &db_params);
+
+  db_params.op.list_max_count = MAX_VERSIONED_OBJECTS;
+
+  ret = store->ProcessOp(dpp, "ListVersionedObjects", &db_params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In ListVersionedObjects failed err:(" <<ret<<") " << dendl;
+  } else {
+    list_entries = db_params.op.obj.list_entries;
+  }
+
+  return ret;
+}
+
+int DB::Object::get_obj_state(const DoutPrefixProvider *dpp,
+                              const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                              bool follow_olh, RGWObjState** state)
+{
+  int ret = 0;
+
+  DBOpParams params = {};
+  RGWObjState* s;
+
+  if (!obj.key.instance.empty()) {
+    /* Versionid provided. Fetch the object */
+    ret = get_object_impl(dpp, params);
+
+    if (ret && ret != -ENOENT) {
+      ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+      goto out;
+    }
+  } else {
+    /* Instance is empty. May or may not be versioned object.
+     * List all the versions and read the most recent entry */
+    ret = list_versioned_objects(dpp, params.op.obj.list_entries);
+
+    if (params.op.obj.list_entries.size() != 0) {
+       /* Ensure its not a delete marker */
+      auto& ent = params.op.obj.list_entries.front();
+      if (ent.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER) {
+        ret = -ENOENT;
+        goto out;
+      }
+      store->InitializeParams(dpp, &params);
+      InitializeParamsfromObject(dpp, &params);
+      params.op.obj.state.obj.key = ent.key;
+    
+      ret = get_object_impl(dpp, params);
+
+      if (ret) {
+        ldpp_dout(dpp, 0) <<"get_object_impl of versioned object failed err:(" <<ret<<")" << dendl;
+        goto out;
+      }
+    } else {
+      ret = -ENOENT;
+      return ret;
+    }
+  }
+
+  s = &params.op.obj.state;
+  /* XXX: For now use state->shadow_obj to store ObjectID string */
+  s->shadow_obj = params.op.obj.obj_id;
+
+  *state = &obj_state;
+  **state = *s;
+
+out:
+  return ret;
+
+}
+
+/* Thin wrapper: resolve this object's state via get_obj_state(). */
+int DB::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState** pstate, bool follow_olh)
+{
+  int ret = get_obj_state(dpp, bucket_info, obj, follow_olh, pstate);
+  return ret;
+}
+
+/* Fetch a single xattr `name` of the object into `dest`.
+ * Returns 0 on success, -ENOENT if the object does not exist, -ENODATA
+ * if the attribute is missing, or any error from get_state(). */
+int DB::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest)
+{
+  RGWObjState* astate;
+  int ret = source->get_state(dpp, &astate, true);
+  if (ret < 0) {
+    return ret;
+  }
+  if (!astate->exists) {
+    return -ENOENT;
+  }
+  return astate->get_attr(name, dest) ? 0 : -ENODATA;
+}
+
+/* Validate the read and populate the Read params from the object state:
+ * resolves the object, optionally fills params.target_obj / attrs /
+ * obj_size / lastmod, and enforces If-Match / If-NoMatch preconditions
+ * against the stored ETag. */
+int DB::Object::Read::prepare(const DoutPrefixProvider *dpp)
+{
+  DB *store = source->get_store();
+  CephContext *cct = store->ctx();
+
+  bufferlist etag;
+
+  map<string, bufferlist>::iterator iter;
+
+  RGWObjState* astate;
+
+  int r = source->get_state(dpp, &astate, true);
+  if (r < 0)
+    return r;
+
+  if (!astate->exists) {
+    return -ENOENT;
+  }
+
+  state.obj = astate->obj;
+  /* shadow_obj carries the ObjectID string (see get_obj_state) */
+  source->obj_id = astate->shadow_obj;
+
+  if (params.target_obj) {
+    *params.target_obj = state.obj;
+  }
+  if (params.attrs) {
+    *params.attrs = astate->attrset;
+    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+      for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
+        ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
+      }
+    }
+  }
+
+  /* conditional GET: compare the stored ETag with the request headers */
+  if (conds.if_match || conds.if_nomatch) {
+    r = get_attr(dpp, RGW_ATTR_ETAG, etag);
+    if (r < 0)
+      return r;
+
+    if (conds.if_match) {
+      string if_match_str = rgw_string_unquote(conds.if_match);
+      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
+      if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
+        return -ERR_PRECONDITION_FAILED;
+      }
+    }
+
+    if (conds.if_nomatch) {
+      string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
+      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
+      if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
+        return -ERR_NOT_MODIFIED;
+      }
+    }
+  }
+
+  if (params.obj_size)
+    *params.obj_size = astate->size;
+  if (params.lastmod)
+    *params.lastmod = astate->mtime;
+
+  return 0;
+}
+
+int DB::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
+{
+  if (ofs < 0) {
+    ofs += obj_size;
+    if (ofs < 0)
+      ofs = 0;
+    end = obj_size - 1;
+  } else if (end < 0) {
+    end = obj_size - 1;
+  }
+
+  if (obj_size > 0) {
+    if (ofs >= (off_t)obj_size) {
+      return -ERANGE;
+    }
+    if (end >= (off_t)obj_size) {
+      end = obj_size - 1;
+    }
+  }
+  return 0;
+}
+
+/* Read the range [ofs, end] of the object into `bl`, at most
+ * max_chunk_size bytes per call.  The request is served straight from
+ * the head object's cached data when it starts there; otherwise the
+ * tail part containing `ofs` is read via raw_obj::read.  Returns the
+ * number of bytes placed in `bl`, or a negative error. */
+int DB::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, const DoutPrefixProvider *dpp)
+{
+  DB *store = source->get_store();
+
+  uint64_t read_ofs = ofs;
+  uint64_t len, read_len;
+
+  bufferlist read_bl;
+  uint64_t max_chunk_size = store->get_max_chunk_size();
+
+  RGWObjState* astate;
+  int r = source->get_state(dpp, &astate, true);
+  if (r < 0)
+    return r;
+
+  if (!astate->exists) {
+    return -ENOENT;
+  }
+
+  /* clamp the requested range to the object size */
+  if (astate->size == 0) {
+    end = 0;
+  } else if (end >= (int64_t)astate->size) {
+    end = astate->size - 1;
+  }
+
+  if (end < 0)
+    len = 0;
+  else
+    len = end - ofs + 1;
+
+
+  /* never return more than one chunk per call */
+  if (len > max_chunk_size) {
+    len = max_chunk_size;
+  }
+
+  int head_data_size = astate->data.length();
+  bool reading_from_head = (ofs < head_data_size);
+
+  if (reading_from_head) {
+    if (astate) { // && astate->prefetch_data)?
+      /* whole request satisfied by the cached head data */
+      if (!ofs && astate->data.length() >= len) {
+        bl = astate->data;
+        return bl.length();
+      }
+
+      if (ofs < astate->data.length()) {
+        unsigned copy_len = std::min((uint64_t)head_data_size - ofs, len);
+        astate->data.begin(ofs).copy(copy_len, bl);
+        return bl.length();
+      }
+    }
+  }
+
+  /* tail object */
+  int part_num = (ofs / max_chunk_size);
+  /* XXX: Handle multipart_str */
+  raw_obj read_obj(store, source->get_bucket_info().bucket.name, astate->obj.key.name, 
+      astate->obj.key.instance, astate->obj.key.ns, source->obj_id, "0.0", part_num);
+
+  read_len = len;
+
+  ldpp_dout(dpp, 20) << "dbstore->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
+
+  // read from non head object
+  r = read_obj.read(dpp, read_ofs, read_len, bl);
+
+  if (r < 0) {
+    return r;
+  }
+
+  return bl.length();
+}
+
+/* Trampoline: unpack the opaque callback argument and forward the chunk
+ * to DB::get_obj_iterate_cb(). */
+static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+    const DB::raw_obj& read_obj, off_t obj_ofs,
+    off_t len, bool is_head_obj,
+    RGWObjState* astate, void *arg)
+{
+  auto* data = static_cast<struct db_get_obj_data*>(arg);
+  return data->store->get_obj_iterate_cb(dpp, read_obj, obj_ofs, len,
+      is_head_obj, astate, arg);
+}
+
+/* Per-chunk worker for iterate_obj(): obtains the chunk's bytes (head
+ * data directly, otherwise the entire tail part via raw_obj::read) and
+ * feeds them to the client callback in pieces of at most `len` bytes.
+ * Returns the total number of bytes handed to the callback (<= 0 aborts
+ * the iteration in the caller). */
+int DB::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+    const raw_obj& read_obj, off_t obj_ofs,
+    off_t len, bool is_head_obj,
+    RGWObjState* astate, void *arg)
+{
+  struct db_get_obj_data* d = static_cast<struct db_get_obj_data*>(arg);
+  bufferlist bl;
+  int r = 0;
+
+  if (is_head_obj) {
+    bl = astate->data;
+  } else {
+    // read from non head object
+    raw_obj robj = read_obj;
+    /* read entire data. So pass offset as '0' & len as '-1' */
+    r = robj.read(dpp, 0, -1, bl);
+
+    if (r <= 0) {
+      return r;
+    }
+  }
+
+  /* hand everything in bl to the client, len bytes at a time */
+  unsigned read_ofs = 0, read_len = 0;
+  while (read_ofs < bl.length()) {
+    unsigned chunk_len = std::min((uint64_t)bl.length() - read_ofs, (uint64_t)len);
+    r = d->client_cb->handle_data(bl, read_ofs, chunk_len);
+    if (r < 0)
+      return r;
+    read_ofs += chunk_len;
+    read_len += chunk_len;
+    ldpp_dout(dpp, 20) << "dbstore->get_obj_iterate_cb  obj-ofs=" << obj_ofs << " len=" << len <<  " chunk_len = " << chunk_len << " read_len = " << read_len << dendl;
+  }
+
+
+  d->offset += read_len;
+
+  return read_len;
+}
+
+int DB::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb)
+{
+  DB *store = source->get_store();
+  const uint64_t chunk_size = store->get_max_chunk_size();
+
+  db_get_obj_data data(store, cb, ofs);
+
+  int r = source->iterate_obj(dpp, source->get_bucket_info(), state.obj,
+      ofs, end, chunk_size, _get_obj_iterate_cb, &data);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+/* Walk the object's byte range [ofs, end] in pieces of at most
+ * max_chunk_size and invoke `cb` once per piece with the raw part that
+ * holds it.  `cb` returns the number of bytes it consumed; a return
+ * <= 0 ends the walk and is propagated to the caller. */
+int DB::Object::iterate_obj(const DoutPrefixProvider *dpp,
+    const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+    off_t ofs, off_t end, uint64_t max_chunk_size,
+    iterate_obj_cb cb, void *arg)
+{
+  DB *store = get_store();
+  uint64_t len;
+  RGWObjState* astate;
+
+  int r = get_state(dpp, &astate, true);
+  if (r < 0) {
+    return r;
+  }
+
+  if (!astate->exists) {
+    return -ENOENT;
+  }
+
+  if (end < 0)
+    len = 0;
+  else
+    len = end - ofs + 1;
+
+  /* XXX: Will it really help to store all parts info in astate like manifest in Rados? */
+  int part_num = 0;
+  int head_data_size = astate->data.length();
+
+  while (ofs <= end && (uint64_t)ofs < astate->size) {
+    /* index of the tail part containing the current offset */
+    part_num = (ofs / max_chunk_size);
+    uint64_t read_len = std::min(len, max_chunk_size);
+
+    /* XXX: Handle multipart_str */
+    raw_obj read_obj(store, get_bucket_info().bucket.name, astate->obj.key.name, 
+        astate->obj.key.instance, astate->obj.key.ns, obj_id, "0.0", part_num);
+    bool reading_from_head = (ofs < head_data_size);
+
+    r = cb(dpp, read_obj, ofs, read_len, reading_from_head, astate, arg);
+    if (r <= 0) {
+      return r;
+    }
+    /* r refers to chunk_len (no. of bytes) handled in cb */
+    len -= r;
+    ofs += r;
+  }
+
+  return 0;
+}
+
+int DB::Object::Write::prepare(const DoutPrefixProvider* dpp)
+{
+  DB *store = target->get_store();
+
+  int ret = -1;
+
+  /* XXX: handle assume_noent */
+
+  obj_state.obj = target->obj;
+  if (target->obj_id.empty()) {
+    if (!target->obj.key.instance.empty() && (target->obj.key.instance != "null")) {
+      /* versioned object. Set obj_id same as versionID/instance */
+      target->obj_id = target->obj.key.instance;
+    } else {
+      // generate obj_id
+      char buf[33];
+      gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+      target->obj_id = buf;
+    }
+  }
+
+  ret = 0;
+  return ret;
+}
+
+/* writes tail objects: splits `data` into pieces of at most
+ * max_chunk_size and stores each as a raw_obj part keyed by
+ * (obj_id, mp_part_str, part_num).  `ofs` is the object offset of the
+ * first byte.  Returns 0 on success, negative error otherwise. */
+int DB::Object::Write::write_data(const DoutPrefixProvider* dpp,
+                               bufferlist& data, uint64_t ofs) {
+  DB *store = target->get_store();
+  /* tail objects */
+  /* XXX: Split into parts each of max_chunk_size. But later make tail
+   * object chunk size limit to sqlite blob limit */
+  int part_num = 0;
+
+  uint64_t max_chunk_size = store->get_max_chunk_size();
+
+  /* tail_obj ofs should be greater than max_head_size */
+  if (mp_part_str == "0.0")  { // ensure not multipart meta object
+    if (ofs < store->get_max_head_size()) {
+      return -1;
+    }
+  }
+  
+  uint64_t end = data.length();
+  uint64_t write_ofs = 0;
+  /* as we are writing max_chunk_size at a time in sal_dbstore DBAtomicWriter::process(),
+   * maybe this while loop is not needed
+   */
+  while (write_ofs < end) {
+    part_num = (ofs / max_chunk_size);
+    uint64_t len = std::min(end, max_chunk_size);
+
+    /* XXX: Handle multipart_str */
+    raw_obj write_obj(store, target->get_bucket_info().bucket.name, obj_state.obj.key.name, 
+        obj_state.obj.key.instance, obj_state.obj.key.ns, target->obj_id, mp_part_str, part_num);
+
+
+    ldpp_dout(dpp, 20) << "dbstore->write obj-ofs=" << ofs << " write_len=" << len << dendl;
+
+    // write into non head object
+    int r = write_obj.write(dpp, ofs, write_ofs, len, data); 
+    if (r < 0) {
+      return r;
+    }
+    /* r refers to chunk_len (no. of bytes) handled in raw_obj::write */
+    len -= r;
+    ofs += r;
+    write_ofs += r;
+  }
+
+  return 0;
+}
+
+/* Write metadata & head object data */
+/* Persist the object's metadata (and optional inline head data) via a
+ * "PutObject" op: merges meta.* and `attrs` into the object state,
+ * applies the bucket's object-lock retention default when none is
+ * supplied, extracts etag/content-type/acl from the attrs, and flags
+ * versioned objects with FLAG_VER.  Empty-valued attrs are skipped;
+ * attrs named in meta.rmattrs are removed first. */
+int DB::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
+    uint64_t size, uint64_t accounted_size,
+    map<string, bufferlist>& attrs,
+    bool assume_noent, bool modify_tail)
+{
+  DB *store = target->get_store();
+
+  RGWObjState* state = &obj_state;
+  map<string, bufferlist> *attrset;
+  DBOpParams params = {};
+  int ret = 0;
+  string etag;
+  string content_type;
+  bufferlist acl_bl;
+  string storage_class;
+
+  map<string, bufferlist>::iterator iter;
+
+  store->InitializeParams(dpp, &params);
+  target->InitializeParamsfromObject(dpp, &params);
+
+  obj_state = params.op.obj.state;
+
+  if (real_clock::is_zero(meta.set_mtime)) {
+    meta.set_mtime = real_clock::now();
+  }
+
+  attrset = &state->attrset;
+  /* default object-lock retention from the bucket config, unless the
+   * caller already supplied a retention attr */
+  if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule()) {
+    // && meta.flags == PUT_OBJ_CREATE) {
+    auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
+    if (iter == attrs.end()) {
+      real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
+      string mode = target->bucket_info.obj_lock.get_mode();
+      RGWObjectRetention obj_retention(mode, lock_until_date);
+      bufferlist bl;
+      obj_retention.encode(bl);
+      (*attrset)[RGW_ATTR_OBJECT_RETENTION] = bl;
+    }
+  }
+
+  state->mtime = meta.set_mtime;
+
+  if (meta.data) {
+    /* if we want to overwrite the data, we also want to overwrite the
+       xattrs, so just remove the object */
+    params.op.obj.head_data = *meta.data;
+  }
+
+  if (meta.rmattrs) {
+    for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
+      const string& name = iter->first;
+      (*attrset).erase(name.c_str());
+    }
+  }
+
+  if (meta.manifest) {
+    storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
+
+    /* remove existing manifest attr */
+    iter = attrs.find(RGW_ATTR_MANIFEST);
+    if (iter != attrs.end())
+      attrs.erase(iter);
+
+    bufferlist bl;
+    encode(*meta.manifest, bl);
+    (*attrset)[RGW_ATTR_MANIFEST] = bl;
+  }
+
+  /* merge caller attrs, capturing etag / content-type / acl on the way */
+  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+    const string& name = iter->first;
+    bufferlist& bl = iter->second;
+
+    if (!bl.length())
+      continue;
+
+    (*attrset)[name.c_str()] = bl;
+
+    if (name.compare(RGW_ATTR_ETAG) == 0) {
+      etag = rgw_bl_str(bl);
+      params.op.obj.etag = etag;
+    } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
+      content_type = rgw_bl_str(bl);
+    } else if (name.compare(RGW_ATTR_ACL) == 0) {
+      acl_bl = bl;
+    }
+  }
+
+  if (!storage_class.empty()) {
+    bufferlist bl;
+    bl.append(storage_class);
+    (*attrset)[RGW_ATTR_STORAGE_CLASS] = bl;
+  }
+
+  params.op.obj.state = *state ;
+  params.op.obj.state.exists = true;
+  params.op.obj.state.size = size;
+  params.op.obj.state.accounted_size = accounted_size;
+  params.op.obj.owner = target->get_bucket_info().owner.id;
+  params.op.obj.category = meta.category;
+
+  if (meta.mtime) {
+    *meta.mtime = meta.set_mtime;
+  }
+
+  params.op.query_str = "meta";
+  params.op.obj.obj_id = target->obj_id;
+
+  /* Check if versioned */
+  bool is_versioned = !target->obj.key.instance.empty() && (target->obj.key.instance != "null");
+  params.op.obj.is_versioned = is_versioned;
+
+  if (is_versioned && (params.op.obj.category == RGWObjCategory::Main)) {
+    /* versioned object */
+    params.op.obj.flags |= rgw_bucket_dir_entry::FLAG_VER;
+  }
+  ret = store->ProcessOp(dpp, "PutObject", &params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In PutObject failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+
+out:
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: do_write_meta returned ret=" << ret << dendl;
+  }
+
+  /* NOTE(review): canceled is set unconditionally, i.e. even when the
+   * PutObject succeeded — confirm whether callers depend on this. */
+  meta.canceled = true;
+
+  return ret;
+}
+
+/* Public entry for writing object metadata; assume_noent is currently
+ * always false on this path. */
+int DB::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
+    map<string, bufferlist>& attrs)
+{
+  /* handle assume_noent */
+  return _do_write_meta(dpp, size, accounted_size, attrs,
+                        /*assume_noent=*/false, meta.modify_tail);
+}
+
+/* Delete the target object, honouring the bucket's versioning state:
+ *  - explicit version-id: delete exactly that version;
+ *  - no version-id, versioning enabled: keep the data, add a delete
+ *    marker (with a "null" id when versioning is suspended, after
+ *    removing the current entry);
+ *  - unversioned: plain delete.
+ * The -ENOENT branch below handles the case where no head row exists
+ * but older versions might. */
+int DB::Object::Delete::delete_obj(const DoutPrefixProvider *dpp) {
+  int ret = 0;
+  DBOpParams del_params = {};
+  bool versioning_enabled = ((params.versioning_status & BUCKET_VERSIONED) == BUCKET_VERSIONED); 
+  bool versioning_suspended = ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == BUCKET_VERSIONS_SUSPENDED); 
+  bool regular_obj = true;
+  std::string versionid = target->obj.key.instance;
+
+  ret = target->get_object_impl(dpp, del_params);
+
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 0)<<"GetObject during delete failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  /* delete markers etc. are not category Main */
+  regular_obj = (del_params.op.obj.category == RGWObjCategory::Main);
+  if (!ret) {
+    if (!versionid.empty()) {
+      // version-id is provided
+      ret = delete_obj_impl(dpp, del_params);
+      return ret;
+    } else { // version-id is empty..
+      /*
+       * case: bucket_versioned
+       *    create_delete_marker;
+       * case: bucket_suspended
+       *    delete entry
+       *    create delete marker with version-id null;
+       * default:
+       *   just delete the entry
+       */
+      if (versioning_suspended && regular_obj) {
+        ret = delete_obj_impl(dpp, del_params);
+        ret = create_dm(dpp, del_params);
+      } else if (versioning_enabled && regular_obj) {
+        ret = create_dm(dpp, del_params);
+      } else {
+        ret = delete_obj_impl(dpp, del_params);
+      }
+    }
+  } else { // ret == -ENOENT
+     /* case: VersionID given
+      *     return -ENOENT
+      * else: // may or may not be versioned object
+      *     Listversionedobjects
+      *     if (list_entries.empty()) {
+      *         nothing to do..return ENOENT
+      *     } else {
+      *         read top entry
+      *         if (top.flags | FLAG_DELETE_MARKER) {
+      *            // nothing to do
+      *            return -ENOENT;
+      *          }
+      *          if (bucket_versioned)  {
+      *            // create delete marker with new version-id
+      *          } else if (bucket_suspended) {
+      *            // create delete marker with version-id null
+      *          }
+      *          bucket cannot be in unversioned state post having versions
+      *     }
+      */
+     if (!versionid.empty()) {
+       return -ENOENT;
+     }
+     ret = target->list_versioned_objects(dpp, del_params.op.obj.list_entries);
+     if (ret) {
+        ldpp_dout(dpp, 0)<<"ListVersionedObjects failed err:(" <<ret<<")" << dendl;
+        return ret;
+     }
+    if (del_params.op.obj.list_entries.empty()) {
+      return -ENOENT;
+    }
+    auto &ent = del_params.op.obj.list_entries.front();
+    if (ent.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER) {
+      // for now do not create another delete marker..just exit
+      return 0;
+    }
+    ret = create_dm(dpp, del_params);
+  }
+  return ret;
+}
+
+int DB::Object::Delete::delete_obj_impl(const DoutPrefixProvider *dpp,
+                                        DBOpParams& del_params) {
+  int ret = 0;
+  DB *store = target->get_store();
+
+  ret = store->ProcessOp(dpp, "DeleteObject", &del_params);
+  if (ret) {
+    ldpp_dout(dpp, 0) << "In DeleteObject failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  /* Now that tail objects are associated with objectID, they are not deleted
+   * as part of this DeleteObj operation. Such tail objects (with no head object
+   * in *.object.table are cleaned up later by GC thread.
+   *
+   * To avoid races between writes/reads & GC delete, mtime is maintained for each
+   * tail object. This mtime is updated when tail object is written and also when
+   * its corresponding head object is deleted (like here in this case).
+   */
+  DBOpParams update_params = del_params;
+  update_params.op.obj.state.mtime = real_clock::now();
+  ret = store->ProcessOp(dpp, "UpdateObjectData", &update_params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) << "Updating tail objects mtime failed err:(" <<ret<<")" << dendl;
+  }
+  return ret;
+}
+
+/*
+ * a) if no versionID specified,
+ *  - create a delete marker with 
+ *    - new version/instanceID (if bucket versioned)
+ *    - null versionID (if versioning suspended)
+ */
+int DB::Object::Delete::create_dm(const DoutPrefixProvider *dpp,
+                                             DBOpParams& del_params) {
+
+  DB *store = target->get_store();
+  bool versioning_suspended = ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == BUCKET_VERSIONS_SUSPENDED); 
+  int ret = -1;
+  DBOpParams olh_params = {};
+  std::string version_id;
+  DBOpParams next_params = del_params;
+
+  version_id = del_params.op.obj.state.obj.key.instance;
+
+  DBOpParams dm_params = del_params;
+
+  // create delete marker
+
+  store->InitializeParams(dpp, &dm_params);
+  target->InitializeParamsfromObject(dpp, &dm_params);
+  dm_params.op.obj.category = RGWObjCategory::None;
+
+  if (versioning_suspended) {
+    dm_params.op.obj.state.obj.key.instance = "null";
+  } else {
+    store->gen_rand_obj_instance_name(&dm_params.op.obj.state.obj.key);
+    dm_params.op.obj.obj_id = dm_params.op.obj.state.obj.key.instance;
+  }
+
+  dm_params.op.obj.flags |= (rgw_bucket_dir_entry::FLAG_DELETE_MARKER);
+
+  ret = store->ProcessOp(dpp, "PutObject", &dm_params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) << "delete_olh: failed to create delete marker - err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+  result.delete_marker = true;
+  result.version_id = dm_params.op.obj.state.obj.key.instance;
+  return ret;
+}
+
+int DB::get_entry(const std::string& oid, const std::string& marker,
+                             std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry)
+{
+  int ret = 0;
+  const DoutPrefixProvider *dpp = get_def_dpp();
+
+  DBOpParams params = {};
+  InitializeParams(dpp, &params);
+
+  params.op.lc_entry.index = oid;
+  params.op.lc_entry.entry.set_bucket(marker);
+
+  params.op.query_str = "get_entry";
+  ret = ProcessOp(dpp, "GetLCEntry", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In GetLCEntry failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  if (!params.op.lc_entry.entry.get_start_time() == 0) { //ensure entry found
+    rgw::sal::Lifecycle::LCEntry* e;
+    e = new rgw::sal::StoreLifecycle::StoreLCEntry(params.op.lc_entry.entry);
+    if (!e) {
+      ret = -ENOMEM;
+      goto out;
+    }
+    entry->reset(e);
+  }
+
+out:
+  return ret;
+}
+
+int DB::get_next_entry(const std::string& oid, const std::string& marker,
+                             std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry)
+{
+  int ret = 0;
+  const DoutPrefixProvider *dpp = get_def_dpp();
+
+  DBOpParams params = {};
+  InitializeParams(dpp, &params);
+
+  params.op.lc_entry.index = oid;
+  params.op.lc_entry.entry.set_bucket(marker);
+
+  params.op.query_str = "get_next_entry";
+  ret = ProcessOp(dpp, "GetLCEntry", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In GetLCEntry failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  if (!params.op.lc_entry.entry.get_start_time() == 0) { //ensure entry found
+    rgw::sal::Lifecycle::LCEntry* e;
+    e = new rgw::sal::StoreLifecycle::StoreLCEntry(params.op.lc_entry.entry);
+    if (!e) {
+      ret = -ENOMEM;
+      goto out;
+    }
+    entry->reset(e);
+  }
+
+out:
+  return ret;
+}
+
+/* Insert (or replace) the lifecycle entry `entry` under LC index `oid`. */
+int DB::set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry)
+{
+  const DoutPrefixProvider *dpp = get_def_dpp();
+
+  DBOpParams op_params = {};
+  InitializeParams(dpp, &op_params);
+
+  op_params.op.lc_entry.index = oid;
+  op_params.op.lc_entry.entry = entry;
+
+  int ret = ProcessOp(dpp, "InsertLCEntry", &op_params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In InsertLCEntry failed err:(" <<ret<<") " << dendl;
+  }
+
+  return ret;
+}
+
+int DB::list_entries(const std::string& oid, const std::string& marker,
+                                uint32_t max_entries, std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& entries)
+{
+  int ret = 0;
+  const DoutPrefixProvider *dpp = get_def_dpp();
+
+  entries.clear();
+
+  DBOpParams params = {};
+  InitializeParams(dpp, &params);
+
+  params.op.lc_entry.index = oid;
+  params.op.lc_entry.min_marker = marker;
+  params.op.list_max_count = max_entries;
+
+  ret = ProcessOp(dpp, "ListLCEntries", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In ListLCEntries failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  for (auto& entry : params.op.lc_entry.list_entries) {
+    entries.push_back(std::make_unique<rgw::sal::StoreLifecycle::StoreLCEntry>(std::move(entry)));
+  }
+
+out:
+  return ret;
+}
+
+/* Remove the lifecycle entry `entry` from LC index `oid`. */
+int DB::rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry)
+{
+  const DoutPrefixProvider *dpp = get_def_dpp();
+
+  DBOpParams op_params = {};
+  InitializeParams(dpp, &op_params);
+
+  op_params.op.lc_entry.index = oid;
+  op_params.op.lc_entry.entry = entry;
+
+  int ret = ProcessOp(dpp, "RemoveLCEntry", &op_params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In RemoveLCEntry failed err:(" <<ret<<") " << dendl;
+  }
+
+  return ret;
+}
+
+/* Fetch the lifecycle head record of LC index `oid` into *head. */
+int DB::get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LCHead>* head)
+{
+  const DoutPrefixProvider *dpp = get_def_dpp();
+
+  DBOpParams op_params = {};
+  InitializeParams(dpp, &op_params);
+
+  op_params.op.lc_head.index = oid;
+
+  int ret = ProcessOp(dpp, "GetLCHead", &op_params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In GetLCHead failed err:(" <<ret<<") " << dendl;
+    return ret;
+  }
+
+  *head = std::make_unique<rgw::sal::StoreLifecycle::StoreLCHead>(op_params.op.lc_head.head);
+
+  return ret;
+}
+
+/* Insert (or replace) the lifecycle head record under LC index `oid`. */
+int DB::put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head)
+{
+  const DoutPrefixProvider *dpp = get_def_dpp();
+
+  DBOpParams op_params = {};
+  InitializeParams(dpp, &op_params);
+
+  op_params.op.lc_head.index = oid;
+  op_params.op.lc_head.head = head;
+
+  int ret = ProcessOp(dpp, "InsertLCHead", &op_params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In InsertLCHead failed err:(" <<ret<<") " << dendl;
+  }
+
+  return ret;
+}
+
+/* GC helper: delete tail object data of `bucket` whose mtime is older
+ * than `min_wait` (seconds, via make_timespan).  Tail mtimes are kept
+ * fresh by raw_obj::write and delete_obj_impl, so only truly orphaned
+ * parts age past the cutoff. */
+int DB::delete_stale_objs(const DoutPrefixProvider *dpp, const std::string& bucket,
+                          uint32_t min_wait) {
+  DBOpParams params = {};
+  int ret = -1;
+
+  params.op.bucket.info.bucket.name = bucket;
+  /* Verify if bucket exists.
+   * XXX: This is needed for now to create objectmap of bucket
+   * in SQLGetBucket
+   */
+  InitializeParams(dpp, &params);
+  ret = ProcessOp(dpp, "GetBucket", &params);
+  if (ret) {
+    ldpp_dout(dpp, 0) << "In GetBucket failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20) << " Deleting stale_objs of bucket( " << bucket <<")" << dendl;
+  /* XXX: handle reads racing with delete here. Simple approach is maybe
+   * to use locks or sqlite transactions.
+   */
+  InitializeParams(dpp, &params);
+  /* anything last touched before (now - min_wait) is fair game */
+  params.op.obj.state.mtime = (real_clock::now() - make_timespan(min_wait));
+  ret = ProcessOp(dpp, "DeleteStaleObjectData", &params);
+  if (ret) {
+    ldpp_dout(dpp, 0) << "In DeleteStaleObjectData failed err:(" <<ret<<")" << dendl;
+  }
+
+  return ret;
+}
+
+/* GC thread body: repeatedly walks all users' buckets (pages of `max`)
+ * and deletes each bucket's stale tail objects.  bucket_marker /
+ * user_marker persist the scan position across passes; cv.wait_for()
+ * doubles as the pacing sleep and the stop-signal poll. */
+void *DB::GC::entry() {
+  do {
+    std::unique_lock<std::mutex> lk(mtx);
+
+    ldpp_dout(dpp, 2) << " DB GC started " << dendl;
+    int max = 100;  // buckets fetched per list_buckets() page
+    RGWUserBuckets buckets;
+    bool is_truncated = false;
+
+    do {
+      std::string& marker = bucket_marker;
+      rgw_user user;
+      user.id = user_marker;
+      buckets.clear();
+      is_truncated = false;
+
+      int r = db->list_buckets(dpp, "all", user, marker, string(),
+                       max, false, &buckets, &is_truncated);
+      if (r < 0) { //do nothing? retry later ?
+        break;
+      }
+
+      for (const auto& ent : buckets.get_buckets()) {
+        const std::string &bname = ent.first;
+
+        r = db->delete_stale_objs(dpp, bname, gc_obj_min_wait);
+
+        if (r < 0) { //do nothing? skip to next entry?
+         ldpp_dout(dpp, 2) << " delete_stale_objs failed for bucket( " << bname <<")" << dendl;
+        }
+        /* remember where we are so the next pass resumes here */
+        bucket_marker = bname;
+        user_marker = user.id;
+
+        /* XXX: If using locks, unlock here and reacquire in the next iteration */
+        cv.wait_for(lk, std::chrono::milliseconds(100));
+       if (stop_signalled) {
+         goto done;
+       }
+      }
+    } while(is_truncated);
+
+    /* full pass done: restart from the beginning after the long sleep */
+    bucket_marker.clear();
+    cv.wait_for(lk, std::chrono::milliseconds(gc_interval*10));
+  } while(! stop_signalled);
+
+done:
+  return nullptr;
+}
+
+} } // namespace rgw::store
+
diff --git a/src/rgw/driver/dbstore/common/dbstore.h b/src/rgw/driver/dbstore/common/dbstore.h
new file mode 100644 (file)
index 0000000..12ab3f0
--- /dev/null
@@ -0,0 +1,2024 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef DB_STORE_H
+#define DB_STORE_H
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string>
+#include <stdio.h>
+#include <iostream>
+#include <mutex>
+#include <condition_variable>
+// this seems safe to use, at least for now--arguably, we should
+// prefer header-only fmt, in general
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include "fmt/format.h"
+#include <map>
+#include "rgw_sal_store.h"
+#include "rgw_common.h"
+#include "rgw_bucket.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "common/ceph_context.h"
+#include "rgw_obj_manifest.h"
+#include "rgw_multi.h"
+
+namespace rgw { namespace store {
+
+class DB;
+
+/* In/out payload for user-table ops: the user record plus its version
+ * tracker and xattrs (one row of the user table). */
+struct DBOpUserInfo {
+  RGWUserInfo uinfo = {};
+  obj_version user_version;     // maps to UserVersion/UserVersionTag columns
+  rgw::sal::Attrs user_attrs;   // serialized into the UserAttrs blob
+};
+
+/* In/out payload for bucket-table ops: bucket entry/info, owner, attrs,
+ * version and mtime.  min/max_marker + list_entries carry the inputs
+ * and results of bucket list queries. */
+struct DBOpBucketInfo {
+  RGWBucketEnt ent; // maybe not needed. not used in create/get_bucket
+  RGWBucketInfo info;
+  RGWUser* owner = nullptr;   // non-owning
+  rgw::sal::Attrs bucket_attrs;
+  obj_version bucket_version;
+  ceph::real_time mtime;
+  // used for list query
+  std::string min_marker;
+  std::string max_marker;
+  std::list<RGWBucketEnt> list_entries;
+};
+
+struct DBOpObjectInfo {
+  RGWAccessControlPolicy acls;
+  RGWObjState state = {};
+
+  /* Below are taken from rgw_bucket_dir_entry */
+  RGWObjCategory category;
+  std::string etag;
+  std::string owner;
+  std::string owner_display_name;
+  std::string content_type;
+  std::string storage_class;
+  bool appendable;
+  uint64_t index_ver;
+  std::string tag;
+  uint16_t flags;
+  uint64_t versioned_epoch;
+
+  /* from state.manifest (RGWObjManifest) */
+  std::map<uint64_t, RGWObjManifestPart> objs;
+  uint64_t head_size{0};
+  rgw_placement_rule head_placement_rule;
+  uint64_t max_head_size{0};
+  std::string obj_id;
+  rgw_bucket_placement tail_placement; /* might be different than the original bucket,
+                                          as object might have been copied across pools */
+  std::map<uint64_t, RGWObjManifestRule> rules;
+  std::string tail_instance; /* tail object's instance */
+
+
+  /* Obj's omap <key,value> store */
+  std::map<std::string, bufferlist> omap;
+
+  /* Extra fields */
+  bool is_multipart;
+  std::list<RGWUploadPartInfo> mp_parts;
+
+  bufferlist head_data;
+  std::string min_marker;
+  std::string max_marker;
+  std::string prefix;
+  std::list<rgw_bucket_dir_entry> list_entries;
+  /* XXX: Maybe use std::vector instead of std::list */
+
+  /* for versioned objects */
+  bool is_versioned;
+  uint64_t version_num = 0;
+};
+
+/* In/out payload for object-data (tail/chunk) ops; one row of the
+ * objectdata table: one chunk addressed by (multipart_part_str,
+ * part_num, offset).  Scalars default to 0 so a default-constructed
+ * instance carries no indeterminate values. */
+struct DBOpObjectDataInfo {
+  RGWObjState state;
+  uint64_t part_num = 0;
+  std::string multipart_part_str;
+  uint64_t offset = 0;
+  uint64_t size = 0;
+  bufferlist data{};
+};
+
+/* In/out payload for lifecycle-head ops: the LC shard index and its
+ * head record (marker + start date). */
+struct DBOpLCHeadInfo {
+  std::string index;
+  rgw::sal::StoreLifecycle::StoreLCHead head;
+};
+
+/* In/out payload for lifecycle-entry ops: the LC shard index and one
+ * entry; min_marker + list_entries serve ListLCEntries. */
+struct DBOpLCEntryInfo {
+  std::string index;
+  rgw::sal::StoreLifecycle::StoreLCEntry entry;
+  // used for list query
+  std::string min_marker;
+  std::list<rgw::sal::StoreLifecycle::StoreLCEntry> list_entries;
+};
+
+/* Aggregate op context passed through Prepare/Bind/Execute: the op name
+ * plus one payload per entity kind (only the relevant payload is used
+ * by any given op). */
+struct DBOpInfo {
+  std::string name; // Op name
+  /* Support only single access_key for now. So store
+   * it separately as primary access_key_id & secret to
+   * be able to query easily.
+   *
+   * XXX: Swift keys and subuser not supported for now */
+  DBOpUserInfo user;
+  std::string query_str;
+  DBOpBucketInfo bucket;
+  DBOpObjectInfo obj;
+  DBOpObjectDataInfo obj_data;
+  DBOpLCHeadInfo lc_head;
+  DBOpLCEntryInfo lc_entry;
+  uint64_t list_max_count = 0;  // page-size bound for list queries (was uninitialized)
+};
+
+/* Runtime parameters for one DB operation: the ceph context, the
+ * concrete table/trigger/view names to operate on, and the op payload. */
+struct DBOpParams {
+  CephContext *cct = nullptr;  // non-owning; was an uninitialized raw pointer
+
+  /* Tables */
+  std::string user_table;
+  std::string bucket_table;
+  std::string object_table;
+
+  /* Ops*/
+  DBOpInfo op;
+
+  std::string objectdata_table;
+  std::string object_trigger;
+  std::string object_view;
+  std::string quota_table;
+  std::string lc_head_table;
+  std::string lc_entry_table;
+  std::string obj;
+};
+
+/* Used for prepared schemas.
+ * Difference with above structure is that all 
+ * the fields are strings here to accommodate any
+ * style identifiers used by backend db. By default
+ * initialized with sqlitedb style; can be overridden
+ * using InitPrepareParams().
+ *
+ * These identifiers are used in prepare and bind statements
+ * to get the right index of each param.
+ */
+/* Named bind-parameter identifiers for the user table, one per column
+ * (sqlite ":name" syntax by default). */
+struct DBOpUserPrepareInfo {
+  static constexpr const char* user_id = ":user_id";
+  static constexpr const char* tenant = ":tenant";
+  static constexpr const char* ns = ":ns";
+  static constexpr const char* display_name = ":display_name";
+  static constexpr const char* user_email = ":user_email";
+  /* Support only single access_key for now. So store
+   * it separately as primary access_key_id & secret to
+   * be able to query easily.
+   *
+   * In future, when need to support & query from multiple
+   * access keys, better to maintain them in a separate table.
+   */
+  static constexpr const char* access_keys_id = ":access_keys_id";
+  static constexpr const char* access_keys_secret = ":access_keys_secret";
+  static constexpr const char* access_keys = ":access_keys";
+  static constexpr const char* swift_keys = ":swift_keys";
+  static constexpr const char* subusers = ":subusers";
+  static constexpr const char* suspended = ":suspended";
+  static constexpr const char* max_buckets = ":max_buckets";
+  static constexpr const char* op_mask = ":op_mask";
+  static constexpr const char* user_caps = ":user_caps";
+  static constexpr const char* admin = ":admin";
+  static constexpr const char* system = ":system";
+  static constexpr const char* placement_name = ":placement_name";
+  static constexpr const char* placement_storage_class = ":placement_storage_class";
+  static constexpr const char* placement_tags = ":placement_tags";
+  static constexpr const char* bucket_quota = ":bucket_quota";
+  static constexpr const char* temp_url_keys = ":temp_url_keys";
+  static constexpr const char* user_quota = ":user_quota";
+  static constexpr const char* type = ":type";
+  static constexpr const char* mfa_ids = ":mfa_ids";
+  static constexpr const char* assumed_role_arn = ":assumed_role_arn";
+  static constexpr const char* user_attrs = ":user_attrs";
+  static constexpr const char* user_ver = ":user_vers";
+  static constexpr const char* user_ver_tag = ":user_ver_tag";
+};
+
+/* Named bind-parameter identifiers for the bucket table, one per
+ * column, plus min/max_marker bounds used by list queries. */
+struct DBOpBucketPrepareInfo {
+  static constexpr const char* bucket_name = ":bucket_name";
+  static constexpr const char* tenant = ":tenant";
+  static constexpr const char* marker = ":marker";
+  static constexpr const char* bucket_id = ":bucket_id";
+  static constexpr const char* size = ":size";
+  static constexpr const char* size_rounded = ":size_rounded";
+  static constexpr const char* creation_time = ":creation_time";
+  static constexpr const char* count = ":count";
+  static constexpr const char* placement_name = ":placement_name";
+  static constexpr const char* placement_storage_class = ":placement_storage_class";
+  /* ownerid - maps to DBOpUserPrepareInfo */
+  static constexpr const char* flags = ":flags";
+  static constexpr const char* zonegroup = ":zonegroup";
+  static constexpr const char* has_instance_obj = ":has_instance_obj";
+  static constexpr const char* quota = ":quota";
+  static constexpr const char* requester_pays = ":requester_pays";
+  static constexpr const char* has_website = ":has_website";
+  static constexpr const char* website_conf = ":website_conf";
+  static constexpr const char* swift_versioning = ":swift_versioning";
+  static constexpr const char* swift_ver_location = ":swift_ver_location";
+  static constexpr const char* mdsearch_config = ":mdsearch_config";
+  static constexpr const char* new_bucket_instance_id = ":new_bucket_instance_id";
+  static constexpr const char* obj_lock = ":obj_lock";
+  static constexpr const char* sync_policy_info_groups = ":sync_policy_info_groups";
+  static constexpr const char* bucket_attrs = ":bucket_attrs";
+  static constexpr const char* bucket_ver = ":bucket_vers";
+  static constexpr const char* bucket_ver_tag = ":bucket_ver_tag";
+  static constexpr const char* mtime = ":mtime";
+  static constexpr const char* min_marker = ":min_marker";
+  static constexpr const char* max_marker = ":max_marker";
+};
+
+/* Named bind-parameter identifiers for the object (head) table, one per
+ * column, plus marker/prefix bounds for list queries and the new_obj_*
+ * identifiers used when renaming multipart meta entries on completion. */
+struct DBOpObjectPrepareInfo {
+  static constexpr const char* obj_name = ":obj_name";
+  static constexpr const char* obj_instance = ":obj_instance";
+  static constexpr const char* obj_ns  = ":obj_ns";
+  static constexpr const char* acls = ":acls";
+  static constexpr const char* index_ver = ":index_ver";
+  static constexpr const char* tag = ":tag";
+  static constexpr const char* flags = ":flags";
+  static constexpr const char* versioned_epoch = ":versioned_epoch";
+  static constexpr const char* obj_category = ":obj_category";
+  static constexpr const char* etag = ":etag";
+  static constexpr const char* owner = ":owner";
+  static constexpr const char* owner_display_name = ":owner_display_name";
+  static constexpr const char* storage_class = ":storage_class";
+  static constexpr const char* appendable = ":appendable";
+  static constexpr const char* content_type = ":content_type";
+  static constexpr const char* index_hash_source = ":index_hash_source";
+  static constexpr const char* obj_size = ":obj_size";
+  static constexpr const char* accounted_size = ":accounted_size";
+  static constexpr const char* mtime = ":mtime";
+  static constexpr const char* epoch = ":epoch";
+  static constexpr const char* obj_tag = ":obj_tag";
+  static constexpr const char* tail_tag = ":tail_tag";
+  static constexpr const char* write_tag = ":write_tag";
+  static constexpr const char* fake_tag = ":fake_tag";
+  static constexpr const char* shadow_obj = ":shadow_obj";
+  static constexpr const char* has_data = ":has_data";
+  static constexpr const char* is_versioned = ":is_versioned";
+  static constexpr const char* version_num = ":version_num";
+  static constexpr const char* pg_ver = ":pg_ver";
+  static constexpr const char* zone_short_id = ":zone_short_id";
+  static constexpr const char* obj_version = ":obj_version";
+  static constexpr const char* obj_version_tag = ":obj_version_tag";
+  static constexpr const char* obj_attrs = ":obj_attrs";
+  static constexpr const char* head_size = ":head_size";
+  static constexpr const char* max_head_size = ":max_head_size";
+  static constexpr const char* obj_id = ":obj_id";
+  static constexpr const char* tail_instance = ":tail_instance";
+  static constexpr const char* head_placement_rule_name = ":head_placement_rule_name";
+  static constexpr const char* head_placement_storage_class  = ":head_placement_storage_class";
+  static constexpr const char* tail_placement_rule_name = ":tail_placement_rule_name";
+  static constexpr const char* tail_placement_storage_class  = ":tail_placement_storage_class";
+  static constexpr const char* manifest_part_objs = ":manifest_part_objs";
+  static constexpr const char* manifest_part_rules = ":manifest_part_rules";
+  static constexpr const char* omap = ":omap";
+  static constexpr const char* is_multipart = ":is_multipart";
+  static constexpr const char* mp_parts = ":mp_parts";
+  static constexpr const char* head_data = ":head_data";
+  static constexpr const char* min_marker = ":min_marker";
+  static constexpr const char* max_marker = ":max_marker";
+  static constexpr const char* prefix = ":prefix";
+  /* Below used to update mp_parts obj name
+   * from meta object to src object on completion */
+  static constexpr const char* new_obj_name = ":new_obj_name";
+  static constexpr const char* new_obj_instance = ":new_obj_instance";
+  static constexpr const char* new_obj_ns  = ":new_obj_ns";
+};
+
+/* Named bind-parameter identifiers for the objectdata (chunk) table. */
+struct DBOpObjectDataPrepareInfo {
+  static constexpr const char* part_num = ":part_num";
+  static constexpr const char* offset = ":offset";
+  static constexpr const char* data = ":data";
+  static constexpr const char* size = ":size";
+  static constexpr const char* multipart_part_str = ":multipart_part_str";
+};
+
+/* Named bind-parameter identifiers for the lifecycle-entry table. */
+struct DBOpLCEntryPrepareInfo {
+  static constexpr const char* index = ":index";
+  static constexpr const char* bucket_name = ":bucket_name";
+  static constexpr const char* start_time = ":start_time";
+  static constexpr const char* status = ":status";
+  static constexpr const char* min_marker = ":min_marker";
+};
+
+/* Named bind-parameter identifiers for the lifecycle-head table. */
+struct DBOpLCHeadPrepareInfo {
+  static constexpr const char* index = ":index";
+  static constexpr const char* start_date = ":start_date";
+  static constexpr const char* marker = ":marker";
+};
+
+/* Bind-parameter identifiers grouped per entity; the Schema() builders
+ * below splice these markers into the query text before preparing. */
+struct DBOpPrepareInfo {
+  DBOpUserPrepareInfo user;
+  std::string_view query_str; // view into DBOpInfo::query_str
+  DBOpBucketPrepareInfo bucket;
+  DBOpObjectPrepareInfo obj;
+  DBOpObjectDataPrepareInfo obj_data;
+  DBOpLCHeadPrepareInfo lc_head;
+  DBOpLCEntryPrepareInfo lc_entry;
+  static constexpr const char* list_max_count = ":list_max_count";
+};
+
+/* Everything needed to render a prepared-statement schema: the concrete
+ * table/trigger/view names plus the per-entity bind identifiers. */
+struct DBOpPrepareParams {
+  /* Tables */
+  std::string user_table;
+  std::string bucket_table;
+  std::string object_table;
+
+  /* Ops */
+  DBOpPrepareInfo op;
+
+
+  std::string objectdata_table;
+  std::string object_trigger;
+  std::string object_view;
+  std::string quota_table;
+  std::string lc_head_table;
+  std::string lc_entry_table;
+};
+
+/* Handles to the table-level (non per-object) operations: user, bucket
+ * and lifecycle ops.  NOTE(review): presumably populated by the concrete
+ * backend (e.g. sqlite) -- the assignment is not visible here. */
+struct DBOps {
+  std::shared_ptr<class InsertUserOp> InsertUser;
+  std::shared_ptr<class RemoveUserOp> RemoveUser;
+  std::shared_ptr<class GetUserOp> GetUser;
+  std::shared_ptr<class InsertBucketOp> InsertBucket;
+  std::shared_ptr<class UpdateBucketOp> UpdateBucket;
+  std::shared_ptr<class RemoveBucketOp> RemoveBucket;
+  std::shared_ptr<class GetBucketOp> GetBucket;
+  std::shared_ptr<class ListUserBucketsOp> ListUserBuckets;
+  std::shared_ptr<class InsertLCEntryOp> InsertLCEntry;
+  std::shared_ptr<class RemoveLCEntryOp> RemoveLCEntry;
+  std::shared_ptr<class GetLCEntryOp> GetLCEntry;
+  std::shared_ptr<class ListLCEntriesOp> ListLCEntries;
+  std::shared_ptr<class  InsertLCHeadOp> InsertLCHead;
+  std::shared_ptr<class RemoveLCHeadOp> RemoveLCHead;
+  std::shared_ptr<class GetLCHeadOp> GetLCHead;
+};
+
+/* Set of data-path operations targeting a single object: head record
+ * CRUD, listing, and tail/chunk data CRUD including stale-data GC. */
+class ObjectOp {
+  public:
+    ObjectOp() {};
+
+    virtual ~ObjectOp() {}
+
+    std::shared_ptr<class PutObjectOp> PutObject;
+    std::shared_ptr<class DeleteObjectOp> DeleteObject;
+    std::shared_ptr<class GetObjectOp> GetObject;
+    std::shared_ptr<class UpdateObjectOp> UpdateObject;
+    std::shared_ptr<class ListBucketObjectsOp> ListBucketObjects;
+    std::shared_ptr<class ListVersionedObjectsOp> ListVersionedObjects;
+    std::shared_ptr<class PutObjectDataOp> PutObjectData;
+    std::shared_ptr<class UpdateObjectDataOp> UpdateObjectData;
+    std::shared_ptr<class GetObjectDataOp> GetObjectData;
+    std::shared_ptr<class DeleteObjectDataOp> DeleteObjectData;
+    std::shared_ptr<class DeleteStaleObjectDataOp> DeleteStaleObjectData;
+
+    // Backend hook to populate/prepare the ops above; default is a no-op.
+    virtual int InitializeObjectOps(std::string db_name, const DoutPrefixProvider *dpp) { return 0; }
+};
+
+class DBOp {
+  private:
+    static constexpr std::string_view CreateUserTableQ =
+      /* Corresponds to rgw::sal::User
+       *
+       * For now only UserID is made Primary key.
+       * If multiple tenants are stored in single .db handle, should
+       * make both (UserID, Tenant) as Primary Key.
+       *
+       * XXX:
+       * - AccessKeys, SwiftKeys, Subusers (map<>) are stored as blob.
+       *   To enable easy query, first accesskey is stored in separate fields
+       *   AccessKeysID, AccessKeysSecret.
+       *   In future, may be have separate table to store these keys and
+       *   query on that table.
+       * - Quota stored as blob .. should be linked to quota table.
+       */
+      "CREATE TABLE IF NOT EXISTS '{}' (       \
+      UserID TEXT NOT NULL UNIQUE,             \
+      Tenant TEXT ,            \
+      NS TEXT ,                \
+      DisplayName TEXT , \
+      UserEmail TEXT , \
+      AccessKeysID TEXT ,      \
+      AccessKeysSecret TEXT ,  \
+      AccessKeys BLOB ,        \
+      SwiftKeys BLOB , \
+      SubUsers BLOB ,          \
+      Suspended INTEGER ,      \
+      MaxBuckets INTEGER ,     \
+      OpMask   INTEGER ,       \
+      UserCaps BLOB ,          \
+      Admin    INTEGER ,       \
+      System INTEGER ,         \
+      PlacementName TEXT ,     \
+      PlacementStorageClass TEXT ,     \
+      PlacementTags BLOB ,     \
+      BucketQuota BLOB ,       \
+      TempURLKeys BLOB ,       \
+      UserQuota BLOB , \
+      TYPE INTEGER ,           \
+      MfaIDs BLOB ,    \
+      AssumedRoleARN TEXT , \
+      UserAttrs   BLOB,   \
+      UserVersion   INTEGER,    \
+      UserVersionTag TEXT,      \
+      PRIMARY KEY (UserID) \n);";
+
+    static constexpr std::string_view CreateBucketTableQ =
+      /* Corresponds to rgw::sal::Bucket
+       *  
+       *  For now only BucketName is made Primary key. Since buckets should
+       *  be unique across users in rgw, OwnerID is not made part of primary key.
+       *  However it is still referenced as foreign key
+       *
+       *  If multiple tenants are stored in single .db handle, should
+       *  make both (BucketName, Tenant) as Primary Key. Also should
+       *  reference (UserID, Tenant) as Foreign key.
+       *
+       * leaving below RADOS specific fields
+       *   - rgw_data_placement_target explicit_placement (struct rgw_bucket)
+       *   - rgw::BucketLayout layout (struct RGWBucketInfo)
+       *   - const static uint32_t NUM_SHARDS_BLIND_BUCKET (struct RGWBucketInfo),
+       *     should be '0' indicating no sharding.
+       *   - cls_rgw_reshard_status reshard_status (struct RGWBucketInfo)
+       *
+       * XXX:
+       *   - Quota stored as blob .. should be linked to quota table.
+       *   - WebsiteConf stored as BLOB..if required, should be split
+       *   - Storing bucket_version (struct RGWBucket), objv_tracker
+       *     (struct RGWBucketInfo) separately. Are they same?
+       *
+       */
+      "CREATE TABLE IF NOT EXISTS '{}' ( \
+      BucketName TEXT NOT NULL UNIQUE , \
+      Tenant TEXT,        \
+      Marker TEXT,        \
+      BucketID TEXT,      \
+      Size   INTEGER,     \
+      SizeRounded INTEGER,\
+      CreationTime BLOB,  \
+      Count  INTEGER,     \
+      PlacementName TEXT ,     \
+      PlacementStorageClass TEXT ,     \
+      OwnerID TEXT NOT NULL, \
+      Flags   INTEGER,       \
+      Zonegroup TEXT,         \
+      HasInstanceObj BOOLEAN, \
+      Quota   BLOB,       \
+      RequesterPays BOOLEAN,  \
+      HasWebsite  BOOLEAN,    \
+      WebsiteConf BLOB,   \
+      SwiftVersioning BOOLEAN, \
+      SwiftVerLocation TEXT,  \
+      MdsearchConfig  BLOB,   \
+      NewBucketInstanceID TEXT,\
+      ObjectLock BLOB, \
+      SyncPolicyInfoGroups BLOB, \
+      BucketAttrs   BLOB,   \
+      BucketVersion   INTEGER,    \
+      BucketVersionTag TEXT,      \
+      Mtime   BLOB,   \
+      PRIMARY KEY (BucketName) \
+      FOREIGN KEY (OwnerID) \
+      REFERENCES '{}' (UserID) ON DELETE CASCADE ON UPDATE CASCADE \n);";
+
+    static constexpr std::string_view CreateObjectTableTriggerQ =
+      "CREATE TRIGGER IF NOT EXISTS '{}' \
+          AFTER INSERT ON '{}' \
+       BEGIN \
+          UPDATE '{}' \
+          SET VersionNum = (SELECT COALESCE(max(VersionNum), 0) from '{}' where ObjName = new.ObjName) + 1 \
+          where ObjName = new.ObjName and ObjInstance = new.ObjInstance; \
+       END;";
+
+    static constexpr std::string_view CreateObjectTableQ =
+      /* Corresponds to rgw::sal::Object
+       *
+       *  For now only BucketName, ObjName is made Primary key.
+       *  If multiple tenants are stored in single .db handle, should
+       *  include Tenant too in the Primary Key. Also should
+       *  reference (BucketID, Tenant) as Foreign key.
+       * 
+       * referring to 
+       * - rgw_bucket_dir_entry - following are added for now
+       *   flags,
+       *   versioned_epoch
+       *   tag
+       *   index_ver
+       *   meta.category
+       *   meta.etag
+       *   meta.storageclass
+       *   meta.appendable
+       *   meta.content_type
+       *   meta.owner
+       *   meta.owner_display_name
+       *
+       * - RGWObjState. Below are omitted from that struct
+       *    as they seem in-memory variables
+       *    * is_atomic, has_atts, exists, prefetch_data, keep_tail, 
+       * - RGWObjManifest
+       *
+       * Extra field added "IsMultipart" to flag multipart uploads,
+       * HeadData to store first chunk data.
+       */
+      "CREATE TABLE IF NOT EXISTS '{}' ( \
+      ObjName TEXT NOT NULL , \
+      ObjInstance TEXT, \
+      ObjNS TEXT, \
+      BucketName TEXT NOT NULL , \
+      ACLs    BLOB,   \
+      IndexVer    INTEGER,    \
+      Tag TEXT,   \
+      Flags INTEGER, \
+      VersionedEpoch INTEGER, \
+      ObjCategory INTEGER,    \
+      Etag   TEXT,    \
+      Owner TEXT, \
+      OwnerDisplayName TEXT,  \
+      StorageClass    TEXT,   \
+      Appendable  BOOL,   \
+      ContentType TEXT,   \
+      IndexHashSource TEXT, \
+      ObjSize  INTEGER,   \
+      AccountedSize INTEGER,  \
+      Mtime   BLOB,   \
+      Epoch  INTEGER, \
+      ObjTag  BLOB,   \
+      TailTag BLOB,   \
+      WriteTag    TEXT,   \
+      FakeTag BOOL,   \
+      ShadowObj   TEXT,   \
+      HasData  BOOL,  \
+      IsVersioned BOOL,  \
+      VersionNum  INTEGER, \
+      PGVer   INTEGER, \
+      ZoneShortID  INTEGER,  \
+      ObjVersion   INTEGER,    \
+      ObjVersionTag TEXT,      \
+      ObjAttrs    BLOB,   \
+      HeadSize    INTEGER,    \
+      MaxHeadSize    INTEGER,    \
+      ObjID      TEXT NOT NULL, \
+      TailInstance  TEXT, \
+      HeadPlacementRuleName   TEXT, \
+      HeadPlacementRuleStorageClass TEXT, \
+      TailPlacementRuleName   TEXT, \
+      TailPlacementStorageClass TEXT, \
+      ManifestPartObjs    BLOB,   \
+      ManifestPartRules   BLOB,   \
+      Omap    BLOB,   \
+      IsMultipart     BOOL,   \
+      MPPartsList    BLOB,   \
+      HeadData  BLOB,   \
+      PRIMARY KEY (ObjName, ObjInstance, BucketName), \
+      FOREIGN KEY (BucketName) \
+      REFERENCES '{}' (BucketName) ON DELETE CASCADE ON UPDATE CASCADE \n);";
+
+    static constexpr std::string_view CreateObjectDataTableQ =
+      /* Extra field 'MultipartPartStr' added which signifies multipart
+       * <uploadid + partnum>. For regular object, it is '0.0'
+       *
+       *  - part: a collection of stripes that make a contiguous part of an
+       object. A regular object will only have one part (although might have
+       many stripes), a multipart object might have many parts. Each part
+       has a fixed stripe size (ObjChunkSize), although the last stripe of a
+       part might be smaller than that.
+       */
+      "CREATE TABLE IF NOT EXISTS '{}' ( \
+      ObjName TEXT NOT NULL , \
+      ObjInstance TEXT, \
+      ObjNS TEXT, \
+      BucketName TEXT NOT NULL , \
+      ObjID      TEXT NOT NULL , \
+      MultipartPartStr TEXT, \
+      PartNum  INTEGER NOT NULL, \
+      Offset   INTEGER, \
+      Size      INTEGER, \
+      Mtime  BLOB,       \
+      Data     BLOB,             \
+      PRIMARY KEY (ObjName, BucketName, ObjInstance, ObjID, MultipartPartStr, PartNum), \
+      FOREIGN KEY (BucketName) \
+      REFERENCES '{}' (BucketName) ON DELETE CASCADE ON UPDATE CASCADE \n);";
+
+    static constexpr std::string_view CreateObjectViewQ =
+      /* This query creats temporary view with entries from ObjectData table which have
+       * corresponding head object (i.e, with same ObjName, ObjInstance, ObjNS, ObjID)
+       * in the Object table.
+       *
+       * GC thread can use this view to delete stale entries from the ObjectData table which
+       * do not exist in this view.
+       *
+       * XXX: This view is throwing ForeignKey mismatch error, mostly may be because all the keys
+       * of objectdata table are not referenced here. So this view is not used atm.
+       */
+      "CREATE TEMP VIEW IF NOT EXISTS '{}' AS \
+      SELECT s.ObjName, s.ObjInstance, s.ObjID from '{}' as s INNER JOIN '{}' USING \
+      (ObjName, BucketName, ObjInstance, ObjID);";
+
+
+    static constexpr std::string_view CreateQuotaTableQ =
+      "CREATE TABLE IF NOT EXISTS '{}' ( \
+      QuotaID INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE , \
+      MaxSizeSoftThreshold INTEGER ,   \
+      MaxObjsSoftThreshold INTEGER ,   \
+      MaxSize  INTEGER ,               \
+      MaxObjects INTEGER ,             \
+      Enabled Boolean ,                \
+      CheckOnRaw Boolean \n);";
+
+    static constexpr std::string_view CreateLCEntryTableQ =
+      "CREATE TABLE IF NOT EXISTS '{}' ( \
+      LCIndex  TEXT NOT NULL , \
+      BucketName TEXT NOT NULL , \
+      StartTime  INTEGER , \
+      Status     INTEGER , \
+      PRIMARY KEY (LCIndex, BucketName) \n);";
+
+    static constexpr std::string_view CreateLCHeadTableQ =
+      "CREATE TABLE IF NOT EXISTS '{}' ( \
+      LCIndex  TEXT NOT NULL , \
+      Marker TEXT , \
+      StartDate  INTEGER , \
+      PRIMARY KEY (LCIndex) \n);";
+
+    static constexpr std::string_view DropQ = "DROP TABLE IF EXISTS '{}'";
+    static constexpr std::string_view ListAllQ = "SELECT  * from '{}'";
+
+  public:
+    DBOp() {}
+    virtual ~DBOp() {}
+    std::mutex mtx; // to protect prepared stmt
+
+    static std::string CreateTableSchema(std::string_view type,
+                                         const DBOpParams *params) {
+      if (!type.compare("User"))
+        return fmt::format(CreateUserTableQ,
+            params->user_table);
+      if (!type.compare("Bucket"))
+        return fmt::format(CreateBucketTableQ,
+            params->bucket_table,
+            params->user_table);
+      if (!type.compare("Object"))
+        return fmt::format(CreateObjectTableQ,
+            params->object_table,
+            params->bucket_table);
+      if (!type.compare("ObjectTrigger"))
+        return fmt::format(CreateObjectTableTriggerQ,
+            params->object_trigger,
+            params->object_table,
+            params->object_table,
+            params->object_table);
+      if (!type.compare("ObjectData"))
+        return fmt::format(CreateObjectDataTableQ,
+            params->objectdata_table,
+            params->bucket_table);
+      if (!type.compare("ObjectView"))
+        return fmt::format(CreateObjectTableQ,
+            params->object_view,
+            params->objectdata_table,
+            params->object_table);
+      if (!type.compare("Quota"))
+        return fmt::format(CreateQuotaTableQ,
+            params->quota_table);
+      if (!type.compare("LCHead"))
+        return fmt::format(CreateLCHeadTableQ,
+            params->lc_head_table);
+      if (!type.compare("LCEntry"))
+        return fmt::format(CreateLCEntryTableQ,
+            params->lc_entry_table,
+            params->bucket_table);
+
+      ceph_abort_msgf("incorrect table type %.*s", type.size(), type.data());
+    }
+
+    static std::string DeleteTableSchema(std::string_view table) {
+      return fmt::format(DropQ, table);
+    }
+    static std::string ListTableSchema(std::string_view table) {
+      return fmt::format(ListAllQ, table);
+    }
+
+    virtual int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; }
+    virtual int Bind(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; }
+    virtual int Execute(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; }
+};
+
+/* Inserts (or wholly replaces) one user row; Schema() renders the
+ * statement with the user table name and per-column bind markers. */
+class InsertUserOp : virtual public DBOp {
+  private:
+    /* For existing entries, -
+     * (1) INSERT or REPLACE - it will delete previous entry and then
+     * inserts new one. Since it deletes previous entries, it will
+     * trigger all foreign key cascade deletes or other triggers.
+     * (2) INSERT or UPDATE - this will set NULL values to unassigned
+     * fields.
+     * more info: https://code-examples.net/en/q/377728
+     *
+     * For now using INSERT or REPLACE. If required of updating existing
+     * record, will use another query.
+     */
+    static constexpr std::string_view Query = "INSERT OR REPLACE INTO '{}'     \
+                          (UserID, Tenant, NS, DisplayName, UserEmail, \
+                           AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
+                           SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
+                           System, PlacementName, PlacementStorageClass, PlacementTags, \
+                           BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
+                           UserAttrs, UserVersion, UserVersionTag) \
+                          VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \
+                              {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});";
+
+  public:
+    virtual ~InsertUserOp() {}
+
+    /* Splice table name + bind identifiers into the INSERT statement;
+     * argument order must match the column list above. */
+    static std::string Schema(DBOpPrepareParams &params) {
+      return fmt::format(Query, params.user_table,
+          params.op.user.user_id, params.op.user.tenant, params.op.user.ns,
+          params.op.user.display_name, params.op.user.user_email,
+          params.op.user.access_keys_id, params.op.user.access_keys_secret,
+          params.op.user.access_keys, params.op.user.swift_keys,
+          params.op.user.subusers, params.op.user.suspended,
+          params.op.user.max_buckets, params.op.user.op_mask,
+          params.op.user.user_caps, params.op.user.admin, params.op.user.system,
+          params.op.user.placement_name, params.op.user.placement_storage_class,
+          params.op.user.placement_tags, params.op.user.bucket_quota,
+          params.op.user.temp_url_keys, params.op.user.user_quota,
+          params.op.user.type, params.op.user.mfa_ids,
+          params.op.user.assumed_role_arn, params.op.user.user_attrs,
+          params.op.user.user_ver, params.op.user.user_ver_tag);
+    }
+
+};
+
+/* Deletes a single user row, keyed by UserID. */
+class RemoveUserOp: virtual public DBOp {
+  private:
+    // '{}' placeholders: user table name, then the UserID bind marker.
+    static constexpr std::string_view Query =
+      "DELETE from '{}' where UserID = {}";
+
+  public:
+    virtual ~RemoveUserOp() {}
+
+    /* Render the DELETE statement for the configured user table. */
+    static std::string Schema(DBOpPrepareParams &params) {
+      const auto& user = params.op.user;
+      return fmt::format(Query, params.user_table, user.user_id);
+    }
+};
+
+/* Fetches one user row; Schema() picks the lookup key (user_id, email
+ * or access key) based on params.op.query_str. */
+class GetUserOp: virtual public DBOp {
+  private:
+    /* If below query columns are updated, make sure to update the indexes
+     * in list_user() cbk in sqliteDB.cc */
+    static constexpr std::string_view Query = "SELECT \
+                          UserID, Tenant, NS, DisplayName, UserEmail, \
+                          AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
+                          SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
+                          System, PlacementName, PlacementStorageClass, PlacementTags, \
+                          BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
+                          UserAttrs, UserVersion, UserVersionTag from '{}' where UserID = {}";
+
+    static constexpr std::string_view QueryByEmail = "SELECT \
+                                 UserID, Tenant, NS, DisplayName, UserEmail, \
+                                 AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
+                                 SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
+                                 System, PlacementName, PlacementStorageClass, PlacementTags, \
+                                 BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
+                                 UserAttrs, UserVersion, UserVersionTag from '{}' where UserEmail = {}";
+
+    static constexpr std::string_view QueryByAccessKeys = "SELECT \
+                                      UserID, Tenant, NS, DisplayName, UserEmail, \
+                                      AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
+                                      SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
+                                      System, PlacementName, PlacementStorageClass, PlacementTags, \
+                                      BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
+                                      UserAttrs, UserVersion, UserVersionTag from '{}' where AccessKeysID = {}";
+
+    static constexpr std::string_view QueryByUserID = "SELECT \
+                                  UserID, Tenant, NS, DisplayName, UserEmail, \
+                                  AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
+                                  SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
+                                  System, PlacementName, PlacementStorageClass, PlacementTags, \
+                                  BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
+                                  UserAttrs, UserVersion, UserVersionTag \
+                                  from '{}' where UserID = {}";
+
+  public:
+    virtual ~GetUserOp() {}
+
+    /* Select the query variant by params.op.query_str; the default
+     * branch (Query) is equivalent to the "user_id" lookup. */
+    static std::string Schema(DBOpPrepareParams &params) {
+      if (params.op.query_str == "email") {
+        return fmt::format(QueryByEmail, params.user_table,
+            params.op.user.user_email);
+      } else if (params.op.query_str == "access_key") {
+        return fmt::format(QueryByAccessKeys,
+            params.user_table,
+            params.op.user.access_keys_id);
+      } else if (params.op.query_str == "user_id") {
+        return fmt::format(QueryByUserID,
+            params.user_table,
+            params.op.user.user_id);
+      } else {
+        return fmt::format(Query, params.user_table,
+            params.op.user.user_id);
+      }
+    }
+};
+
+// DBOp that inserts (or fully replaces, via INSERT OR REPLACE) one
+// bucket row.  The first {} is the bucket-table name; the remaining {}
+// are the column values/bind parameters in the order of the column list.
+class InsertBucketOp: virtual public DBOp {
+  private:
+    static constexpr std::string_view Query =
+      "INSERT OR REPLACE INTO '{}' \
+      (BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
+       Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \
+       HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
+       SwiftVersioning, SwiftVerLocation, \
+       MdsearchConfig, NewBucketInstanceID, ObjectLock, \
+       SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime) \
+      VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, \
+          {}, {}, {}, {}, {}, {}, {}, {}, {}, \
+          {}, {}, {}, {}, {}, {}, {}, {}, {}, {})";
+
+  public:
+    virtual ~InsertBucketOp() {}
+
+    // Render the INSERT; note OwnerID is filled from params.op.user.user_id
+    // (the bucket owner), not from the bucket sub-struct.  Argument order
+    // must stay in sync with the column list above.
+    static std::string Schema(DBOpPrepareParams &params) {
+      return fmt::format(Query, params.bucket_table,
+          params.op.bucket.bucket_name, params.op.bucket.tenant,
+          params.op.bucket.marker, params.op.bucket.bucket_id,
+          params.op.bucket.size, params.op.bucket.size_rounded,
+          params.op.bucket.creation_time, params.op.bucket.count,
+          params.op.bucket.placement_name, params.op.bucket.placement_storage_class,
+          params.op.user.user_id,
+          params.op.bucket.flags, params.op.bucket.zonegroup, params.op.bucket.has_instance_obj,
+          params.op.bucket.quota, params.op.bucket.requester_pays, params.op.bucket.has_website,
+          params.op.bucket.website_conf, params.op.bucket.swift_versioning,
+          params.op.bucket.swift_ver_location, params.op.bucket.mdsearch_config,
+          params.op.bucket.new_bucket_instance_id, params.op.bucket.obj_lock,
+          params.op.bucket.sync_policy_info_groups, params.op.bucket.bucket_attrs,
+          params.op.bucket.bucket_ver, params.op.bucket.bucket_ver_tag,
+          params.op.bucket.mtime);
+    }
+};
+
+// DBOp that updates a bucket row; the column set updated is chosen by
+// params.op.query_str ("info", "attrs" or "owner").
+class UpdateBucketOp: virtual public DBOp {
+  private:
+    // Updates Info, Mtime, Version
+    static constexpr std::string_view InfoQuery =
+      "UPDATE '{}' SET Tenant = {}, Marker = {}, BucketID = {}, CreationTime = {}, \
+      Count = {}, PlacementName = {}, PlacementStorageClass = {}, OwnerID = {}, Flags = {}, \
+      Zonegroup = {}, HasInstanceObj = {}, Quota = {}, RequesterPays = {}, HasWebsite = {}, \
+      WebsiteConf = {}, SwiftVersioning = {}, SwiftVerLocation = {}, MdsearchConfig = {}, \
+      NewBucketInstanceID = {}, ObjectLock = {}, SyncPolicyInfoGroups = {}, \
+      BucketVersion = {}, Mtime = {} WHERE BucketName = {}";
+    // Updates Attrs, OwnerID, Mtime, Version
+    static constexpr std::string_view AttrsQuery =
+      "UPDATE '{}' SET OwnerID = {}, BucketAttrs = {}, Mtime = {}, BucketVersion = {} \
+      WHERE BucketName = {}";
+    // Updates OwnerID, CreationTime, Mtime, Version
+    static constexpr std::string_view OwnerQuery =
+      "UPDATE '{}' SET OwnerID = {}, CreationTime = {}, Mtime = {}, BucketVersion = {} WHERE BucketName = {}";
+
+  public:
+    virtual ~UpdateBucketOp() {}
+
+    // Render the UPDATE matching query_str.  OwnerID always comes from
+    // params.op.user.user_id.  An unrecognized query_str yields an empty
+    // string (no statement).
+    static std::string Schema(DBOpPrepareParams &params) {
+      if (params.op.query_str == "info") {
+        return fmt::format(InfoQuery, params.bucket_table,
+            params.op.bucket.tenant, params.op.bucket.marker, params.op.bucket.bucket_id,
+            params.op.bucket.creation_time, params.op.bucket.count,
+            params.op.bucket.placement_name, params.op.bucket.placement_storage_class,
+            params.op.user.user_id,
+            params.op.bucket.flags, params.op.bucket.zonegroup, params.op.bucket.has_instance_obj,
+            params.op.bucket.quota, params.op.bucket.requester_pays, params.op.bucket.has_website,
+            params.op.bucket.website_conf, params.op.bucket.swift_versioning,
+            params.op.bucket.swift_ver_location, params.op.bucket.mdsearch_config,
+            params.op.bucket.new_bucket_instance_id, params.op.bucket.obj_lock,
+            params.op.bucket.sync_policy_info_groups,
+            params.op.bucket.bucket_ver, params.op.bucket.mtime,
+            params.op.bucket.bucket_name);
+      }
+      if (params.op.query_str == "attrs") {
+        return fmt::format(AttrsQuery, params.bucket_table,
+            params.op.user.user_id, params.op.bucket.bucket_attrs,
+            params.op.bucket.mtime,
+            params.op.bucket.bucket_ver, params.op.bucket.bucket_name);
+      }
+      if (params.op.query_str == "owner") {
+        return fmt::format(OwnerQuery, params.bucket_table,
+            params.op.user.user_id, params.op.bucket.creation_time,
+            params.op.bucket.mtime,
+            params.op.bucket.bucket_ver, params.op.bucket.bucket_name);
+      }
+      return "";
+    }
+};
+
+// DBOp that deletes one bucket row, keyed by BucketName.
+class RemoveBucketOp: virtual public DBOp {
+  private:
+    // {} placeholders: bucket-table name, then the bucket-name parameter.
+    static constexpr std::string_view Query =
+      "DELETE from '{}' where BucketName = {}";
+
+  public:
+    virtual ~RemoveBucketOp() = default;
+
+    // Render the DELETE statement for the configured bucket table.
+    static std::string Schema(DBOpPrepareParams &params) {
+      auto sql = fmt::format(Query, params.bucket_table,
+                             params.op.bucket.bucket_name);
+      return sql;
+    }
+};
+
+// DBOp that fetches one bucket row joined (via OwnerID = UserID) with
+// its owner's row in the user table.
+class GetBucketOp: virtual public DBOp {
+  private:
+    static constexpr std::string_view Query = "SELECT  \
+                          BucketName, BucketTable.Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
+                          Count, BucketTable.PlacementName, BucketTable.PlacementStorageClass, OwnerID, Flags, Zonegroup, \
+                          HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
+                          SwiftVersioning, SwiftVerLocation, \
+                          MdsearchConfig, NewBucketInstanceID, ObjectLock, \
+                          SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime, NS \
+                          from '{}' as BucketTable INNER JOIN '{}' ON OwnerID = UserID where BucketName = {}";
+
+  public:
+    virtual ~GetBucketOp() {}
+
+    // Render the SELECT.  Placeholder order is: bucket table, user table,
+    // bucket-name parameter — matching the {}s in Query.  (A stale
+    // commented-out call with the arguments in a different, wrong order
+    // used to sit here and has been removed.)
+    static std::string Schema(DBOpPrepareParams &params) {
+      return fmt::format(Query,
+          params.bucket_table, params.user_table,
+          params.op.bucket.bucket_name);
+    }
+};
+
+// DBOp that lists buckets, paginated by BucketName: either the buckets
+// owned by one user (default) or all buckets (query_str == "all").
+class ListUserBucketsOp: virtual public DBOp {
+  private:
+    // once we have stats also stored, may have to update this query to join
+    // these two tables.
+    static constexpr std::string_view Query = "SELECT  \
+                          BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
+                          Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \
+                          HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
+                          SwiftVersioning, SwiftVerLocation, \
+                          MdsearchConfig, NewBucketInstanceID, ObjectLock, \
+                          SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime \
+                          FROM '{}' WHERE OwnerID = {} AND BucketName > {} ORDER BY BucketName ASC LIMIT {}";
+
+    /* Bucket names are unique across users, so OwnerID is not needed as a
+     * marker or for ordering in the query below.
+     */
+    static constexpr std::string_view AllQuery = "SELECT  \
+                          BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
+                          Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \
+                          HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
+                          SwiftVersioning, SwiftVerLocation, \
+                          MdsearchConfig, NewBucketInstanceID, ObjectLock, \
+                          SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime \
+                          FROM '{}' WHERE BucketName > {} ORDER BY BucketName ASC LIMIT {}";
+
+  public:
+    virtual ~ListUserBucketsOp() {}
+
+    // Render the paginated listing: min_marker is the exclusive start key
+    // and list_max_count the page size (LIMIT).
+    static std::string Schema(DBOpPrepareParams &params) {
+      if (params.op.query_str == "all") {
+        return fmt::format(AllQuery, params.bucket_table,
+          params.op.bucket.min_marker,
+          params.op.list_max_count);
+      } else {
+        return fmt::format(Query, params.bucket_table,
+          params.op.user.user_id, params.op.bucket.min_marker,
+          params.op.list_max_count);
+      }
+    }
+};
+
+// DBOp that inserts (or replaces) an object's head/metadata row in the
+// per-bucket object table.  The first {} is the object-table name; the
+// remaining {}s are values in the exact order of the column list.
+class PutObjectOp: virtual public DBOp {
+  private:
+    static constexpr std::string_view Query =
+      "INSERT OR REPLACE INTO '{}' \
+      (ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
+       Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
+       StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
+       AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
+       ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
+       ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
+       ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
+       TailPlacementRuleName, TailPlacementStorageClass, \
+       ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \
+       HeadData)     \
+      VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \
+          {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \
+          {}, {}, {}, \
+          {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})";
+
+  public:
+    virtual ~PutObjectOp() {}
+
+    // Render the INSERT.  Argument order below must stay in lock-step with
+    // the column list in Query; any column change also requires updating
+    // the corresponding reader indexes in the sqlite backend.
+    static std::string Schema(DBOpPrepareParams &params) {
+      return fmt::format(Query,
+          params.object_table, params.op.obj.obj_name,
+          params.op.obj.obj_instance, params.op.obj.obj_ns,
+          params.op.bucket.bucket_name, params.op.obj.acls, params.op.obj.index_ver,
+          params.op.obj.tag, params.op.obj.flags, params.op.obj.versioned_epoch,
+          params.op.obj.obj_category, params.op.obj.etag, params.op.obj.owner,
+          params.op.obj.owner_display_name, params.op.obj.storage_class,
+          params.op.obj.appendable, params.op.obj.content_type,
+          params.op.obj.index_hash_source, params.op.obj.obj_size,
+          params.op.obj.accounted_size, params.op.obj.mtime,
+          params.op.obj.epoch, params.op.obj.obj_tag, params.op.obj.tail_tag,
+          params.op.obj.write_tag, params.op.obj.fake_tag, params.op.obj.shadow_obj,
+          params.op.obj.has_data, params.op.obj.is_versioned,
+          params.op.obj.version_num,
+          params.op.obj.pg_ver, params.op.obj.zone_short_id,
+          params.op.obj.obj_version, params.op.obj.obj_version_tag,
+          params.op.obj.obj_attrs, params.op.obj.head_size,
+          params.op.obj.max_head_size, params.op.obj.obj_id,
+          params.op.obj.tail_instance,
+          params.op.obj.head_placement_rule_name,
+          params.op.obj.head_placement_storage_class,
+          params.op.obj.tail_placement_rule_name,
+          params.op.obj.tail_placement_storage_class,
+          params.op.obj.manifest_part_objs,
+          params.op.obj.manifest_part_rules, params.op.obj.omap,
+          params.op.obj.is_multipart, params.op.obj.mp_parts,
+          params.op.obj.head_data);
+    }
+};
+
+// DBOp that deletes one object row, keyed by (bucket, name, instance).
+class DeleteObjectOp: virtual public DBOp {
+  private:
+    // {} placeholders: object-table name, bucket name, object name,
+    // object instance (version).
+    static constexpr std::string_view Query =
+      "DELETE from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {}";
+
+  public:
+    virtual ~DeleteObjectOp() = default;
+
+    // Render the DELETE statement for the per-bucket object table.
+    static std::string Schema(DBOpPrepareParams &params) {
+      auto sql = fmt::format(Query,
+                             params.object_table,
+                             params.op.bucket.bucket_name,
+                             params.op.obj.obj_name,
+                             params.op.obj.obj_instance);
+      return sql;
+    }
+};
+
+// DBOp that fetches a single object's head/metadata row, keyed by
+// (bucket, name, instance).
+class GetObjectOp: virtual public DBOp {
+  private:
+    // NOTE(review): column order appears to mirror PutObjectOp; readers
+    // index columns positionally in the sqlite backend — confirm before
+    // editing this list.
+    static constexpr std::string_view Query =
+      "SELECT  \
+      ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
+      Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
+      StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
+      AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
+      ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
+      ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
+      ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
+      TailPlacementRuleName, TailPlacementStorageClass, \
+      ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \
+      HeadData from '{}' \
+      where BucketName = {} and ObjName = {} and ObjInstance = {}";
+
+  public:
+    virtual ~GetObjectOp() {}
+
+    // Render the SELECT: object table, then the three key parameters.
+    static std::string Schema(DBOpPrepareParams &params) {
+      return fmt::format(Query,
+          params.object_table,
+          params.op.bucket.bucket_name,
+          params.op.obj.obj_name,
+          params.op.obj.obj_instance);
+    }
+};
+
+// DBOp that lists objects in a bucket, starting at min_marker, filtered
+// by a LIKE prefix pattern, ordered by name ASC (versions newest first)
+// and limited to list_max_count rows.
+class ListBucketObjectsOp: virtual public DBOp {
+  private:
+    // once we have stats also stored, may have to update this query to join
+    // these two tables.
+    static constexpr std::string_view Query =
+      "SELECT  \
+      ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
+      Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
+      StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
+      AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
+      ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
+      ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
+      ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
+      TailPlacementRuleName, TailPlacementStorageClass, \
+      ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, HeadData from '{}' \
+      where BucketName = {} and ObjName >= {} and ObjName LIKE {} ORDER BY ObjName ASC, VersionNum DESC LIMIT {}";
+  public:
+    virtual ~ListBucketObjectsOp() {}
+
+    // Render the listing query: table, bucket, marker (inclusive),
+    // prefix pattern, page size.
+    static std::string Schema(DBOpPrepareParams &params) {
+      /* XXX: Include obj_id, delim */
+      return fmt::format(Query,
+          params.object_table,
+          params.op.bucket.bucket_name,
+          params.op.obj.min_marker,
+          params.op.obj.prefix,
+          params.op.list_max_count);
+    }
+};
+
+// Presumably the default cap for versioned-object listings; not referenced
+// in the query below — TODO confirm where callers use it.
+#define MAX_VERSIONED_OBJECTS 20
+// DBOp that lists all versions of one object (newest VersionNum first),
+// limited to list_max_count rows.
+class ListVersionedObjectsOp: virtual public DBOp {
+  private:
+    // once we have stats also stored, may have to update this query to join
+    // these two tables.
+    static constexpr std::string_view Query =
+      "SELECT  \
+      ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
+      Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
+      StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
+      AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
+      ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
+      ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
+      ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
+      TailPlacementRuleName, TailPlacementStorageClass, \
+      ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \
+      HeadData from '{}' \
+      where BucketName = {} and ObjName = {} ORDER BY VersionNum DESC LIMIT {}";
+  public:
+    virtual ~ListVersionedObjectsOp() {}
+
+    // Render the version-listing query: table, bucket, object name, limit.
+    static std::string Schema(DBOpPrepareParams &params) {
+      /* XXX: Include obj_id, delim */
+      return fmt::format(Query,
+          params.object_table,
+          params.op.bucket.bucket_name,
+          params.op.obj.obj_name,
+          params.op.list_max_count);
+    }
+};
+
+// DBOp that updates an object row; params.op.query_str selects which
+// column set is written: "omap", "attrs", "mp" (multipart parts list)
+// or "meta" (full metadata rewrite).
+class UpdateObjectOp: virtual public DBOp {
+  private:
+    // Updates Omap
+    static constexpr std::string_view OmapQuery =
+      "UPDATE '{}' SET Omap = {}, Mtime = {} \
+      where BucketName = {} and ObjName = {} and ObjInstance = {}";
+    static constexpr std::string_view AttrsQuery =
+      "UPDATE '{}' SET ObjAttrs = {}, Mtime = {}  \
+      where BucketName = {} and ObjName = {} and ObjInstance = {}";
+    static constexpr std::string_view MPQuery =
+      "UPDATE '{}' SET MPPartsList = {}, Mtime = {}  \
+      where BucketName = {} and ObjName = {} and ObjInstance = {}";
+    // Full metadata update: every non-key column; WHERE clause keys on
+    // (ObjName, ObjInstance, BucketName) — note the key order differs
+    // from the other three queries.
+    static constexpr std::string_view MetaQuery =
+      "UPDATE '{}' SET \
+       ObjNS = {}, ACLs = {}, IndexVer = {}, Tag = {}, Flags = {}, VersionedEpoch = {}, \
+       ObjCategory = {}, Etag = {}, Owner = {}, OwnerDisplayName = {}, \
+       StorageClass = {}, Appendable = {}, ContentType = {}, \
+       IndexHashSource = {}, ObjSize = {}, AccountedSize = {}, Mtime = {}, \
+       Epoch = {}, ObjTag = {}, TailTag = {}, WriteTag = {}, FakeTag = {}, \
+       ShadowObj = {}, HasData = {}, IsVersioned = {}, VersionNum = {}, PGVer = {}, \
+       ZoneShortID = {}, ObjVersion = {}, ObjVersionTag = {}, ObjAttrs = {}, \
+       HeadSize = {}, MaxHeadSize = {}, ObjID = {}, TailInstance = {}, \
+       HeadPlacementRuleName = {}, HeadPlacementRuleStorageClass = {}, \
+       TailPlacementRuleName = {}, TailPlacementStorageClass = {}, \
+       ManifestPartObjs = {}, ManifestPartRules = {}, Omap = {}, \
+       IsMultipart = {}, MPPartsList = {}, HeadData = {} \
+       WHERE ObjName = {} and ObjInstance = {} and BucketName = {}";
+
+  public:
+    virtual ~UpdateObjectOp() {}
+
+    // Render the UPDATE matching query_str; an unrecognized value yields
+    // an empty string.  Argument order must track the SET/WHERE {}s.
+    static std::string Schema(DBOpPrepareParams &params) {
+      if (params.op.query_str == "omap") {
+        return fmt::format(OmapQuery,
+            params.object_table, params.op.obj.omap,
+            params.op.obj.mtime,
+            params.op.bucket.bucket_name,
+            params.op.obj.obj_name,
+            params.op.obj.obj_instance);
+      }
+      if (params.op.query_str == "attrs") {
+        return fmt::format(AttrsQuery,
+            params.object_table, params.op.obj.obj_attrs,
+            params.op.obj.mtime,
+            params.op.bucket.bucket_name,
+            params.op.obj.obj_name,
+            params.op.obj.obj_instance);
+      }
+      if (params.op.query_str == "mp") {
+        return fmt::format(MPQuery,
+            params.object_table, params.op.obj.mp_parts,
+            params.op.obj.mtime,
+            params.op.bucket.bucket_name,
+            params.op.obj.obj_name,
+            params.op.obj.obj_instance);
+      }
+      if (params.op.query_str == "meta") {
+        return fmt::format(MetaQuery,
+          params.object_table,
+          params.op.obj.obj_ns, params.op.obj.acls, params.op.obj.index_ver,
+          params.op.obj.tag, params.op.obj.flags, params.op.obj.versioned_epoch,
+          params.op.obj.obj_category, params.op.obj.etag, params.op.obj.owner,
+          params.op.obj.owner_display_name, params.op.obj.storage_class,
+          params.op.obj.appendable, params.op.obj.content_type,
+          params.op.obj.index_hash_source, params.op.obj.obj_size,
+          params.op.obj.accounted_size, params.op.obj.mtime,
+          params.op.obj.epoch, params.op.obj.obj_tag, params.op.obj.tail_tag,
+          params.op.obj.write_tag, params.op.obj.fake_tag, params.op.obj.shadow_obj,
+          params.op.obj.has_data, params.op.obj.is_versioned, params.op.obj.version_num,
+          params.op.obj.pg_ver, params.op.obj.zone_short_id,
+          params.op.obj.obj_version, params.op.obj.obj_version_tag,
+          params.op.obj.obj_attrs, params.op.obj.head_size,
+          params.op.obj.max_head_size, params.op.obj.obj_id,
+          params.op.obj.tail_instance,
+          params.op.obj.head_placement_rule_name,
+          params.op.obj.head_placement_storage_class,
+          params.op.obj.tail_placement_rule_name,
+          params.op.obj.tail_placement_storage_class,
+          params.op.obj.manifest_part_objs,
+          params.op.obj.manifest_part_rules, params.op.obj.omap,
+          params.op.obj.is_multipart, params.op.obj.mp_parts,
+          params.op.obj.head_data, 
+          params.op.obj.obj_name, params.op.obj.obj_instance,
+          params.op.bucket.bucket_name);
+      }
+      return "";
+    }
+};
+
+// DBOp that inserts (or replaces) one object-data chunk row: a slice of
+// an object's payload identified by (multipart part string, part number,
+// offset).
+class PutObjectDataOp: virtual public DBOp {
+  private:
+    static constexpr std::string_view Query =
+      "INSERT OR REPLACE INTO '{}' \
+      (ObjName, ObjInstance, ObjNS, BucketName, ObjID, MultipartPartStr, PartNum, Offset, Size, Mtime, Data) \
+      VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})";
+
+  public:
+    virtual ~PutObjectDataOp() {}
+
+    // Render the INSERT into the per-bucket objectdata table; argument
+    // order matches the column list above.
+    static std::string Schema(DBOpPrepareParams &params) {
+      return fmt::format(Query,
+          params.objectdata_table,
+          params.op.obj.obj_name, params.op.obj.obj_instance,
+          params.op.obj.obj_ns,
+          params.op.bucket.bucket_name,
+          params.op.obj.obj_id,
+          params.op.obj_data.multipart_part_str,
+          params.op.obj_data.part_num,
+          params.op.obj_data.offset,
+          params.op.obj_data.size,
+          params.op.obj.mtime,
+          params.op.obj_data.data);
+    }
+};
+
+/* XXX: Recheck if this is really needed */
+class UpdateObjectDataOp: virtual public DBOp {
+  private:
+    static constexpr std::string_view Query =
+      "UPDATE '{}' \
+      SET Mtime = {} WHERE ObjName = {} and ObjInstance = {} and \
+      BucketName = {} and ObjID = {}";
+
+  public:
+    virtual ~UpdateObjectDataOp() {}
+
+    static std::string Schema(DBOpPrepareParams &params) {
+      return fmt::format(Query,
+          params.objectdata_table,
+          params.op.obj.mtime,
+          params.op.obj.obj_name, params.op.obj.obj_instance,
+          params.op.bucket.bucket_name,
+          params.op.obj.obj_id);
+    }
+};
+
+// DBOp that reads all data-chunk rows of one object, ordered by
+// multipart part then part number so the payload streams in order.
+class GetObjectDataOp: virtual public DBOp {
+  private:
+    static constexpr std::string_view Query =
+      "SELECT  \
+      ObjName, ObjInstance, ObjNS, BucketName, ObjID, MultipartPartStr, PartNum, Offset, Size, Mtime, Data \
+      from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {} and ObjID = {} ORDER BY MultipartPartStr, PartNum";
+
+  public:
+    virtual ~GetObjectDataOp() {}
+
+    // Render the SELECT: objectdata table, then the four key parameters.
+    static std::string Schema(DBOpPrepareParams &params) {
+      return fmt::format(Query,
+          params.objectdata_table,
+          params.op.bucket.bucket_name,
+          params.op.obj.obj_name,
+          params.op.obj.obj_instance,
+          params.op.obj.obj_id);
+    }
+};
+
+class DeleteObjectDataOp: virtual public DBOp {
+  private:
+    static constexpr std::string_view Query =
+      "DELETE from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {} and ObjID = {}";
+
+  public:
+    virtual ~DeleteObjectDataOp() {}
+
+    static std::string Schema(DBOpPrepareParams &params) {
+      return fmt::format(Query,
+          params.objectdata_table,
+          params.op.bucket.bucket_name,
+          params.op.obj.obj_name,
+          params.op.obj.obj_instance,
+          params.op.obj.obj_id);
+    }
+};
+
+// DBOp that garbage-collects orphaned data chunks: deletes objectdata
+// rows whose (ObjName, ObjInstance, ObjID) no longer matches any row in
+// the object table AND whose Mtime is older than the given cutoff.
+class DeleteStaleObjectDataOp: virtual public DBOp {
+  private:
+    // {} order: objectdata table (delete target), objectdata table again
+    // (the 's' alias in the subquery), object table (join partner).
+    static constexpr std::string_view Query =
+      "DELETE from '{}' WHERE (ObjName, ObjInstance, ObjID) NOT IN (SELECT s.ObjName, s.ObjInstance, s.ObjID from '{}' as s INNER JOIN '{}' USING (ObjName, BucketName, ObjInstance, ObjID)) and Mtime < {}";
+
+  public:
+    virtual ~DeleteStaleObjectDataOp() {}
+
+    // Render the cleanup statement; note objectdata_table is passed twice
+    // on purpose (outer DELETE and inner subquery scan the same table).
+    static std::string Schema(DBOpPrepareParams &params) {
+      return fmt::format(Query,
+          params.objectdata_table,
+          params.objectdata_table,
+          params.object_table,
+          params.op.obj.mtime);
+    }
+};
+
+// DBOp that inserts (or replaces) one lifecycle entry row.
+class InsertLCEntryOp: virtual public DBOp {
+  private:
+    // {} order: lc-entry table, then index, bucket, start time, status.
+    static constexpr std::string_view Query =
+      "INSERT OR REPLACE INTO '{}' \
+      (LCIndex, BucketName, StartTime, Status) \
+      VALUES ({}, {}, {}, {})";
+
+  public:
+    virtual ~InsertLCEntryOp() = default;
+
+    // Render the INSERT for the lifecycle entry table.
+    static std::string Schema(DBOpPrepareParams &params) {
+      auto sql = fmt::format(Query, params.lc_entry_table,
+                             params.op.lc_entry.index,
+                             params.op.lc_entry.bucket_name,
+                             params.op.lc_entry.start_time,
+                             params.op.lc_entry.status);
+      return sql;
+    }
+};
+
+// DBOp that deletes one lifecycle entry, keyed by (LCIndex, BucketName).
+class RemoveLCEntryOp: virtual public DBOp {
+  private:
+    // {} order: lc-entry table, lc index, bucket name.
+    static constexpr std::string_view Query =
+      "DELETE from '{}' where LCIndex = {} and BucketName = {}";
+
+  public:
+    virtual ~RemoveLCEntryOp() = default;
+
+    // Render the DELETE for the lifecycle entry table.
+    static std::string Schema(DBOpPrepareParams &params) {
+      auto sql = fmt::format(Query, params.lc_entry_table,
+                             params.op.lc_entry.index,
+                             params.op.lc_entry.bucket_name);
+      return sql;
+    }
+};
+
+class GetLCEntryOp: virtual public DBOp {
+  private:
+    static constexpr std::string_view Query = "SELECT  \
+                          LCIndex, BucketName, StartTime, Status \
+                          from '{}' where LCIndex = {} and BucketName = {}";
+    static constexpr std::string_view NextQuery = "SELECT  \
+                          LCIndex, BucketName, StartTime, Status \
+                          from '{}' where LCIndex = {} and BucketName > {} ORDER BY BucketName ASC";
+
+  public:
+    virtual ~GetLCEntryOp() {}
+
+    static std::string Schema(DBOpPrepareParams &params) {
+      if (params.op.query_str == "get_next_entry") {
+        return fmt::format(NextQuery, params.lc_entry_table,
+            params.op.lc_entry.index, params.op.lc_entry.bucket_name);
+      }
+      // default 
+      return fmt::format(Query, params.lc_entry_table,
+          params.op.lc_entry.index, params.op.lc_entry.bucket_name);
+    }
+};
+
+// DBOp that lists lifecycle entries for one LCIndex, paginated by
+// bucket name (exclusive min_marker) and capped at list_max_count.
+class ListLCEntriesOp: virtual public DBOp {
+  private:
+    static constexpr std::string_view Query = "SELECT  \
+                          LCIndex, BucketName, StartTime, Status \
+                          FROM '{}' WHERE LCIndex = {} AND BucketName > {} ORDER BY BucketName ASC LIMIT {}";
+
+  public:
+    virtual ~ListLCEntriesOp() = default;
+
+    // Render the listing query: table, index, marker, page size.
+    static std::string Schema(DBOpPrepareParams &params) {
+      auto sql = fmt::format(Query, params.lc_entry_table,
+                             params.op.lc_entry.index,
+                             params.op.lc_entry.min_marker,
+                             params.op.list_max_count);
+      return sql;
+    }
+};
+
+// DBOp that inserts (or replaces) the lifecycle head row for an index.
+class InsertLCHeadOp: virtual public DBOp {
+  private:
+    // {} order: lc-head table, then index, marker, start date.
+    static constexpr std::string_view Query =
+      "INSERT OR REPLACE INTO '{}' \
+      (LCIndex, Marker, StartDate) \
+      VALUES ({}, {}, {})";
+
+  public:
+    virtual ~InsertLCHeadOp() = default;
+
+    // Render the INSERT for the lifecycle head table.
+    static std::string Schema(DBOpPrepareParams &params) {
+      auto sql = fmt::format(Query, params.lc_head_table,
+                             params.op.lc_head.index,
+                             params.op.lc_head.marker,
+                             params.op.lc_head.start_date);
+      return sql;
+    }
+};
+
+// DBOp that deletes the lifecycle head row for one LCIndex.
+class RemoveLCHeadOp: virtual public DBOp {
+  private:
+    // {} order: lc-head table, lc index.
+    static constexpr std::string_view Query =
+      "DELETE from '{}' where LCIndex = {}";
+
+  public:
+    virtual ~RemoveLCHeadOp() = default;
+
+    // Render the DELETE for the lifecycle head table.
+    static std::string Schema(DBOpPrepareParams &params) {
+      auto sql = fmt::format(Query, params.lc_head_table,
+                             params.op.lc_head.index);
+      return sql;
+    }
+};
+
+// DBOp that reads the lifecycle head row for one LCIndex.
+class GetLCHeadOp: virtual public DBOp {
+  private:
+    static constexpr std::string_view Query = "SELECT  \
+                          LCIndex, Marker, StartDate \
+                          from '{}' where LCIndex = {}";
+
+  public:
+    virtual ~GetLCHeadOp() = default;
+
+    // Render the SELECT: lc-head table, then the index parameter.
+    static std::string Schema(DBOpPrepareParams &params) {
+      auto sql = fmt::format(Query, params.lc_head_table,
+                             params.op.lc_head.index);
+      return sql;
+    }
+};
+
+/* taken from rgw_rados.h::RGWOLHInfo */
+// OLH (object logical head) state for a versioned object: the object the
+// head currently points at, and whether the head has been removed.
+struct DBOLHInfo {
+  rgw_obj target;   // object the logical head currently resolves to
+  bool removed;     // true once the logical head has been deleted
+  DBOLHInfo() : removed(false) {}
+  // Ceph-style encoder: version 1, compat 1; field order must match decode().
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(target, bl);
+    encode(removed, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Decode counterpart — same field order as encode().
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(target, bl);
+    decode(removed, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(DBOLHInfo)
+
+class DB {
+  private:
+    const std::string db_name;
+    rgw::sal::Driver* driver;
+    const std::string user_table;
+    const std::string bucket_table;
+    const std::string quota_table;
+    const std::string lc_head_table;
+    const std::string lc_entry_table;
+    static std::map<std::string, class ObjectOp*> objectmap;
+
+  protected:
+    void *db;
+    CephContext *cct;
+    const DoutPrefix dp;
+    uint64_t max_bucket_id = 0;
+    // XXX: default ObjStripeSize or ObjChunk size - 4M, make them configurable?
+    uint64_t ObjHeadSize = 1024; /* 1K - default head data size */
+    uint64_t ObjChunkSize = (get_blob_limit() - 1000); /* 1000 to accommodate other fields */
+    // Below mutex is to protect objectmap and other shared
+    // objects if any.
+    std::mutex mtx;
+
+  public:
+    DB(std::string db_name, CephContext *_cct) : db_name(db_name),
+    user_table(db_name+"_user_table"),
+    bucket_table(db_name+"_bucket_table"),
+    quota_table(db_name+"_quota_table"),
+    lc_head_table(db_name+"_lc_head_table"),
+    lc_entry_table(db_name+"_lc_entry_table"),
+    cct(_cct),
+    dp(_cct, ceph_subsys_rgw, "rgw DBStore backend: ")
+  {}
+    /* DB() {}*/
+
+    DB(CephContext *_cct) : db_name("default_db"),
+    user_table(db_name+"_user_table"),
+    bucket_table(db_name+"_bucket_table"),
+    quota_table(db_name+"_quota_table"),
+    lc_head_table(db_name+"_lc_head_table"),
+    lc_entry_table(db_name+"_lc_entry_table"),
+    cct(_cct),
+    dp(_cct, ceph_subsys_rgw, "rgw DBStore backend: ")
+  {}
+    virtual    ~DB() {}
+
+    // Accessors for the derived database/table names.  Per-bucket object
+    // tables/views/triggers embed the bucket name in the identifier.
+    const std::string getDBname() { return db_name; }
+    const std::string getDBfile() { return db_name + ".db"; }
+    const std::string getUserTable() { return user_table; }
+    const std::string getBucketTable() { return bucket_table; }
+    const std::string getQuotaTable() { return quota_table; }
+    const std::string getLCHeadTable() { return lc_head_table; }
+    const std::string getLCEntryTable() { return lc_entry_table; }
+    const std::string getObjectTable(std::string bucket) {
+      return db_name+"_"+bucket+"_object_table"; }
+    const std::string getObjectDataTable(std::string bucket) {
+      return db_name+"_"+bucket+"_objectdata_table"; }
+    const std::string getObjectView(std::string bucket) {
+      return db_name+"_"+bucket+"_object_view"; }
+    const std::string getObjectTrigger(std::string bucket) {
+      return db_name+"_"+bucket+"_object_trigger"; }
+
+    // Snapshot of the static per-bucket ObjectOp map (protected by 'mtx').
+    std::map<std::string, class ObjectOp*> getObjectMap();
+
+    struct DBOps dbops; // DB operations, make it private?
+
+    // Late-bind the owning SAL driver (set after construction).
+    void set_driver(rgw::sal::Driver* _driver) {
+      driver = _driver;
+    }
+
+    void set_context(CephContext *_cct) {
+      cct = _cct;
+    }
+
+    CephContext *ctx() { return cct; }
+    // Default log-prefix provider for callers that don't supply their own dpp.
+    const DoutPrefixProvider *get_def_dpp() { return &dp; }
+
+    /* DB lifecycle: open the backend, create tables, and tear down. */
+    int Initialize(std::string logfile, int loglevel);
+    int Destroy(const DoutPrefixProvider *dpp);
+    /* Coarse DB-wide lock used to serialize access where needed. */
+    int LockInit(const DoutPrefixProvider *dpp);
+    int LockDestroy(const DoutPrefixProvider *dpp);
+    int Lock(const DoutPrefixProvider *dpp);
+    int Unlock(const DoutPrefixProvider *dpp);
+
+    /* Generic operation dispatch: fill params, then execute a named Op. */
+    int InitializeParams(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int ProcessOp(const DoutPrefixProvider *dpp, std::string_view Op, DBOpParams *params);
+    std::shared_ptr<class DBOp> getDBOp(const DoutPrefixProvider *dpp, std::string_view Op, const DBOpParams *params);
+    /* Maintain the per-bucket ObjectOp map (guarded by 'mtx'). */
+    int objectmapInsert(const DoutPrefixProvider *dpp, std::string bucket, class ObjectOp* ptr);
+    int objectmapDelete(const DoutPrefixProvider *dpp, std::string bucket);
+
+    /* Backend-specific hooks overridden by concrete stores (e.g. SQLite). */
+    virtual uint64_t get_blob_limit() { return 0; };
+    virtual void *openDB(const DoutPrefixProvider *dpp) { return NULL; }
+    virtual int closeDB(const DoutPrefixProvider *dpp) { return 0; }
+    virtual int createTables(const DoutPrefixProvider *dpp) { return 0; }
+    virtual int InitializeDBOps(const DoutPrefixProvider *dpp) { return 0; }
+    virtual int InitPrepareParams(const DoutPrefixProvider *dpp,
+                                  DBOpPrepareParams &p_params,
+                                  DBOpParams* params) = 0;
+    virtual int createLCTables(const DoutPrefixProvider *dpp) = 0;
+
+    virtual int ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params) = 0;
+    virtual int ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params) = 0;
+    virtual int ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params) = 0;
+
+    /* User CRUD keyed by an arbitrary query string/value pair. */
+    int get_user(const DoutPrefixProvider *dpp,
+        const std::string& query_str, const std::string& query_str_val,
+        RGWUserInfo& uinfo, std::map<std::string, bufferlist> *pattrs,
+        RGWObjVersionTracker *pobjv_tracker);
+    int store_user(const DoutPrefixProvider *dpp,
+        RGWUserInfo& uinfo, bool exclusive, std::map<std::string, bufferlist> *pattrs,
+        RGWObjVersionTracker *pobjv_tracker, RGWUserInfo* pold_info);
+    int remove_user(const DoutPrefixProvider *dpp,
+        RGWUserInfo& uinfo, RGWObjVersionTracker *pobjv_tracker);
+    /* Bucket CRUD; out-params are optional (may be nullptr). */
+    int get_bucket_info(const DoutPrefixProvider *dpp, const std::string& query_str,
+        const std::string& query_str_val,
+        RGWBucketInfo& info, rgw::sal::Attrs* pattrs, ceph::real_time* pmtime,
+        obj_version* pbucket_version);
+    int create_bucket(const DoutPrefixProvider *dpp,
+        const RGWUserInfo& owner, rgw_bucket& bucket,
+        const std::string& zonegroup_id,
+        const rgw_placement_rule& placement_rule,
+        const std::string& swift_ver_location,
+        const RGWQuotaInfo * pquota_info,
+        std::map<std::string, bufferlist>& attrs,
+        RGWBucketInfo& info,
+        obj_version *pobjv,
+        obj_version *pep_objv,
+        real_time creation_time,
+        rgw_bucket *pmaster_bucket,
+        uint32_t *pmaster_num_shards,
+        optional_yield y,
+        bool exclusive);
+
+    // NOTE(review): increments the shared uint64_t counter without taking
+    // 'mtx' and narrows the result to int -- confirm callers serialize
+    // bucket creation before relying on uniqueness.
+    int next_bucket_id() { return ++max_bucket_id; };
+
+    int remove_bucket(const DoutPrefixProvider *dpp, const RGWBucketInfo info);
+    int list_buckets(const DoutPrefixProvider *dpp, const std::string& query_str,
+        rgw_user& user,
+        const std::string& marker,
+        const std::string& end_marker,
+        uint64_t max,
+        bool need_stats,
+        RGWUserBuckets *buckets,
+        bool *is_truncated);
+    int update_bucket(const DoutPrefixProvider *dpp, const std::string& query_str,
+        RGWBucketInfo& info, bool exclusive,
+        const rgw_user* powner_id, std::map<std::string, bufferlist>* pattrs,
+        ceph::real_time* pmtime, RGWObjVersionTracker* pobjv);
+
+    uint64_t get_max_head_size() { return ObjHeadSize; }
+    uint64_t get_max_chunk_size() { return ObjChunkSize; }
+    void gen_rand_obj_instance_name(rgw_obj_key *target_key);
+
+    // db raw obj string is of format -
+    // "<bucketname>_<objname>_<objinstance>_<multipart-part-str>_<partnum>"
+    static constexpr std::string_view raw_obj_oid = "{0}_{1}_{2}_{3}_{4}";
+
+    std::string to_oid(std::string_view bucket, std::string_view obj_name,
+                       std::string_view obj_instance, std::string_view obj_id,
+                       std::string_view mp_str, uint64_t partnum) {
+      return fmt::format(raw_obj_oid, bucket, obj_name, obj_instance, obj_id, mp_str, partnum);
+    }
+    int from_oid(const std::string& oid, std::string& bucket, std::string& obj_name, std::string& obj_id,
+        std::string& obj_instance,
+        std::string& mp_str, uint64_t& partnum) {
+      // TODO: use ceph::split() from common/split.h
+      // XXX: doesn't this break if obj_name has underscores in it?
+      std::vector<std::string> result;
+      boost::split(result, oid, boost::is_any_of("_"));
+      bucket = result[0];
+      obj_name = result[1];
+      obj_instance = result[2];
+      obj_id = result[3];
+      mp_str = result[4];
+      partnum = stoi(result[5]);
+
+      return 0;
+    }
+
+    // Identifies one stored chunk of an object (head or a multipart/tail
+    // part) together with the tables that hold its metadata and data.
+    struct raw_obj {
+      DB* db;
+
+      std::string bucket_name;
+      std::string obj_name;
+      std::string obj_instance;
+      std::string obj_ns;
+      std::string obj_id;
+      std::string multipart_part_str;
+      uint64_t part_num;
+
+      std::string obj_table;
+      std::string obj_data_table;
+
+      raw_obj(DB* _db) {
+        db = _db;
+      }
+
+      raw_obj(DB* _db, std::string& _bname, std::string& _obj_name, std::string& _obj_instance,
+          std::string& _obj_ns, std::string& _obj_id, std::string _mp_part_str, int _part_num) {
+        db = _db;
+        bucket_name = _bname;
+        obj_name = _obj_name;
+        obj_instance = _obj_instance;
+        obj_ns = _obj_ns;
+        obj_id = _obj_id;
+        multipart_part_str = _mp_part_str;
+        part_num = _part_num;
+
+        // NOTE(review): these use a "." separated naming scheme, unlike
+        // getObjectTable()/getObjectDataTable() below -- confirm intended.
+        obj_table = bucket_name+".object.table";
+        obj_data_table = bucket_name+".objectdata.table";
+      }
+
+      // Parse an oid produced by DB::to_oid(); on parse failure fall back
+      // to the default multipart part ("0.0", part 0).
+      raw_obj(DB* _db, std::string& oid) {
+        int r;
+
+        db = _db;
+        r = db->from_oid(oid, bucket_name, obj_name, obj_instance, obj_id, multipart_part_str,
+            part_num);
+        if (r < 0) {
+          multipart_part_str = "0.0";
+          part_num = 0;
+        }
+
+        obj_table = db->getObjectTable(bucket_name);
+        obj_data_table = db->getObjectDataTable(bucket_name);
+      }
+
+      int InitializeParamsfromRawObj (const DoutPrefixProvider *dpp, DBOpParams* params);
+
+      // Read/write this chunk's data in [ofs, end]/len into/from 'bl'.
+      int read(const DoutPrefixProvider *dpp, int64_t ofs, uint64_t end, bufferlist& bl);
+      int write(const DoutPrefixProvider *dpp, int64_t ofs, int64_t write_ofs, uint64_t len, bufferlist& bl);
+    };
+
+    // Background garbage-collection thread for the store.
+    class GC : public Thread {
+      const DoutPrefixProvider *dpp;
+      DB *db;
+      /* Default GC time intervals.
+       * XXX: make the options below configurable.
+       *
+       * gc_interval: time between successive gc thread runs
+       * gc_obj_min_wait: minimum time to wait after creation before any
+       *                  data may be deleted
+       */
+      std::mutex mtx;
+      std::condition_variable cv;
+      bool stop_signalled = false;
+      uint32_t gc_interval = 24*60*60; //sec ; default: 24*60*60
+      uint32_t gc_obj_min_wait = 60*60; //60*60sec default
+      std::string bucket_marker;
+      std::string user_marker;
+
+    public:
+      GC(const DoutPrefixProvider *_dpp, DB* _db) :
+            dpp(_dpp), db(_db) {}
+
+      // Thread body (see Thread::entry).
+      void *entry() override;
+
+      // Request shutdown: set the flag under 'mtx' and wake the thread.
+      void signal_stop() {
+       std::lock_guard<std::mutex> lk_guard(mtx);
+       stop_signalled = true;
+       cv.notify_one();
+      }
+
+      friend class DB;
+    };
+    // Owning handle to the GC thread (managed via createGC()/stopGC()).
+    std::unique_ptr<DB::GC> gc_worker;
+
+    // Thin wrapper binding a DB handle to one bucket's RGWBucketInfo.
+    class Bucket {
+      friend class DB;
+      DB* store;
+
+      RGWBucketInfo bucket_info;
+
+      public:
+        Bucket(DB *_store, const RGWBucketInfo& _binfo) : store(_store), bucket_info(_binfo) {}
+        DB *get_store() { return store; }
+        rgw_bucket& get_bucket() { return bucket_info.bucket; }
+        RGWBucketInfo& get_bucket_info() { return bucket_info; }
+
+      // Paginated object listing within the bucket.
+      class List {
+      protected:
+        // absolute maximum number of objects that
+        // list_objects_(un)ordered can return
+        static constexpr int64_t bucket_list_objects_absolute_max = 25000;
+
+        DB::Bucket *target;
+        rgw_obj_key next_marker;
+
+      public:
+
+        // Listing filters/markers; defaults enforce the namespace and
+        // return only current versions, in order.
+        struct Params {
+          std::string prefix;
+          std::string delim;
+          rgw_obj_key marker;
+          rgw_obj_key end_marker;
+          std::string ns;
+          bool enforce_ns;
+          RGWAccessListFilter* access_list_filter;
+          RGWBucketListNameFilter force_check_filter;
+          bool list_versions;
+         bool allow_unordered;
+
+          Params() :
+               enforce_ns(true),
+               access_list_filter(nullptr),
+               list_versions(false),
+               allow_unordered(false)
+               {}
+        } params;
+
+        explicit List(DB::Bucket *_target) : target(_target) {}
+
+        /* XXX: Handle ordered and unordered separately.
+         * For now returning only ordered entries */
+        int list_objects(const DoutPrefixProvider *dpp, int64_t max,
+                          std::vector<rgw_bucket_dir_entry> *result,
+                          std::map<std::string, bool> *common_prefixes, bool *is_truncated);
+        // Resume point for the next page of results.
+        rgw_obj_key& get_next_marker() {
+          return next_marker;
+        }
+      };
+    };
+
+    // Per-object handle: binds a DB, the owning bucket's info and one
+    // rgw_obj, and exposes Read/Write/Delete sub-operations on it.
+    class Object {
+      friend class DB;
+      DB* store;
+
+      RGWBucketInfo bucket_info;
+      rgw_obj obj;
+
+      RGWObjState obj_state;
+      std::string obj_id;
+
+      bool versioning_disabled;
+
+      bool bs_initialized;
+
+      public:
+      Object(DB *_store, const RGWBucketInfo& _bucket_info, const rgw_obj& _obj) : store(_store), bucket_info(_bucket_info),
+      obj(_obj),
+      versioning_disabled(false),
+      bs_initialized(false) {}
+
+      // NOTE(review): this overload leaves versioning_disabled and
+      // bs_initialized uninitialized -- confirm callers don't read them.
+      Object(DB *_store, const RGWBucketInfo& _bucket_info, const rgw_obj& _obj, const std::string& _obj_id) : store(_store), bucket_info(_bucket_info), obj(_obj), obj_id(_obj_id) {}
+
+      // Read operation: prepare() loads state, then read()/iterate()
+      // stream the payload.
+      struct Read {
+        DB::Object *source;
+
+        struct GetObjState {
+          rgw_obj obj;
+        } state;
+
+        // Conditional-GET parameters (If-(Un)Modified/If-(None-)Match).
+        struct ConditionParams {
+          const ceph::real_time *mod_ptr;
+          const ceph::real_time *unmod_ptr;
+          bool high_precision_time;
+          uint32_t mod_zone_id;
+          uint64_t mod_pg_ver;
+          const char *if_match;
+          const char *if_nomatch;
+
+          ConditionParams() :
+            mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0),
+            if_match(NULL), if_nomatch(NULL) {}
+        } conds;
+
+        // Optional out-params filled by prepare() (may stay nullptr).
+        struct Params {
+          ceph::real_time *lastmod;
+          uint64_t *obj_size;
+         std::map<std::string, bufferlist> *attrs;
+          rgw_obj *target_obj;
+
+          Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr),
+          target_obj(nullptr) {}
+        } params;
+
+        explicit Read(DB::Object *_source) : source(_source) {}
+
+        int prepare(const DoutPrefixProvider *dpp);
+        // Clamp a requested byte range against obj_size.
+        static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
+        int read(int64_t ofs, int64_t end, bufferlist& bl, const DoutPrefixProvider *dpp);
+        int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb);
+        int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest);
+      };
+
+      // Write operation: write_data() stores chunks, write_meta()
+      // finalizes head metadata/attrs.
+      struct Write {
+        DB::Object *target;
+        RGWObjState obj_state;
+        std::string mp_part_str = "0.0"; // multipart num
+
+        struct MetaParams {
+          ceph::real_time *mtime;
+         std::map<std::string, bufferlist>* rmattrs;
+          const bufferlist *data;
+          RGWObjManifest *manifest;
+          const std::string *ptag;
+          std::list<rgw_obj_index_key> *remove_objs;
+          ceph::real_time set_mtime;
+          rgw_user owner;
+          RGWObjCategory category;
+          int flags;
+          const char *if_match;
+          const char *if_nomatch;
+          std::optional<uint64_t> olh_epoch;
+          ceph::real_time delete_at;
+          bool canceled;
+          const std::string *user_data;
+          rgw_zone_set *zones_trace;
+          bool modify_tail;
+          bool completeMultipart;
+          bool appendable;
+
+          MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
+          remove_objs(NULL), category(RGWObjCategory::Main), flags(0),
+          if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr),
+          modify_tail(false),  completeMultipart(false), appendable(false) {}
+        } meta;
+
+        explicit Write(DB::Object *_target) : target(_target) {}
+
+        void set_mp_part_str(std::string _mp_part_str) { mp_part_str = _mp_part_str;}
+        int prepare(const DoutPrefixProvider* dpp);
+        int write_data(const DoutPrefixProvider* dpp,
+                               bufferlist& data, uint64_t ofs);
+        int _do_write_meta(const DoutPrefixProvider *dpp,
+            uint64_t size, uint64_t accounted_size,
+           std::map<std::string, bufferlist>& attrs,
+            bool assume_noent, bool modify_tail);
+        int write_meta(const DoutPrefixProvider *dpp, uint64_t size,
+           uint64_t accounted_size, std::map<std::string, bufferlist>& attrs);
+      };
+
+      // Delete operation; may create a delete marker on versioned buckets.
+      struct Delete {
+        DB::Object *target;
+
+        struct DeleteParams {
+          rgw_user bucket_owner;
+          int versioning_status;
+          ACLOwner obj_owner; /* needed for creation of deletion marker */
+          uint64_t olh_epoch;
+          std::string marker_version_id;
+          uint32_t bilog_flags;
+          std::list<rgw_obj_index_key> *remove_objs;
+          ceph::real_time expiration_time;
+          ceph::real_time unmod_since;
+          ceph::real_time mtime; /* for setting delete marker mtime */
+          bool high_precision_time;
+          rgw_zone_set *zones_trace;
+          bool abortmp;
+          uint64_t parts_accounted_size;
+
+          DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {}
+        } params;
+
+        struct DeleteResult {
+          bool delete_marker;
+          std::string version_id;
+
+          DeleteResult() : delete_marker(false) {}
+        } result;
+
+        explicit Delete(DB::Object *_target) : target(_target) {}
+
+        int delete_obj(const DoutPrefixProvider *dpp);
+        int delete_obj_impl(const DoutPrefixProvider *dpp, DBOpParams& del_params);
+        // Create a delete marker (versioned-bucket delete path).
+        int create_dm(const DoutPrefixProvider *dpp, DBOpParams& del_params);
+      };
+
+      /* XXX: the parameters may be subject to change. All we need is bucket name
+       * & obj name,instance - keys */
+      int get_object_impl(const DoutPrefixProvider *dpp, DBOpParams& params);
+      int get_obj_state(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+                        const rgw_obj& obj,
+                        bool follow_olh, RGWObjState **state);
+      int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, bool follow_olh);
+      int list_versioned_objects(const DoutPrefixProvider *dpp,
+                                 std::list<rgw_bucket_dir_entry>& list_entries);
+
+      DB *get_store() { return store; }
+      rgw_obj& get_obj() { return obj; }
+      RGWBucketInfo& get_bucket_info() { return bucket_info; }
+
+      int InitializeParamsfromObject(const DoutPrefixProvider *dpp, DBOpParams* params);
+      // Attribute and omap-style accessors on the object row.
+      int set_attrs(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>& setattrs,
+          std::map<std::string, bufferlist>* rmattrs);
+      int transition(const DoutPrefixProvider *dpp,
+                     const rgw_placement_rule& rule, const real_time& mtime,
+                     uint64_t olh_epoch);
+      int obj_omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val, bool must_exist);
+      int obj_omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+          const std::set<std::string>& keys,
+          std::map<std::string, bufferlist>* vals);
+      int obj_omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m);
+      int obj_omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+          std::map<std::string, bufferlist> *m, bool* pmore);
+      // Callback invoked once per chunk by iterate_obj().
+      using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const raw_obj&, off_t, off_t,
+          bool, RGWObjState*, void*);
+      int add_mp_part(const DoutPrefixProvider *dpp, RGWUploadPartInfo info);
+      int get_mp_parts_list(const DoutPrefixProvider *dpp, std::list<RGWUploadPartInfo>& info);
+
+      // Walk the object's chunks in [ofs, end], invoking 'cb' per chunk.
+      int iterate_obj(const DoutPrefixProvider *dpp,
+          const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+          off_t ofs, off_t end, uint64_t max_chunk_size,
+          iterate_obj_cb cb, void *arg);
+    };
+    // Per-chunk callback used with Object::iterate_obj (matches
+    // Object::iterate_obj_cb's signature apart from the bound 'this').
+    int get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+        const raw_obj& read_obj, off_t obj_ofs,
+        off_t len, bool is_head_obj,
+        RGWObjState *astate, void *arg);
+
+    /* Lifecycle (LC) entry/head persistence backing rgw::sal::Lifecycle. */
+    int get_entry(const std::string& oid, const std::string& marker,
+                 std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry);
+    int get_next_entry(const std::string& oid, const std::string& marker,
+                 std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry);
+    int set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry);
+    int list_entries(const std::string& oid, const std::string& marker,
+                          uint32_t max_entries, std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& entries);
+    int rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry);
+    int get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LCHead>* head);
+    int put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head);
+    /* GC helpers: purge stale objects older than min_wait; thread control. */
+    int delete_stale_objs(const DoutPrefixProvider *dpp, const std::string& bucket,
+                          uint32_t min_wait);
+    int createGC(const DoutPrefixProvider *_dpp);
+    int stopGC();
+};
+
+// GET-path context carrying the client data callback and the next byte
+// offset to deliver; presumably passed as the opaque 'arg' of
+// DB::get_obj_iterate_cb -- confirm against the caller.
+struct db_get_obj_data {
+  DB* store;
+  RGWGetDataCB* client_cb = nullptr;
+  uint64_t offset; // next offset to write to client
+
+  db_get_obj_data(DB* db, RGWGetDataCB* cb, uint64_t offset) :
+    store(db), client_cb(cb), offset(offset) {}
+  ~db_get_obj_data() {}
+};
+
+} } // namespace rgw::store
+
+#endif
diff --git a/src/rgw/driver/dbstore/common/dbstore_log.h b/src/rgw/driver/dbstore/common/dbstore_log.h
new file mode 100644 (file)
index 0000000..8d981d5
--- /dev/null
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef DB_STORE_LOG_H
+#define DB_STORE_LOG_H
+
+#include <cerrno>
+#include <cstdlib>
+#include <string>
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include "common/dout.h"
+
+// Tag all dout/ldout output from dbstore translation units with a
+// common "rgw dbstore: " prefix.
+#undef dout_prefix
+#define dout_prefix *_dout << "rgw dbstore: "
+
+#endif
diff --git a/src/rgw/driver/dbstore/config/sqlite.cc b/src/rgw/driver/dbstore/config/sqlite.cc
new file mode 100644 (file)
index 0000000..051dc34
--- /dev/null
@@ -0,0 +1,2072 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <charconv>
+#include <initializer_list>
+#include <map>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include <fmt/format.h>
+
+#include <sqlite3.h>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "common/dout.h"
+#include "common/random_string.h"
+#include "rgw_zone.h"
+
+#include "common/connection_pool.h"
+#include "sqlite/connection.h"
+#include "sqlite/error.h"
+#include "sqlite/statement.h"
+#include "sqlite_schema.h"
+#include "sqlite.h"
+
+#define dout_subsys ceph_subsys_rgw_dbstore
+
+namespace rgw::dbstore::config {
+
+// DoutPrefixPipe that prepends a fixed string (e.g. the operation name)
+// to every log line emitted through the wrapped provider.
+struct Prefix : DoutPrefixPipe {
+  std::string_view prefix;
+  Prefix(const DoutPrefixProvider& dpp, std::string_view prefix)
+      : DoutPrefixPipe(dpp), prefix(prefix) {}
+  unsigned get_subsys() const override { return dout_subsys; }
+  void add_prefix(std::ostream& out) const override {
+    out << prefix;
+  }
+};
+
+namespace {
+
+// parameter names for prepared statement bindings
+static constexpr const char* P1 = ":1";
+static constexpr const char* P2 = ":2";
+static constexpr const char* P3 = ":3";
+static constexpr const char* P4 = ":4";
+static constexpr const char* P5 = ":5";
+static constexpr const char* P6 = ":6";
+
+
+// Read up to entries.size() text rows from 'stmt' into 'result'.
+// A short read means the listing is complete (clear the continuation
+// marker); a full page sets 'next' to the last entry returned.
+void read_text_rows(const DoutPrefixProvider* dpp,
+                    const sqlite::stmt_execution& stmt,
+                    std::span<std::string> entries,
+                    sal::ListResult<std::string>& result)
+{
+  result.entries = sqlite::read_text_rows(dpp, stmt, entries);
+  if (result.entries.size() < entries.size()) { // end of listing
+    result.next.clear();
+  } else {
+    result.next = result.entries.back();
+  }
+}
+
+// One row of the Realms table plus its optimistic-concurrency columns
+// (VersionNumber 'ver' and VersionTag 'tag').
+struct RealmRow {
+  RGWRealm info;
+  int ver;
+  std::string tag;
+};
+
+// Column order: ID, Name, CurrentPeriod, Epoch, VersionNumber, VersionTag.
+void read_realm_row(const sqlite::stmt_execution& stmt, RealmRow& row)
+{
+  row.info.id = sqlite::column_text(stmt, 0);
+  row.info.name = sqlite::column_text(stmt, 1);
+  row.info.current_period = sqlite::column_text(stmt, 2);
+  row.info.epoch = sqlite::column_int(stmt, 3);
+  row.ver = sqlite::column_int(stmt, 4);
+  row.tag = sqlite::column_text(stmt, 5);
+}
+
+// Periods are stored whole in the Data column (index 3); decode the
+// ceph-encoded blob rather than reading individual columns.
+void read_period_row(const sqlite::stmt_execution& stmt, RGWPeriod& row)
+{
+  // just read the Data column and decode everything else from that
+  std::string data = sqlite::column_text(stmt, 3);
+
+  // static_from_string avoids a copy; 'data' outlives the decode below
+  bufferlist bl = bufferlist::static_from_string(data);
+  auto p = bl.cbegin();
+  decode(row, p);
+}
+
+// One row of the ZoneGroups table: decoded info plus version columns.
+struct ZoneGroupRow {
+  RGWZoneGroup info;
+  int ver;
+  std::string tag;
+};
+
+// Data blob at column 3; VersionNumber/VersionTag at columns 4/5.
+void read_zonegroup_row(const sqlite::stmt_execution& stmt, ZoneGroupRow& row)
+{
+  std::string data = sqlite::column_text(stmt, 3);
+  row.ver = sqlite::column_int(stmt, 4);
+  row.tag = sqlite::column_text(stmt, 5);
+
+  bufferlist bl = bufferlist::static_from_string(data);
+  auto p = bl.cbegin();
+  decode(row.info, p);
+}
+
+// One row of the Zones table: decoded params plus version columns.
+struct ZoneRow {
+  RGWZoneParams info;
+  int ver;
+  std::string tag;
+};
+
+// Same layout as zonegroups: Data at column 3, version columns at 4/5.
+void read_zone_row(const sqlite::stmt_execution& stmt, ZoneRow& row)
+{
+  std::string data = sqlite::column_text(stmt, 3);
+  row.ver = sqlite::column_int(stmt, 4);
+  row.tag = sqlite::column_text(stmt, 5);
+
+  bufferlist bl = bufferlist::static_from_string(data);
+  auto p = bl.cbegin();
+  decode(row.info, p);
+}
+
+// Generate a fresh random VersionTag used to detect concurrent writers.
+std::string generate_version_tag(CephContext* cct)
+{
+  static constexpr auto TAG_LEN = 24;
+  return gen_rand_alphanumeric(cct, TAG_LEN);
+}
+
+using SQLiteConnectionHandle = ConnectionHandle<sqlite::Connection>;
+
+using SQLiteConnectionPool = ConnectionPool<
+    sqlite::Connection, sqlite::ConnectionFactory>;
+
+} // anonymous namespace
+
+// Opaque pool of sqlite connections; a named class (rather than the
+// alias) so the header can forward-declare it.
+class SQLiteImpl : public SQLiteConnectionPool {
+ public:
+  using SQLiteConnectionPool::SQLiteConnectionPool;
+};
+
+
+SQLiteConfigStore::SQLiteConfigStore(std::unique_ptr<SQLiteImpl> impl)
+  : impl(std::move(impl))
+{
+}
+
+SQLiteConfigStore::~SQLiteConfigStore() = default;
+
+
+// Realm
+
+// RealmWriter bound to the (ver, tag) of the realm row as it was read.
+// Every statement includes those values in its WHERE clause, so a zero
+// sqlite3_changes() count means another writer won the race; the writer
+// then invalidates itself (impl = nullptr) and returns -ECANCELED.
+class SQLiteRealmWriter : public sal::RealmWriter {
+  SQLiteImpl* impl;
+  int ver;
+  std::string tag;
+  std::string realm_id;
+  std::string realm_name;
+ public:
+  SQLiteRealmWriter(SQLiteImpl* impl, int ver, std::string tag,
+                    std::string_view realm_id, std::string_view realm_name)
+    : impl(impl), ver(ver), tag(std::move(tag)),
+      realm_id(realm_id), realm_name(realm_name)
+  {}
+
+  // Update the realm row in place; id/name changes are rejected
+  // (rename() is the only sanctioned way to change the name).
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWRealm& info) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:realm_write "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after a conflict or delete
+    }
+    if (realm_id != info.id || realm_name != info.name) {
+      return -EINVAL; // can't modify realm id or name directly
+    }
+
+    try {
+      auto conn = impl->get(dpp);
+      // prepared statements are cached per-connection by name
+      auto& stmt = conn->statements["realm_upd"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::realm_update5,
+                                            P1, P2, P3, P4, P5);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, info.id);
+      sqlite::bind_text(dpp, binding, P2, info.current_period);
+      sqlite::bind_int(dpp, binding, P3, info.epoch);
+      sqlite::bind_int(dpp, binding, P4, ver);
+      sqlite::bind_text(dpp, binding, P5, tag);
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+        // our version is no longer consistent, so later writes would fail too
+        impl = nullptr;
+        return -ECANCELED;
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "realm update failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::foreign_key_constraint) {
+        return -EINVAL; // refers to nonexistent CurrentPeriod
+      } else if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    ++ver; // row version advanced; keep tracking it for the next write
+    return 0;
+  }
+
+  // Rename the realm, keeping id fixed; fails with -EEXIST if the new
+  // name is already taken (unique constraint on Name).
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWRealm& info, std::string_view new_name) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:realm_rename "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after conflict or delete
+    }
+    if (realm_id != info.id || realm_name != info.name) {
+      return -EINVAL; // can't modify realm id or name directly
+    }
+    if (new_name.empty()) {
+      ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
+      return -EINVAL;
+    }
+
+    try {
+      auto conn = impl->get(dpp);
+      auto& stmt = conn->statements["realm_rename"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::realm_rename4,
+                                            P1, P2, P3, P4);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, realm_id);
+      sqlite::bind_text(dpp, binding, P2, new_name);
+      sqlite::bind_int(dpp, binding, P3, ver);
+      sqlite::bind_text(dpp, binding, P4, tag);
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+        impl = nullptr;
+        return -ECANCELED;
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "realm rename failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::unique_constraint) {
+        return -EEXIST; // Name already taken
+      } else if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    // only update caller-visible state after the db write succeeds
+    info.name = std::string{new_name};
+    ++ver;
+    return 0;
+  }
+
+  // Delete the realm row; the writer becomes unusable afterwards even
+  // if the delete raced (impl is cleared before the changes() check).
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:realm_remove "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after conflict or delete
+    }
+    try {
+      auto conn = impl->get(dpp);
+      auto& stmt = conn->statements["realm_del"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::realm_delete3, P1, P2, P3);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, realm_id);
+      sqlite::bind_int(dpp, binding, P2, ver);
+      sqlite::bind_text(dpp, binding, P3, tag);
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      impl = nullptr; // prevent any further writes after delete
+      if (!::sqlite3_changes(conn->db.get())) {
+        return -ECANCELED; // VersionNumber/Tag mismatch
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "realm delete failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    return 0;
+  }
+}; // SQLiteRealmWriter
+
+
+// Store the cluster's default realm id.  'exclusive' selects INSERT
+// (fails with -EEXIST if a default already exists) vs UPSERT semantics.
+int SQLiteConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
+                                              optional_yield y, bool exclusive,
+                                              std::string_view realm_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:write_default_realm_id "}; dpp = &prefix;
+
+  if (realm_id.empty()) {
+    ldpp_dout(dpp, 0) << "requires a realm id" << dendl;
+    return -EINVAL;
+  }
+
+  try {
+    auto conn = impl->get(dpp);
+    // two distinct cached statements: insert-only vs upsert
+    sqlite::stmt_ptr* stmt = nullptr;
+    if (exclusive) {
+      stmt = &conn->statements["def_realm_ins"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::default_realm_insert1, P1);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    } else {
+      stmt = &conn->statements["def_realm_ups"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::default_realm_upsert1, P1);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    }
+    auto binding = sqlite::stmt_binding{stmt->get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+
+    auto reset = sqlite::stmt_execution{stmt->get()};
+    sqlite::eval0(dpp, reset);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default realm insert failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::primary_key_constraint) {
+      return -EEXIST;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+// Fetch the stored default realm id.  eval1() expects exactly one row;
+// NOTE(review): presumably it surfaces "no row" as a sqlite::error that
+// maps to -EIO here rather than -ENOENT -- confirm eval1 semantics.
+int SQLiteConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string& realm_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_realm_id "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["def_realm_sel"];
+    if (!stmt) {
+      static constexpr std::string_view sql = schema::default_realm_select0;
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset);
+
+    realm_id = sqlite::column_text(reset, 0);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default realm select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+int SQLiteConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
+                                               optional_yield y)
+
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_realm_id "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["def_realm_del"];
+    if (!stmt) {
+      static constexpr std::string_view sql = schema::default_realm_delete0;
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval0(dpp, reset);
+
+    if (!::sqlite3_changes(conn->db.get())) {
+      return -ENOENT;
+    }
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default realm delete failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+
+// Insert (exclusive=true) or upsert a realm record; on success, optionally
+// return a RealmWriter primed with the new row's version/tag so the caller
+// can perform further updates under optimistic concurrency control.
+int SQLiteConfigStore::create_realm(const DoutPrefixProvider* dpp,
+                                    optional_yield y, bool exclusive,
+                                    const RGWRealm& info,
+                                    std::unique_ptr<sal::RealmWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:create_realm "}; dpp = &prefix;
+
+  if (info.id.empty()) {
+    ldpp_dout(dpp, 0) << "realm cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.name.empty()) {
+    ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
+    return -EINVAL;
+  }
+
+  // new rows start at version 1 with a fresh random tag for optimistic locking
+  int ver = 1;
+  auto tag = generate_version_tag(dpp->get_cct());
+
+  try {
+    auto conn = impl->get(dpp);
+    // each variant (insert vs upsert) is prepared once per connection and
+    // cached under its own key
+    sqlite::stmt_ptr* stmt = nullptr;
+    if (exclusive) {
+      stmt = &conn->statements["realm_ins"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::realm_insert4, P1);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    } else {
+      stmt = &conn->statements["realm_ups"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::realm_upsert1, P1);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    }
+    auto binding = sqlite::stmt_binding{stmt->get()};
+    sqlite::bind_text(dpp, binding, P1, info.id);
+    sqlite::bind_text(dpp, binding, P2, info.name);
+    sqlite::bind_int(dpp, binding, P3, ver);
+    sqlite::bind_text(dpp, binding, P4, tag);
+
+    auto reset = sqlite::stmt_execution{stmt->get()};
+    sqlite::eval0(dpp, reset); // statement produces no result rows
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "realm insert failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::primary_key_constraint) {
+      return -EEXIST; // ID already taken
+    } else if (e.code() == sqlite::errc::unique_constraint) {
+      return -EEXIST; // Name already taken
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<SQLiteRealmWriter>(
+        impl.get(), ver, std::move(tag), info.id, info.name);
+  }
+  return 0;
+}
+
+// Load a realm by ID into 'info'. When 'writer' is non-null, also return a
+// RealmWriter carrying the row's version/tag for optimistic-concurrency
+// updates. Returns -ENOENT when no realm with that id exists.
+int SQLiteConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
+                                        optional_yield y,
+                                        std::string_view realm_id,
+                                        RGWRealm& info,
+                                        std::unique_ptr<sal::RealmWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_by_id "}; dpp = &prefix;
+
+  if (realm_id.empty()) {
+    ldpp_dout(dpp, 0) << "requires a realm id" << dendl;
+    return -EINVAL;
+  }
+
+  RealmRow row;
+  try {
+    auto conn = impl->get(dpp);
+    // prepare-once-per-connection, cached under "realm_sel_id"
+    auto& stmt = conn->statements["realm_sel_id"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::realm_select_id1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset); // requires one row; throws errc::done if absent
+
+    read_realm_row(reset, row);
+  } catch (const buffer::error& e) {
+    // row existed but its stored encoding could not be decoded
+    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
+    return -EIO;
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  info = std::move(row.info);
+  if (writer) {
+    *writer = std::make_unique<SQLiteRealmWriter>(
+        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+  }
+  return 0;
+}
+
+static void realm_select_by_name(const DoutPrefixProvider* dpp,
+                                 sqlite::Connection& conn,
+                                 std::string_view realm_name,
+                                 RealmRow& row)
+{
+  auto& stmt = conn.statements["realm_sel_name"];
+  if (!stmt) {
+    const std::string sql = fmt::format(schema::realm_select_name1, P1);
+    stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql);
+  }
+  auto binding = sqlite::stmt_binding{stmt.get()};
+  sqlite::bind_text(dpp, binding, P1, realm_name);
+
+  auto reset = sqlite::stmt_execution{stmt.get()};
+  sqlite::eval1(dpp, reset);
+
+  read_realm_row(reset, row);
+}
+
+int SQLiteConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
+                                          optional_yield y,
+                                          std::string_view realm_name,
+                                          RGWRealm& info,
+                                          std::unique_ptr<sal::RealmWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_by_name "}; dpp = &prefix;
+
+  if (realm_name.empty()) {
+    ldpp_dout(dpp, 0) << "requires a realm name" << dendl;
+    return -EINVAL;
+  }
+
+  RealmRow row;
+  try {
+    auto conn = impl->get(dpp);
+    realm_select_by_name(dpp, *conn, realm_name, row);
+  } catch (const buffer::error& e) {
+    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
+    return -EIO;
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  info = std::move(row.info);
+  if (writer) {
+    *writer = std::make_unique<SQLiteRealmWriter>(
+        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+  }
+  return 0;
+}
+
+// Load the cluster's default realm (the realm named by the default-realm
+// row). Optionally returns a RealmWriter for optimistic updates.
+int SQLiteConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
+                                          optional_yield y,
+                                          RGWRealm& info,
+                                          std::unique_ptr<sal::RealmWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_realm "}; dpp = &prefix;
+
+  RealmRow row;
+  try {
+    auto conn = impl->get(dpp);
+    // prepare-once-per-connection; no parameters to bind for this query
+    auto& stmt = conn->statements["realm_sel_def"];
+    if (!stmt) {
+      static constexpr std::string_view sql = schema::realm_select_default0;
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset); // requires one row; throws errc::done if absent
+
+    read_realm_row(reset, row);
+  } catch (const buffer::error& e) {
+    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
+    return -EIO;
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT; // no default realm configured
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  info = std::move(row.info);
+  if (writer) {
+    *writer = std::make_unique<SQLiteRealmWriter>(
+        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+  }
+  return 0;
+}
+
+int SQLiteConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view realm_name,
+                                     std::string& realm_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_id "}; dpp = &prefix;
+
+  if (realm_name.empty()) {
+    ldpp_dout(dpp, 0) << "requires a realm name" << dendl;
+    return -EINVAL;
+  }
+
+  try {
+    auto conn = impl->get(dpp);
+
+    RealmRow row;
+    realm_select_by_name(dpp, *conn, realm_name, row);
+
+    realm_id = std::move(row.info.id);
+  } catch (const buffer::error& e) {
+    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
+    return -EIO;
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Period-commit notifications are not supported by the sqlite config store;
+// callers must treat -ENOTSUP as "no peers to notify".
+int SQLiteConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
+                                               optional_yield y,
+                                               const RGWPeriod& period)
+{
+  return -ENOTSUP;
+}
+
+// Page through realm names: fills 'entries' with up to entries.size() names
+// following 'marker'; read_text_rows() records the page in 'result'.
+int SQLiteConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
+                                        optional_yield y, const std::string& marker,
+                                        std::span<std::string> entries,
+                                        sal::ListResult<std::string>& result)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:list_realm_names "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["realm_sel_names"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::realm_select_names2, P1, P2);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, marker);   // resume after this name
+    sqlite::bind_int(dpp, binding, P2, entries.size()); // page-size limit
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    read_text_rows(dpp, reset, entries, result);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+
+// Period
+
+// Insert (exclusive=true) or upsert one period epoch. The full period is
+// stored as a ceph-encoded blob alongside its id, epoch, and realm id.
+int SQLiteConfigStore::create_period(const DoutPrefixProvider* dpp,
+                                     optional_yield y, bool exclusive,
+                                     const RGWPeriod& info)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:create_period "}; dpp = &prefix;
+
+  if (info.id.empty()) {
+    ldpp_dout(dpp, 0) << "period cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+
+  // serialize the period; 'data' is a raw byte view over the bufferlist,
+  // which must stay alive until the statement has executed
+  bufferlist bl;
+  encode(info, bl);
+  const auto data = std::string_view{bl.c_str(), bl.length()};
+
+  try {
+    auto conn = impl->get(dpp);
+    // each variant (insert vs upsert) is prepared once per connection
+    sqlite::stmt_ptr* stmt = nullptr;
+    if (exclusive) {
+      stmt = &conn->statements["period_ins"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::period_insert4,
+                                            P1, P2, P3, P4);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    } else {
+      stmt = &conn->statements["period_ups"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::period_upsert4,
+                                            P1, P2, P3, P4);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    }
+    auto binding = sqlite::stmt_binding{stmt->get()};
+    sqlite::bind_text(dpp, binding, P1, info.id);
+    sqlite::bind_int(dpp, binding, P2, info.epoch);
+    sqlite::bind_text(dpp, binding, P3, info.realm_id);
+    sqlite::bind_text(dpp, binding, P4, data);
+
+    auto reset = sqlite::stmt_execution{stmt->get()};
+    sqlite::eval0(dpp, reset); // statement produces no result rows
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "period insert failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::foreign_key_constraint) {
+      return -EINVAL; // refers to nonexistent RealmID
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+static void period_select_epoch(const DoutPrefixProvider* dpp,
+                                sqlite::Connection& conn,
+                                std::string_view id, uint32_t epoch,
+                                RGWPeriod& row)
+{
+  auto& stmt = conn.statements["period_sel_epoch"];
+  if (!stmt) {
+    const std::string sql = fmt::format(schema::period_select_epoch2, P1, P2);
+    stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql);
+  }
+  auto binding = sqlite::stmt_binding{stmt.get()};
+  sqlite::bind_text(dpp, binding, P1, id);
+  sqlite::bind_int(dpp, binding, P2, epoch);
+
+  auto reset = sqlite::stmt_execution{stmt.get()};
+  sqlite::eval1(dpp, reset);
+
+  read_period_row(reset, row);
+}
+
+static void period_select_latest(const DoutPrefixProvider* dpp,
+                                 sqlite::Connection& conn,
+                                 std::string_view id, RGWPeriod& row)
+{
+  auto& stmt = conn.statements["period_sel_latest"];
+  if (!stmt) {
+    const std::string sql = fmt::format(schema::period_select_latest1, P1);
+    stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql);
+  }
+  auto binding = sqlite::stmt_binding{stmt.get()};
+  sqlite::bind_text(dpp, binding, P1, id);
+
+  auto reset = sqlite::stmt_execution{stmt.get()};
+  sqlite::eval1(dpp, reset);
+
+  read_period_row(reset, row);
+}
+
+int SQLiteConfigStore::read_period(const DoutPrefixProvider* dpp,
+                                   optional_yield y,
+                                   std::string_view period_id,
+                                   std::optional<uint32_t> epoch,
+                                   RGWPeriod& info)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_period "}; dpp = &prefix;
+
+  if (period_id.empty()) {
+    ldpp_dout(dpp, 0) << "requires a period id" << dendl;
+    return -EINVAL;
+  }
+
+  try {
+    auto conn = impl->get(dpp);
+    if (epoch) {
+      period_select_epoch(dpp, *conn, period_id, *epoch, info);
+    } else {
+      period_select_latest(dpp, *conn, period_id, info);
+    }
+  } catch (const buffer::error& e) {
+    ldpp_dout(dpp, 20) << "period decode failed: " << e.what() << dendl;
+    return -EIO;
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "period select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+// Delete a period by id. Returns -ENOENT when no rows matched.
+// NOTE(review): only the period id is bound (P1), so this presumably removes
+// every stored epoch of the period — confirm against schema::period_delete1.
+int SQLiteConfigStore::delete_period(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view period_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:delete_period "}; dpp = &prefix;
+
+  if (period_id.empty()) {
+    ldpp_dout(dpp, 0) << "requires a period id" << dendl;
+    return -EINVAL;
+  }
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["period_del"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::period_delete1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, period_id);
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval0(dpp, reset);
+
+    // zero affected rows means the period did not exist
+    if (!::sqlite3_changes(conn->db.get())) {
+      return -ENOENT;
+    }
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "period delete failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+int SQLiteConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
+                                       optional_yield y,
+                                       const std::string& marker,
+                                       std::span<std::string> entries,
+                                       sal::ListResult<std::string>& result)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:list_period_ids "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["period_sel_ids"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::period_select_ids2, P1, P2);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, marker);
+    sqlite::bind_int(dpp, binding, P2, entries.size());
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    read_text_rows(dpp, reset, entries, result);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "period select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+
+// ZoneGroup
+
+// Writer handle returned from create_zonegroup()/read_zonegroup_*(). It
+// captures the row's VersionNumber/VersionTag so each mutation acts as an
+// optimistic compare-and-swap: the UPDATE/DELETE only matches when the
+// stored version/tag still equal the captured values. On a version conflict
+// (or after remove()) 'impl' is cleared and the writer refuses further use.
+class SQLiteZoneGroupWriter : public sal::ZoneGroupWriter {
+  SQLiteImpl* impl;            // connection source; null once invalidated
+  int ver;                     // VersionNumber captured at read/create time
+  std::string tag;             // VersionTag captured at read/create time
+  std::string zonegroup_id;    // identity pinned at construction; cannot change
+  std::string zonegroup_name;
+ public:
+  SQLiteZoneGroupWriter(SQLiteImpl* impl, int ver, std::string tag,
+                        std::string_view zonegroup_id,
+                        std::string_view zonegroup_name)
+    : impl(impl), ver(ver), tag(std::move(tag)),
+      zonegroup_id(zonegroup_id), zonegroup_name(zonegroup_name)
+  {}
+
+  // Overwrite the zonegroup's stored data blob. Fails with -ECANCELED if the
+  // row's version/tag no longer match (someone else wrote first).
+  // NOTE(review): 'ver'/'tag' are not refreshed after a successful update;
+  // if schema::zonegroup_update5 bumps VersionNumber, a second write()
+  // through the same writer would mismatch and return -ECANCELED — confirm
+  // whether writers are intended to be single-shot.
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWZoneGroup& info) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_write "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after conflict or delete
+    }
+    if (zonegroup_id != info.id || zonegroup_name != info.name) {
+      return -EINVAL; // can't modify zonegroup id or name directly
+    }
+
+    // serialize the zonegroup; 'data' views the bufferlist's bytes and must
+    // outlive statement execution
+    bufferlist bl;
+    encode(info, bl);
+    const auto data = std::string_view{bl.c_str(), bl.length()};
+
+    try {
+      auto conn = impl->get(dpp);
+      auto& stmt = conn->statements["zonegroup_upd"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::zonegroup_update5,
+                                            P1, P2, P3, P4, P5);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, info.id);
+      sqlite::bind_text(dpp, binding, P2, info.realm_id);
+      sqlite::bind_text(dpp, binding, P3, data);
+      sqlite::bind_int(dpp, binding, P4, ver);   // optimistic-lock predicate
+      sqlite::bind_text(dpp, binding, P5, tag);  // optimistic-lock predicate
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+        impl = nullptr;
+        return -ECANCELED;
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "zonegroup update failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::foreign_key_constraint) {
+        return -EINVAL; // refers to nonexistent RealmID
+      } else if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    return 0;
+  }
+
+  // Rename the zonegroup under the same optimistic-lock predicate. On
+  // success, also updates info.name so the caller's copy stays consistent.
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWZoneGroup& info, std::string_view new_name) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_rename "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after conflict or delete
+    }
+    if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
+      return -EINVAL; // can't modify zonegroup id or name directly
+    }
+    if (new_name.empty()) {
+      ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+      return -EINVAL;
+    }
+
+    try {
+      auto conn = impl->get(dpp);
+      auto& stmt = conn->statements["zonegroup_rename"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::zonegroup_rename4,
+                                            P1, P2, P3, P4);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, info.id);
+      sqlite::bind_text(dpp, binding, P2, new_name);
+      sqlite::bind_int(dpp, binding, P3, ver);   // optimistic-lock predicate
+      sqlite::bind_text(dpp, binding, P4, tag);  // optimistic-lock predicate
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+        impl = nullptr;
+        return -ECANCELED;
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "zonegroup rename failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::unique_constraint) {
+        return -EEXIST; // Name already taken
+      } else if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    info.name = std::string{new_name};
+    return 0;
+  }
+
+  // Delete the zonegroup row under the optimistic-lock predicate. The writer
+  // is invalidated whether or not the delete matched.
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_remove "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after conflict or delete
+    }
+    try {
+      auto conn = impl->get(dpp);
+      auto& stmt = conn->statements["zonegroup_del"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::zonegroup_delete3,
+                                            P1, P2, P3);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, zonegroup_id);
+      sqlite::bind_int(dpp, binding, P2, ver);
+      sqlite::bind_text(dpp, binding, P3, tag);
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      // invalidate before checking: the writer is dead either way
+      impl = nullptr;
+      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+        return -ECANCELED;
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "zonegroup delete failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    return 0;
+  }
+}; // SQLiteZoneGroupWriter
+
+
+// Record the default zonegroup id for a realm: INSERT when exclusive=true
+// (fails if a default already exists), UPSERT otherwise.
+// NOTE(review): realm_id is not validated non-empty here (cf.
+// write_default_realm_id) — an empty realm_id may be intentional for the
+// no-realm default; confirm.
+int SQLiteConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                  optional_yield y, bool exclusive,
+                                                  std::string_view realm_id,
+                                                  std::string_view zonegroup_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:write_default_zonegroup_id "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    // each variant is prepared once per connection and cached by key
+    sqlite::stmt_ptr* stmt = nullptr;
+    if (exclusive) {
+      stmt = &conn->statements["def_zonegroup_ins"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::default_zonegroup_insert2,
+                                            P1, P2);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    } else {
+      stmt = &conn->statements["def_zonegroup_ups"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::default_zonegroup_upsert2,
+                                            P1, P2);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    }
+    auto binding = sqlite::stmt_binding{stmt->get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+    sqlite::bind_text(dpp, binding, P2, zonegroup_id);
+
+    auto reset = sqlite::stmt_execution{stmt->get()};
+    sqlite::eval0(dpp, reset);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default zonegroup insert failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+int SQLiteConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                 optional_yield y,
+                                                 std::string_view realm_id,
+                                                 std::string& zonegroup_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zonegroup_id "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["def_zonegroup_sel"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::default_zonegroup_select1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset);
+
+    zonegroup_id = sqlite::column_text(reset, 0);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default zonegroup select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+int SQLiteConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                   optional_yield y,
+                                                   std::string_view realm_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_zonegroup_id "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["def_zonegroup_del"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::default_zonegroup_delete1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval0(dpp, reset);
+
+    if (!::sqlite3_changes(conn->db.get())) {
+      return -ENOENT;
+    }
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default zonegroup delete failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+
+// Insert (exclusive=true) or upsert a zonegroup record; on success,
+// optionally return a ZoneGroupWriter primed with the new row's version/tag
+// for optimistic-concurrency updates.
+int SQLiteConfigStore::create_zonegroup(const DoutPrefixProvider* dpp,
+                                        optional_yield y, bool exclusive,
+                                        const RGWZoneGroup& info,
+                                        std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:create_zonegroup "}; dpp = &prefix;
+
+  if (info.id.empty()) {
+    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.name.empty()) {
+    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+    return -EINVAL;
+  }
+
+  // new rows start at version 1 with a fresh random tag for optimistic locking
+  int ver = 1;
+  auto tag = generate_version_tag(dpp->get_cct());
+
+  // serialize the zonegroup; 'data' views the bufferlist's bytes and must
+  // outlive statement execution
+  bufferlist bl;
+  encode(info, bl);
+  const auto data = std::string_view{bl.c_str(), bl.length()};
+
+  try {
+    auto conn = impl->get(dpp);
+    // each variant (insert vs upsert) is prepared once per connection
+    sqlite::stmt_ptr* stmt = nullptr;
+    if (exclusive) {
+      stmt = &conn->statements["zonegroup_ins"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::zonegroup_insert6,
+                                            P1, P2, P3, P4, P5, P6);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    } else {
+      stmt = &conn->statements["zonegroup_ups"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::zonegroup_upsert6,
+                                            P1, P2, P3, P4, P5, P6);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    }
+    auto binding = sqlite::stmt_binding{stmt->get()};
+    sqlite::bind_text(dpp, binding, P1, info.id);
+    sqlite::bind_text(dpp, binding, P2, info.name);
+    sqlite::bind_text(dpp, binding, P3, info.realm_id);
+    sqlite::bind_text(dpp, binding, P4, data);
+    sqlite::bind_int(dpp, binding, P5, ver);
+    sqlite::bind_text(dpp, binding, P6, tag);
+
+    auto reset = sqlite::stmt_execution{stmt->get()};
+    sqlite::eval0(dpp, reset); // statement produces no result rows
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "zonegroup insert failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::foreign_key_constraint) {
+      return -EINVAL; // refers to nonexistent RealmID
+    } else if (e.code() == sqlite::errc::primary_key_constraint) {
+      return -EEXIST; // ID already taken
+    } else if (e.code() == sqlite::errc::unique_constraint) {
+      return -EEXIST; // Name already taken
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<SQLiteZoneGroupWriter>(
+        impl.get(), ver, std::move(tag), info.id, info.name);
+  }
+  return 0;
+}
+
+// Load a zonegroup by ID into 'info'. When 'writer' is non-null, also
+// return a ZoneGroupWriter carrying the row's version/tag for optimistic
+// updates. Returns -ENOENT when no zonegroup with that id exists.
+int SQLiteConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            std::string_view zonegroup_id,
+                                            RGWZoneGroup& info,
+                                            std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_zonegroup_by_id "}; dpp = &prefix;
+
+  if (zonegroup_id.empty()) {
+    ldpp_dout(dpp, 0) << "requires a zonegroup id" << dendl;
+    return -EINVAL;
+  }
+
+  ZoneGroupRow row;
+  try {
+    auto conn = impl->get(dpp);
+    // prepare-once-per-connection, cached under "zonegroup_sel_id"
+    auto& stmt = conn->statements["zonegroup_sel_id"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::zonegroup_select_id1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, zonegroup_id);
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset); // requires one row; throws errc::done if absent
+
+    read_zonegroup_row(reset, row);
+  } catch (const buffer::error& e) {
+    // row existed but its stored encoding could not be decoded
+    ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl;
+    return -EIO;
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  info = std::move(row.info);
+  if (writer) {
+    *writer = std::make_unique<SQLiteZoneGroupWriter>(
+        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+  }
+  return 0;
+}
+
+// Look up a zonegroup by its unique Name column and decode it into 'info'.
+// If 'writer' is non-null, also returns a writer handle primed with the row's
+// VersionNumber/VersionTag for optimistic-concurrency updates.
+// Errors: -EINVAL empty name, -ENOENT no matching row, -EBUSY database
+// locked, -EIO on sql or decode failure.
+int SQLiteConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+                                              optional_yield y,
+                                              std::string_view zonegroup_name,
+                                              RGWZoneGroup& info,
+                                              std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_zonegroup_by_name "}; dpp = &prefix;
+
+  if (zonegroup_name.empty()) {
+    ldpp_dout(dpp, 0) << "requires a zonegroup name" << dendl;
+    return -EINVAL;
+  }
+
+  ZoneGroupRow row;
+  try {
+    auto conn = impl->get(dpp);
+    // prepared statements are cached per-connection under a fixed key and
+    // compiled lazily on first use
+    auto& stmt = conn->statements["zonegroup_sel_name"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::zonegroup_select_name1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, zonegroup_name);
+
+    // eval1 expects exactly one result row; "no row" surfaces as
+    // errc::done and is mapped to -ENOENT below
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset);
+
+    read_zonegroup_row(reset, row);
+  } catch (const buffer::error& e) {
+    ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl;
+    return -EIO;
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  info = std::move(row.info);
+  if (writer) {
+    *writer = std::make_unique<SQLiteZoneGroupWriter>(
+        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+  }
+  return 0;
+}
+
+// Read the default zonegroup and decode it into 'info'. Errors: -ENOENT if
+// no default is set, -EBUSY database locked, -EIO on sql or decode failure.
+// NOTE(review): the query takes no parameters, so 'realm_id' is not consulted
+// here — the default is selected globally; confirm this is intended.
+int SQLiteConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp,
+                                              optional_yield y,
+                                              std::string_view realm_id,
+                                              RGWZoneGroup& info,
+                                              std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zonegroup "}; dpp = &prefix;
+
+  ZoneGroupRow row;
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["zonegroup_sel_def"];
+    if (!stmt) {
+      // zero-parameter statement, so no stmt_binding is needed
+      static constexpr std::string_view sql = schema::zonegroup_select_default0;
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset);
+
+    read_zonegroup_row(reset, row);
+  } catch (const buffer::error& e) {
+    ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl;
+    return -EIO;
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  info = std::move(row.info);
+  if (writer) {
+    *writer = std::make_unique<SQLiteZoneGroupWriter>(
+        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+  }
+  return 0;
+}
+
+// Paginated listing of zonegroup names into 'entries', starting after
+// 'marker' (presumably an exclusive lower bound — confirm against
+// schema::zonegroup_select_names2). entries.size() is bound as the row
+// limit; read_text_rows fills 'result' with the rows and next marker.
+int SQLiteConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            const std::string& marker,
+                                            std::span<std::string> entries,
+                                            sal::ListResult<std::string>& result)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:list_zonegroup_names "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["zonegroup_sel_names"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::zonegroup_select_names2, P1, P2);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    auto reset = sqlite::stmt_execution{stmt.get()};
+
+    sqlite::bind_text(dpp, binding, P1, marker);
+    sqlite::bind_int(dpp, binding, P2, entries.size());
+
+    read_text_rows(dpp, reset, entries, result);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+
+// Zone
+
+class SQLiteZoneWriter : public sal::ZoneWriter {
+  SQLiteImpl* impl;
+  int ver;
+  std::string tag;
+  std::string zone_id;
+  std::string zone_name;
+ public:
+  SQLiteZoneWriter(SQLiteImpl* impl, int ver, std::string tag,
+                   std::string_view zone_id, std::string_view zone_name)
+    : impl(impl), ver(ver), tag(std::move(tag)),
+      zone_id(zone_id), zone_name(zone_name)
+  {}
+
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWZoneParams& info) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:zone_write "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after conflict or delete
+    }
+    if (zone_id != info.id || zone_name != info.name) {
+      return -EINVAL; // can't modify zone id or name directly
+    }
+
+    bufferlist bl;
+    encode(info, bl);
+    const auto data = std::string_view{bl.c_str(), bl.length()};
+
+    try {
+      auto conn = impl->get(dpp);
+      auto& stmt = conn->statements["zone_upd"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::zone_update5,
+                                            P1, P2, P3, P4, P5);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, info.id);
+      sqlite::bind_text(dpp, binding, P2, info.realm_id);
+      sqlite::bind_text(dpp, binding, P3, data);
+      sqlite::bind_int(dpp, binding, P4, ver);
+      sqlite::bind_text(dpp, binding, P5, tag);
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+        impl = nullptr;
+        return -ECANCELED;
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "zone update failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::foreign_key_constraint) {
+        return -EINVAL; // refers to nonexistent RealmID
+      } else if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    ++ver;
+    return 0;
+  }
+
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWZoneParams& info, std::string_view new_name) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:zone_rename "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after conflict or delete
+    }
+    if (zone_id != info.id || zone_name != info.name) {
+      return -EINVAL; // can't modify zone id or name directly
+    }
+    if (new_name.empty()) {
+      ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+      return -EINVAL;
+    }
+
+    try {
+      auto conn = impl->get(dpp);
+      auto& stmt = conn->statements["zone_rename"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::zone_rename4, P1, P2, P2, P3);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, info.id);
+      sqlite::bind_text(dpp, binding, P2, new_name);
+      sqlite::bind_int(dpp, binding, P3, ver);
+      sqlite::bind_text(dpp, binding, P4, tag);
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+        impl = nullptr;
+        return -ECANCELED;
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "zone rename failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::unique_constraint) {
+        return -EEXIST; // Name already taken
+      } else if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    info.name = std::string{new_name};
+    ++ver;
+    return 0;
+  }
+
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:zone_remove "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after conflict or delete
+    }
+    try {
+      auto conn = impl->get(dpp);
+      auto& stmt = conn->statements["zone_del"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::zone_delete3, P1, P2, P3);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, zone_id);
+      sqlite::bind_int(dpp, binding, P2, ver);
+      sqlite::bind_text(dpp, binding, P3, tag);
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      impl = nullptr;
+      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+        return -ECANCELED;
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "zone delete failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    return 0;
+  }
+}; // SQLiteZoneWriter
+
+
+// Set the default zone id for 'realm_id'. With exclusive=true a plain
+// INSERT is used (fails if a default already exists); otherwise an upsert
+// overwrites any existing default. Errors: -EINVAL empty zone id,
+// -EBUSY database locked, -EIO otherwise.
+int SQLiteConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp,
+                                             optional_yield y, bool exclusive,
+                                             std::string_view realm_id,
+                                             std::string_view zone_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:write_default_zone_id "}; dpp = &prefix;
+
+  if (zone_id.empty()) {
+    ldpp_dout(dpp, 0) << "requires a zone id" << dendl;
+    return -EINVAL;
+  }
+
+  try {
+    auto conn = impl->get(dpp);
+    // insert and upsert are distinct cached statements
+    sqlite::stmt_ptr* stmt = nullptr;
+    if (exclusive) {
+      stmt = &conn->statements["def_zone_ins"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::default_zone_insert2, P1, P2);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    } else {
+      stmt = &conn->statements["def_zone_ups"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::default_zone_upsert2, P1, P2);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    }
+    auto binding = sqlite::stmt_binding{stmt->get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+    sqlite::bind_text(dpp, binding, P2, zone_id);
+
+    auto reset = sqlite::stmt_execution{stmt->get()};
+    sqlite::eval0(dpp, reset);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default zone insert failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+// Read the default zone id for 'realm_id' into 'zone_id'.
+// Errors: -ENOENT if no default is set, -EBUSY database locked, -EIO
+// otherwise.
+int SQLiteConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            std::string_view realm_id,
+                                            std::string& zone_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zone_id "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["def_zone_sel"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::default_zone_select1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset);
+
+    // single text column holds the zone id
+    zone_id = sqlite::column_text(reset, 0);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default zone select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+// Delete the default zone entry for 'realm_id'.
+// Errors: -ENOENT if nothing was deleted, -EBUSY database locked, -EIO
+// otherwise.
+int SQLiteConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp,
+                                              optional_yield y,
+                                              std::string_view realm_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_zone_id "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["def_zone_del"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::default_zone_delete1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval0(dpp, reset);
+
+    // zero affected rows means there was no default to delete
+    if (!::sqlite3_changes(conn->db.get())) {
+      return -ENOENT;
+    }
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default zone delete failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+
+// Insert (exclusive=true) or upsert a Zones row for 'info', storing the
+// encoded RGWZoneParams blob along with a fresh VersionNumber=1 and a
+// generated VersionTag. If 'writer' is non-null, returns a writer handle
+// bound to that version for follow-up updates.
+// Errors: -EINVAL empty id/name or nonexistent RealmID, -EEXIST id or name
+// already taken, -EBUSY database locked, -EIO otherwise.
+int SQLiteConfigStore::create_zone(const DoutPrefixProvider* dpp,
+                                   optional_yield y, bool exclusive,
+                                   const RGWZoneParams& info,
+                                   std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:create_zone "}; dpp = &prefix;
+
+  if (info.id.empty()) {
+    ldpp_dout(dpp, 0) << "zone cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.name.empty()) {
+    ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
+    return -EINVAL;
+  }
+
+  // new rows start at version 1 with a unique tag for optimistic concurrency
+  int ver = 1;
+  auto tag = generate_version_tag(dpp->get_cct());
+
+  bufferlist bl;
+  encode(info, bl);
+  const auto data = std::string_view{bl.c_str(), bl.length()};
+
+  try {
+    auto conn = impl->get(dpp);
+    sqlite::stmt_ptr* stmt = nullptr;
+    if (exclusive) {
+      stmt = &conn->statements["zone_ins"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::zone_insert6,
+                                            P1, P2, P3, P4, P5, P6);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    } else {
+      stmt = &conn->statements["zone_ups"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::zone_upsert6,
+                                            P1, P2, P3, P4, P5, P6);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    }
+    auto binding = sqlite::stmt_binding{stmt->get()};
+    sqlite::bind_text(dpp, binding, P1, info.id);
+    sqlite::bind_text(dpp, binding, P2, info.name);
+    sqlite::bind_text(dpp, binding, P3, info.realm_id);
+    sqlite::bind_text(dpp, binding, P4, data);
+    sqlite::bind_int(dpp, binding, P5, ver);
+    sqlite::bind_text(dpp, binding, P6, tag);
+
+    auto reset = sqlite::stmt_execution{stmt->get()};
+    sqlite::eval0(dpp, reset);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "zone insert failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::foreign_key_constraint) {
+      return -EINVAL; // refers to nonexistent RealmID
+    } else if (e.code() == sqlite::errc::primary_key_constraint) {
+      return -EEXIST; // ID already taken
+    } else if (e.code() == sqlite::errc::unique_constraint) {
+      return -EEXIST; // Name already taken
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<SQLiteZoneWriter>(
+        impl.get(), ver, std::move(tag), info.id, info.name);
+  }
+  return 0;
+}
+
+int SQLiteConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp,
+                                       optional_yield y,
+                                       std::string_view zone_id,
+                                       RGWZoneParams& info,
+                                       std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_zone_by_id "}; dpp = &prefix;
+
+  if (zone_id.empty()) {
+    ldpp_dout(dpp, 0) << "requires a zone id" << dendl;
+    return -EINVAL;
+  }
+
+  ZoneRow row;
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["zone_sel_id"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::zone_select_id1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, zone_id);
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset);
+
+    read_zone_row(reset, row);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  info = std::move(row.info);
+  if (writer) {
+    *writer = std::make_unique<SQLiteZoneWriter>(
+        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+  }
+  return 0;
+}
+
+int SQLiteConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp,
+                                         optional_yield y,
+                                         std::string_view zone_name,
+                                         RGWZoneParams& info,
+                                         std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_zone_by_name "}; dpp = &prefix;
+
+  if (zone_name.empty()) {
+    ldpp_dout(dpp, 0) << "requires a zone name" << dendl;
+    return -EINVAL;
+  }
+
+  ZoneRow row;
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["zone_sel_name"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::zone_select_name1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, zone_name);
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset);
+
+    read_zone_row(reset, row);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  info = std::move(row.info);
+  if (writer) {
+    *writer = std::make_unique<SQLiteZoneWriter>(
+        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+  }
+  return 0;
+}
+
+int SQLiteConfigStore::read_default_zone(const DoutPrefixProvider* dpp,
+                                         optional_yield y,
+                                         std::string_view realm_id,
+                                         RGWZoneParams& info,
+                                         std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zone "}; dpp = &prefix;
+
+  ZoneRow row;
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["zone_sel_def"];
+    if (!stmt) {
+      static constexpr std::string_view sql = schema::zone_select_default0;
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset);
+
+    read_zone_row(reset, row);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  info = std::move(row.info);
+  if (writer) {
+    *writer = std::make_unique<SQLiteZoneWriter>(
+        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+  }
+  return 0;
+}
+
+// Paginated listing of zone names into 'entries', starting after 'marker'
+// (presumably an exclusive lower bound — confirm against
+// schema::zone_select_names2). entries.size() is bound as the row limit.
+int SQLiteConfigStore::list_zone_names(const DoutPrefixProvider* dpp,
+                                       optional_yield y,
+                                       const std::string& marker,
+                                       std::span<std::string> entries,
+                                       sal::ListResult<std::string>& result)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:list_zone_names "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["zone_sel_names"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::zone_select_names2, P1, P2);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, marker);
+    sqlite::bind_int(dpp, binding, P2, entries.size());
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    read_text_rows(dpp, reset, entries, result);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+
+// PeriodConfig
+
+// Read and decode the RGWPeriodConfig blob stored for 'realm_id'.
+// Errors: -ENOENT no row, -EBUSY database locked, -EIO on sql or decode
+// failure.
+int SQLiteConfigStore::read_period_config(const DoutPrefixProvider* dpp,
+                                          optional_yield y,
+                                          std::string_view realm_id,
+                                          RGWPeriodConfig& info)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_period_config "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["period_conf_sel"];
+    if (!stmt) {
+      const std::string sql = fmt::format(schema::period_config_select1, P1);
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto binding = sqlite::stmt_binding{stmt.get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, reset);
+
+    // static_from_string gives a zero-copy bufferlist view over 'data';
+    // safe here because decode completes before 'data' leaves scope
+    std::string data = sqlite::column_text(reset, 0);
+    bufferlist bl = bufferlist::static_from_string(data);
+    auto p = bl.cbegin();
+    decode(info, p);
+
+  } catch (const buffer::error& e) {
+    ldpp_dout(dpp, 20) << "period config decode failed: " << e.what() << dendl;
+    return -EIO;
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "period config select failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::done) {
+      return -ENOENT;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+// Insert (exclusive=true) or upsert the encoded RGWPeriodConfig blob for
+// 'realm_id'. Errors: -EEXIST row already present (exclusive only),
+// -EBUSY database locked, -EIO otherwise.
+int SQLiteConfigStore::write_period_config(const DoutPrefixProvider* dpp,
+                                           optional_yield y, bool exclusive,
+                                           std::string_view realm_id,
+                                           const RGWPeriodConfig& info)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:write_period_config "}; dpp = &prefix;
+
+  bufferlist bl;
+  encode(info, bl);
+  const auto data = std::string_view{bl.c_str(), bl.length()};
+
+  try {
+    auto conn = impl->get(dpp);
+    sqlite::stmt_ptr* stmt = nullptr;
+    if (exclusive) {
+      stmt = &conn->statements["period_conf_ins"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::period_config_insert2, P1, P2);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    } else {
+      stmt = &conn->statements["period_conf_ups"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::period_config_upsert2, P1, P2);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    }
+    auto binding = sqlite::stmt_binding{stmt->get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+    sqlite::bind_text(dpp, binding, P2, data);
+
+    auto reset = sqlite::stmt_execution{stmt->get()};
+    sqlite::eval0(dpp, reset);
+  } catch (const buffer::error& e) {
+    // NOTE(review): this is the write path and encode happens above,
+    // outside the try, so this catch looks unreachable and the "decode"
+    // wording is likely a copy-paste — confirm before removing
+    ldpp_dout(dpp, 20) << "period config decode failed: " << e.what() << dendl;
+    return -EIO;
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "period config insert failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::primary_key_constraint) {
+      return -EEXIST;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+namespace {
+
+// sqlite3_exec() row callback for "PRAGMA user_version". Expects exactly
+// one column named "user_version" and parses its value into the uint32_t
+// pointed to by 'user'. Returning nonzero aborts the exec.
+int version_cb(void* user, int count, char** values, char** names)
+{
+  if (count != 1) {
+    return EINVAL;
+  }
+  std::string_view name = names[0];
+  if (name != "user_version") {
+    return EINVAL;
+  }
+  std::string_view value = values[0];
+  // locale-independent, non-throwing integer parse
+  auto result = std::from_chars(value.begin(), value.end(),
+                                *reinterpret_cast<uint32_t*>(user));
+  if (result.ec != std::errc{}) {
+    return static_cast<int>(result.ec);
+  }
+  return 0;
+}
+
+// Bring the database schema up to date. Reads PRAGMA user_version inside a
+// transaction, applies every schema::migrations entry from that index
+// onward, then commits with the new user_version (or rolls back if nothing
+// changed). Throws sqlite::error if any migration fails.
+void apply_schema_migrations(const DoutPrefixProvider* dpp, sqlite3* db)
+{
+  sqlite::execute(dpp, db, "PRAGMA foreign_keys = ON", nullptr, nullptr);
+
+  // initiate a transaction and read the current schema version
+  uint32_t version = 0;
+  sqlite::execute(dpp, db, "BEGIN; PRAGMA user_version", version_cb, &version);
+
+  const uint32_t initial_version = version;
+  ldpp_dout(dpp, 4) << "current schema version " << version << dendl;
+
+  // use the version as an index into schema::migrations
+  auto m = std::next(schema::migrations.begin(), version);
+
+  for (; m != schema::migrations.end(); ++m, ++version) {
+    try {
+      sqlite::execute(dpp, db, m->up, nullptr, nullptr);
+    } catch (const sqlite::error&) {
+      // log which migration broke, then rethrow so the caller aborts
+      ldpp_dout(dpp, -1) << "ERROR: schema migration failed on v" << version
+          << ": " << m->description << dendl;
+      throw;
+    }
+  }
+
+  if (version > initial_version) {
+    // update the user_version and commit the transaction
+    const auto commit = fmt::format("PRAGMA user_version = {}; COMMIT", version);
+    sqlite::execute(dpp, db, commit.c_str(), nullptr, nullptr);
+
+    ldpp_dout(dpp, 4) << "upgraded database schema to version " << version << dendl;
+  } else {
+    // nothing to commit
+    sqlite::execute(dpp, db, "ROLLBACK", nullptr, nullptr);
+  }
+}
+
+} // anonymous namespace
+
+
+// Factory: open (creating if necessary) the sqlite database at 'uri',
+// run schema migrations on a first connection, and return the store.
+// Failures surface as exceptions from the sqlite layer.
+auto create_sqlite_store(const DoutPrefixProvider* dpp, const std::string& uri)
+  -> std::unique_ptr<config::SQLiteConfigStore>
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:create_sqlite_store "}; dpp = &prefix;
+
+  // build the connection pool
+  int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_URI | SQLITE_OPEN_READWRITE |
+      SQLITE_OPEN_NOMUTEX;
+  auto factory = sqlite::ConnectionFactory{uri, flags};
+
+  // sqlite does not support concurrent writers. we enforce this limitation by
+  // using a connection pool of size=1
+  static constexpr size_t max_connections = 1;
+  auto impl = std::make_unique<SQLiteImpl>(std::move(factory), max_connections);
+
+  // open a connection to apply schema migrations
+  auto conn = impl->get(dpp);
+  apply_schema_migrations(dpp, conn->db.get());
+
+  return std::make_unique<SQLiteConfigStore>(std::move(impl));
+}
+
+} // namespace rgw::dbstore::config
diff --git a/src/rgw/driver/dbstore/config/sqlite.h b/src/rgw/driver/dbstore/config/sqlite.h
new file mode 100644 (file)
index 0000000..d79e040
--- /dev/null
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal_config.h"
+
+class DoutPrefixProvider;
+
+namespace rgw::dbstore::config {
+
+struct SQLiteImpl;
+
+// sal::ConfigStore backed by a sqlite database. All state lives in the
+// pimpl'd SQLiteImpl (connection pool + cached prepared statements);
+// methods return 0 on success or a negative errno.
+class SQLiteConfigStore : public sal::ConfigStore {
+ public:
+  explicit SQLiteConfigStore(std::unique_ptr<SQLiteImpl> impl);
+  ~SQLiteConfigStore() override;
+
+  // Realm
+  int write_default_realm_id(const DoutPrefixProvider* dpp,
+                             optional_yield y, bool exclusive,
+                             std::string_view realm_id) override;
+  int read_default_realm_id(const DoutPrefixProvider* dpp,
+                            optional_yield y,
+                            std::string& realm_id) override;
+  int delete_default_realm_id(const DoutPrefixProvider* dpp,
+                              optional_yield y) override;
+
+  int create_realm(const DoutPrefixProvider* dpp,
+                   optional_yield y, bool exclusive,
+                   const RGWRealm& info,
+                   std::unique_ptr<sal::RealmWriter>* writer) override;
+  int read_realm_by_id(const DoutPrefixProvider* dpp,
+                       optional_yield y,
+                       std::string_view realm_id,
+                       RGWRealm& info,
+                       std::unique_ptr<sal::RealmWriter>* writer) override;
+  int read_realm_by_name(const DoutPrefixProvider* dpp,
+                         optional_yield y,
+                         std::string_view realm_name,
+                         RGWRealm& info,
+                         std::unique_ptr<sal::RealmWriter>* writer) override;
+  int read_default_realm(const DoutPrefixProvider* dpp,
+                         optional_yield y,
+                         RGWRealm& info,
+                         std::unique_ptr<sal::RealmWriter>* writer) override;
+  int read_realm_id(const DoutPrefixProvider* dpp,
+                    optional_yield y, std::string_view realm_name,
+                    std::string& realm_id) override;
+  int realm_notify_new_period(const DoutPrefixProvider* dpp,
+                              optional_yield y,
+                              const RGWPeriod& period) override;
+  int list_realm_names(const DoutPrefixProvider* dpp,
+                       optional_yield y, const std::string& marker,
+                       std::span<std::string> entries,
+                       sal::ListResult<std::string>& result) override;
+
+  // Period
+  int create_period(const DoutPrefixProvider* dpp,
+                    optional_yield y, bool exclusive,
+                    const RGWPeriod& info) override;
+  int read_period(const DoutPrefixProvider* dpp,
+                  optional_yield y, std::string_view period_id,
+                  std::optional<uint32_t> epoch, RGWPeriod& info) override;
+  int delete_period(const DoutPrefixProvider* dpp,
+                    optional_yield y,
+                    std::string_view period_id) override;
+  int list_period_ids(const DoutPrefixProvider* dpp,
+                      optional_yield y, const std::string& marker,
+                      std::span<std::string> entries,
+                      sal::ListResult<std::string>& result) override;
+
+  // ZoneGroup
+  int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                 optional_yield y, bool exclusive,
+                                 std::string_view realm_id,
+                                 std::string_view zonegroup_id) override;
+  int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                optional_yield y,
+                                std::string_view realm_id,
+                                std::string& zonegroup_id) override;
+  int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                  optional_yield y,
+                                  std::string_view realm_id) override;
+
+  int create_zonegroup(const DoutPrefixProvider* dpp,
+                       optional_yield y, bool exclusive,
+                       const RGWZoneGroup& info,
+                       std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+                           optional_yield y,
+                           std::string_view zonegroup_id,
+                           RGWZoneGroup& info,
+                           std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+                             optional_yield y,
+                             std::string_view zonegroup_name,
+                             RGWZoneGroup& info,
+                             std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  int read_default_zonegroup(const DoutPrefixProvider* dpp,
+                             optional_yield y,
+                             std::string_view realm_id,
+                             RGWZoneGroup& info,
+                             std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  int list_zonegroup_names(const DoutPrefixProvider* dpp,
+                           optional_yield y, const std::string& marker,
+                           std::span<std::string> entries,
+                           sal::ListResult<std::string>& result) override;
+
+  // Zone
+  int write_default_zone_id(const DoutPrefixProvider* dpp,
+                            optional_yield y, bool exclusive,
+                            std::string_view realm_id,
+                            std::string_view zone_id) override;
+  int read_default_zone_id(const DoutPrefixProvider* dpp,
+                           optional_yield y,
+                           std::string_view realm_id,
+                           std::string& zone_id) override;
+  int delete_default_zone_id(const DoutPrefixProvider* dpp,
+                             optional_yield y,
+                             std::string_view realm_id) override;
+
+  int create_zone(const DoutPrefixProvider* dpp,
+                  optional_yield y, bool exclusive,
+                  const RGWZoneParams& info,
+                  std::unique_ptr<sal::ZoneWriter>* writer) override;
+  int read_zone_by_id(const DoutPrefixProvider* dpp,
+                      optional_yield y,
+                      std::string_view zone_id,
+                      RGWZoneParams& info,
+                      std::unique_ptr<sal::ZoneWriter>* writer) override;
+  int read_zone_by_name(const DoutPrefixProvider* dpp,
+                        optional_yield y,
+                        std::string_view zone_name,
+                        RGWZoneParams& info,
+                        std::unique_ptr<sal::ZoneWriter>* writer) override;
+  int read_default_zone(const DoutPrefixProvider* dpp,
+                        optional_yield y,
+                        std::string_view realm_id,
+                        RGWZoneParams& info,
+                        std::unique_ptr<sal::ZoneWriter>* writer) override;
+  int list_zone_names(const DoutPrefixProvider* dpp,
+                      optional_yield y, const std::string& marker,
+                      std::span<std::string> entries,
+                      sal::ListResult<std::string>& result) override;
+
+  // PeriodConfig
+  int read_period_config(const DoutPrefixProvider* dpp,
+                         optional_yield y,
+                         std::string_view realm_id,
+                         RGWPeriodConfig& info) override;
+  int write_period_config(const DoutPrefixProvider* dpp,
+                          optional_yield y, bool exclusive,
+                          std::string_view realm_id,
+                          const RGWPeriodConfig& info) override;
+
+ private:
+  std::unique_ptr<SQLiteImpl> impl;
+}; // SQLiteConfigStore
+
+
+// Open (creating if necessary) the sqlite database at 'uri', apply schema
+// migrations, and return the ConfigStore. Errors surface as exceptions
+// from the sqlite layer.
+auto create_sqlite_store(const DoutPrefixProvider* dpp, const std::string& uri)
+  -> std::unique_ptr<config::SQLiteConfigStore>;
+
+} // namespace rgw::dbstore::config
diff --git a/src/rgw/driver/dbstore/config/sqlite_schema.h b/src/rgw/driver/dbstore/config/sqlite_schema.h
new file mode 100644 (file)
index 0000000..c8a8fce
--- /dev/null
@@ -0,0 +1,299 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <initializer_list>
+
+namespace rgw::dbstore::config::schema {
+
+struct Migration {
+  // human-readable description to help with debugging migration errors
+  const char* description = nullptr;
+  // series of sql statements to apply the schema migration
+  const char* up = nullptr;
+  // series of sql statements to undo the schema migration
+  const char* down = nullptr;
+};
+
// Ordered list of schema migrations. Each entry's 'up' statements are
// applied in sequence to bring the database to the current schema; the
// matching 'down' statements revert that step.
static constexpr std::initializer_list<Migration> migrations {{
    .description = "create the initial ConfigStore tables",
    .up = R"(
CREATE TABLE IF NOT EXISTS Realms (
  ID TEXT PRIMARY KEY NOT NULL,
  Name TEXT UNIQUE NOT NULL,
  CurrentPeriod TEXT,
  Epoch INTEGER DEFAULT 0,
  VersionNumber INTEGER,
  VersionTag TEXT
);
CREATE TABLE IF NOT EXISTS Periods (
  ID TEXT NOT NULL,
  Epoch INTEGER DEFAULT 0,
  RealmID TEXT NOT NULL REFERENCES Realms (ID),
  Data TEXT NOT NULL,
  PRIMARY KEY (ID, Epoch)
);
CREATE TABLE IF NOT EXISTS PeriodConfigs (
  RealmID TEXT PRIMARY KEY NOT NULL REFERENCES Realms (ID),
  Data TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS ZoneGroups (
  ID TEXT PRIMARY KEY NOT NULL,
  Name TEXT UNIQUE NOT NULL,
  RealmID TEXT NOT NULL REFERENCES Realms (ID),
  Data TEXT NOT NULL,
  VersionNumber INTEGER,
  VersionTag TEXT
);
CREATE TABLE IF NOT EXISTS Zones (
  ID TEXT PRIMARY KEY NOT NULL,
  Name TEXT UNIQUE NOT NULL,
  RealmID TEXT NOT NULL REFERENCES Realms (ID),
  Data TEXT NOT NULL,
  VersionNumber INTEGER,
  VersionTag TEXT
);
CREATE TABLE IF NOT EXISTS DefaultRealms (
  ID TEXT,
  Empty TEXT PRIMARY KEY
);
CREATE TABLE IF NOT EXISTS DefaultZoneGroups (
  ID TEXT,
  RealmID TEXT PRIMARY KEY REFERENCES Realms (ID)
);
CREATE TABLE IF NOT EXISTS DefaultZones (
  ID TEXT,
  RealmID TEXT PRIMARY KEY REFERENCES Realms (ID)
);
)",
    // NOTE(review): 'down' drops Realms before the tables that reference
    // it via foreign keys; fine while sqlite FK enforcement is off by
    // default — confirm if PRAGMA foreign_keys is ever enabled
    .down = R"(
DROP TABLE IF EXISTS Realms;
DROP TABLE IF EXISTS Periods;
DROP TABLE IF EXISTS PeriodConfigs;
DROP TABLE IF EXISTS ZoneGroups;
DROP TABLE IF EXISTS Zones;
DROP TABLE IF EXISTS DefaultRealms;
DROP TABLE IF EXISTS DefaultZoneGroups;
DROP TABLE IF EXISTS DefaultZones;
)"
  }
};
+
+
// DefaultRealms
//
// Statement naming convention used throughout this header: the numeric
// suffix is the number of '{}' placeholders in the statement text —
// presumably substituted (via fmt) with bound-parameter names before
// preparation; confirm against the sqlite backend.

static constexpr const char* default_realm_insert1 =
"INSERT INTO DefaultRealms (ID, Empty) VALUES ({}, '')";

// single-row table keyed on the constant Empty string, so the upsert
// always targets that one row
static constexpr const char* default_realm_upsert1 =
R"(INSERT INTO DefaultRealms (ID, Empty) VALUES ({0}, '')
ON CONFLICT(Empty) DO UPDATE SET ID = {0})";

static constexpr const char* default_realm_select0 =
"SELECT ID FROM DefaultRealms LIMIT 1";

static constexpr const char* default_realm_delete0 =
"DELETE FROM DefaultRealms";
+
+
+// Realms
+
+static constexpr const char* realm_update5 =
+"UPDATE Realms SET CurrentPeriod = {1}, Epoch = {2}, VersionNumber = {3} + 1 \
+WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}";
+
+static constexpr const char* realm_rename4 =
+"UPDATE Realms SET Name = {1}, VersionNumber = {2} + 1 \
+WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}";
+
+static constexpr const char* realm_delete3 =
+"DELETE FROM Realms WHERE ID = {} AND VersionNumber = {} AND VersionTag = {}";
+
+static constexpr const char* realm_insert4 =
+"INSERT INTO Realms (ID, Name, VersionNumber, VersionTag) \
+VALUES ({}, {}, {}, {})";
+
+static constexpr const char* realm_upsert4 =
+"INSERT INTO Realms (ID, Name, VersionNumber, VersionTag) \
+VALUES ({0}, {1}, {2}, {3}) \
+ON CONFLICT(ID) DO UPDATE SET Name = {1}, \
+VersionNumber = {2}, VersionTag = {3}";
+
+static constexpr const char* realm_select_id1 =
+"SELECT * FROM Realms WHERE ID = {} LIMIT 1";
+
+static constexpr const char* realm_select_name1 =
+"SELECT * FROM Realms WHERE Name = {} LIMIT 1";
+
+static constexpr const char* realm_select_default0 =
+"SELECT r.* FROM Realms r \
+INNER JOIN DefaultRealms d \
+ON d.ID = r.ID LIMIT 1";
+
+static constexpr const char* realm_select_names2 =
+"SELECT Name FROM Realms WHERE Name > {} \
+ORDER BY Name ASC LIMIT {}";
+
+
// Periods

static constexpr const char* period_insert4 =
"INSERT INTO Periods (ID, Epoch, RealmID, Data) \
VALUES ({}, {}, {}, {})";

// NOTE(review): "ON CONFLICT DO UPDATE" with no conflict target is only
// accepted by sqlite >= 3.35.0 — confirm the minimum supported sqlite
// version, or spell out the target "(ID, Epoch)" as the other upserts do
static constexpr const char* period_upsert4 =
"INSERT INTO Periods (ID, Epoch, RealmID, Data) \
VALUES ({0}, {1}, {2}, {3}) \
ON CONFLICT DO UPDATE SET RealmID = {2}, Data = {3}";

static constexpr const char* period_select_epoch2 =
"SELECT * FROM Periods WHERE ID = {} AND Epoch = {} LIMIT 1";

// newest epoch for a given period id
static constexpr const char* period_select_latest1 =
"SELECT * FROM Periods WHERE ID = {} ORDER BY Epoch DESC LIMIT 1";

// removes every epoch of the period
static constexpr const char* period_delete1 =
"DELETE FROM Periods WHERE ID = {}";

static constexpr const char* period_select_ids2 =
"SELECT ID FROM Periods WHERE ID > {} ORDER BY ID ASC LIMIT {}";
+
+
+// DefaultZoneGroups
+
+static constexpr const char* default_zonegroup_insert2 =
+"INSERT INTO DefaultZoneGroups (RealmID, ID) VALUES ({}, {})";
+
+static constexpr const char* default_zonegroup_upsert2 =
+"INSERT INTO DefaultZoneGroups (RealmID, ID) \
+VALUES ({0}, {1}) \
+ON CONFLICT(RealmID) DO UPDATE SET ID = {1}";
+
+static constexpr const char* default_zonegroup_select1 =
+"SELECT ID FROM DefaultZoneGroups WHERE RealmID = {}";
+
+static constexpr const char* default_zonegroup_delete1 =
+"DELETE FROM DefaultZoneGroups WHERE RealmID = {}";
+
+
+// ZoneGroups
+
+static constexpr const char* zonegroup_update5 =
+"UPDATE ZoneGroups SET RealmID = {1}, Data = {2}, VersionNumber = {3} + 1 \
+WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}";
+
+static constexpr const char* zonegroup_rename4 =
+"UPDATE ZoneGroups SET Name = {1}, VersionNumber = {2} + 1 \
+WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}";
+
+static constexpr const char* zonegroup_delete3 =
+"DELETE FROM ZoneGroups WHERE ID = {} \
+AND VersionNumber = {} AND VersionTag = {}";
+
+static constexpr const char* zonegroup_insert6 =
+"INSERT INTO ZoneGroups (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
+VALUES ({}, {}, {}, {}, {}, {})";
+
+static constexpr const char* zonegroup_upsert6 =
+"INSERT INTO ZoneGroups (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
+VALUES ({0}, {1}, {2}, {3}, {4}, {5}) \
+ON CONFLICT (ID) DO UPDATE SET Name = {1}, RealmID = {2}, \
+Data = {3}, VersionNumber = {4}, VersionTag = {5}";
+
+static constexpr const char* zonegroup_select_id1 =
+"SELECT * FROM ZoneGroups WHERE ID = {} LIMIT 1";
+
+static constexpr const char* zonegroup_select_name1 =
+"SELECT * FROM ZoneGroups WHERE Name = {} LIMIT 1";
+
+static constexpr const char* zonegroup_select_default0 =
+"SELECT z.* FROM ZoneGroups z \
+INNER JOIN DefaultZoneGroups d \
+ON d.ID = z.ID LIMIT 1";
+
+static constexpr const char* zonegroup_select_names2 =
+"SELECT Name FROM ZoneGroups WHERE Name > {} \
+ORDER BY Name ASC LIMIT {}";
+
+
+// DefaultZones
+
+static constexpr const char* default_zone_insert2 =
+"INSERT INTO DefaultZones (RealmID, ID) VALUES ({}, {})";
+
+static constexpr const char* default_zone_upsert2 =
+"INSERT INTO DefaultZones (RealmID, ID) VALUES ({0}, {1}) \
+ON CONFLICT(RealmID) DO UPDATE SET ID = {1}";
+
+static constexpr const char* default_zone_select1 =
+"SELECT ID FROM DefaultZones WHERE RealmID = {}";
+
+static constexpr const char* default_zone_delete1 =
+"DELETE FROM DefaultZones WHERE RealmID = {}";
+
+
+// Zones
+
+static constexpr const char* zone_update5 =
+"UPDATE Zones SET RealmID = {1}, Data = {2}, VersionNumber = {3} + 1 \
+WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}";
+
+static constexpr const char* zone_rename4 =
+"UPDATE Zones SET Name = {1}, VersionNumber = {2} + 1 \
+WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}";
+
+static constexpr const char* zone_delete3 =
+"DELETE FROM Zones WHERE ID = {} AND VersionNumber = {} AND VersionTag = {}";
+
+static constexpr const char* zone_insert6 =
+"INSERT INTO Zones (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
+VALUES ({}, {}, {}, {}, {}, {})";
+
+static constexpr const char* zone_upsert6 =
+"INSERT INTO Zones (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
+VALUES ({0}, {1}, {2}, {3}, {4}, {5}) \
+ON CONFLICT (ID) DO UPDATE SET Name = {1}, RealmID = {2}, \
+Data = {3}, VersionNumber = {4}, VersionTag = {5}";
+
+static constexpr const char* zone_select_id1 =
+"SELECT * FROM Zones WHERE ID = {} LIMIT 1";
+
+static constexpr const char* zone_select_name1 =
+"SELECT * FROM Zones WHERE Name = {} LIMIT 1";
+
+static constexpr const char* zone_select_default0 =
+"SELECT z.* FROM Zones z \
+INNER JOIN DefaultZones d \
+ON d.ID = z.ID LIMIT 1";
+
+static constexpr const char* zone_select_names2 =
+"SELECT Name FROM Zones WHERE Name > {} \
+ORDER BY Name ASC LIMIT {}";
+
+
+// PeriodConfigs
+
+static constexpr const char* period_config_insert2 =
+"INSERT INTO PeriodConfigs (RealmID, Data) VALUES ({}, {})";
+
+static constexpr const char* period_config_upsert2 =
+"INSERT INTO PeriodConfigs (RealmID, Data) VALUES ({0}, {1}) \
+ON CONFLICT (RealmID) DO UPDATE SET Data = {1}";
+
+static constexpr const char* period_config_select1 =
+"SELECT Data FROM PeriodConfigs WHERE RealmID = {} LIMIT 1";
+
+} // namespace rgw::dbstore::config::schema
diff --git a/src/rgw/driver/dbstore/config/store.cc b/src/rgw/driver/dbstore/config/store.cc
new file mode 100644 (file)
index 0000000..66f7471
--- /dev/null
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <stdexcept>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include <fmt/format.h>
+
+#include "store.h"
+#ifdef SQLITE_ENABLED
+#include "sqlite.h"
+#endif
+
+namespace rgw::dbstore {
+
+auto create_config_store(const DoutPrefixProvider* dpp, const std::string& uri)
+  -> std::unique_ptr<sal::ConfigStore>
+{
+#ifdef SQLITE_ENABLED
+  if (uri.starts_with("file:")) {
+    return config::create_sqlite_store(dpp, uri);
+  }
+#endif
+  throw std::runtime_error(fmt::format("unrecognized URI {}", uri));
+}
+
+} // namespace rgw::dbstore
diff --git a/src/rgw/driver/dbstore/config/store.h b/src/rgw/driver/dbstore/config/store.h
new file mode 100644 (file)
index 0000000..553d9f7
--- /dev/null
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include "rgw_sal_config.h"
+
+namespace rgw::dbstore {
+
+// ConfigStore factory
+auto create_config_store(const DoutPrefixProvider* dpp, const std::string& uri)
+  -> std::unique_ptr<sal::ConfigStore>;
+
+} // namespace rgw::dbstore
diff --git a/src/rgw/driver/dbstore/dbstore_main.cc b/src/rgw/driver/dbstore/dbstore_main.cc
new file mode 100644 (file)
index 0000000..08724d8
--- /dev/null
@@ -0,0 +1,201 @@
+#include <stdio.h>
+#include <sqlite3.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "dbstore_mgr.h"
+#include <dbstore.h>
+#include <dbstore_log.h>
+
+using namespace std;
+using namespace rgw::store;
+using DB = rgw::store::DB;
+
// per-thread arguments handed to process()
struct thr_args {
  DB *dbs;     // shared DB handle under test
  int thr_id;  // ordinal, used only in log output
};
+
+void* process(void *arg)
+{
+  struct thr_args *t_args = (struct thr_args*)arg;
+
+  DB *db = t_args->dbs;
+  int thr_id = t_args->thr_id;
+  int ret = -1;
+
+  cout<<"Entered thread:"<<thr_id<<"\n";
+
+  string user1 = "User1";
+  string bucketa = "rgw";
+  string objecta1 = "bugfixing";
+  string objecta2 = "zipper";
+  string bucketb = "gluster";
+  string objectb1 = "bugfixing";
+  string objectb2 = "delegations";
+
+  string user2 = "User2";
+  string bucketc = "qe";
+  string objectc1 = "rhhi";
+  string objectc2 = "cns";
+
+  DBOpParams params = {};
+  const DoutPrefixProvider *dpp = db->get_def_dpp();
+
+  db->InitializeParams(dpp, &params);
+
+  params.op.user.uinfo.display_name = user1;
+  params.op.user.uinfo.user_id.tenant = "tenant";
+  params.op.user.uinfo.user_id.id = user1;
+  params.op.user.uinfo.suspended = 123;
+  params.op.user.uinfo.max_buckets = 456;
+  params.op.user.uinfo.assumed_role_arn = "role";
+  params.op.user.uinfo.placement_tags.push_back("tags1");
+  params.op.user.uinfo.placement_tags.push_back("tags2");
+
+  RGWAccessKey k1("id1", "key1");
+  RGWAccessKey k2("id2", "key2");
+  params.op.user.uinfo.access_keys.insert(make_pair("key1", k1));
+  params.op.user.uinfo.access_keys.insert(make_pair("key2", k2));
+
+  ret = db->ProcessOp(dpp, "InsertUser", &params);
+  cout << "InsertUser return value: " <<  ret << "\n";
+
+  DBOpParams params2 = {};
+  params.op.user.uinfo.user_id.tenant = "tenant2";
+
+  db->InitializeParams(dpp, &params2);
+  params2.op.user.uinfo.display_name = user1;
+  ret = db->ProcessOp(dpp, "GetUser", &params2);
+
+  cout << "GetUser return value: " <<  ret << "\n";
+
+  cout << "tenant: " << params2.op.user.uinfo.user_id.tenant << "\n";
+  cout << "suspended: " << (int)params2.op.user.uinfo.suspended << "\n";
+  cout << "assumed_role_arn: " << params2.op.user.uinfo.assumed_role_arn << "\n";
+
+  list<string>::iterator it = params2.op.user.uinfo.placement_tags.begin();
+
+  while (it != params2.op.user.uinfo.placement_tags.end()) {
+    cout << "list = " << *it << "\n";
+    it++;
+  }
+
+  map<string, RGWAccessKey>::iterator it2 = params2.op.user.uinfo.access_keys.begin();
+
+  while (it2 != params2.op.user.uinfo.access_keys.end()) {
+    cout << "keys = " << it2->first << "\n";
+    RGWAccessKey k = it2->second;
+    cout << "id = " << k.id << ", keys = " << k.key << "\n";
+    it2++;
+  }
+
+  params.op.bucket.info.bucket.name = bucketa;
+  db->ProcessOp(dpp, "InsertBucket", &params);
+
+  params.op.user.uinfo.display_name = user2;
+  params.op.user.uinfo.user_id.id = user2;
+  db->ProcessOp(dpp, "InsertUser", &params);
+
+  params.op.bucket.info.bucket.name = bucketb;
+  db->ProcessOp(dpp, "InsertBucket", &params);
+
+  db->ProcessOp(dpp, "GetUser", &params);
+  db->ProcessOp(dpp, "GetBucket", &params);
+
+  db->ListAllUsers(dpp, &params);
+  db->ListAllBuckets(dpp, &params);
+
+  params.op.bucket.info.bucket.name = bucketb;
+
+  db->ProcessOp(dpp, "RemoveBucket", &params);
+
+  params.op.user.uinfo.user_id.id = user2;
+  db->ProcessOp(dpp, "RemoveUser", &params);
+
+  db->ListAllUsers(dpp, &params);
+  db->ListAllBuckets(dpp, &params);
+  cout<<"Exiting thread:"<<thr_id<<"\n";
+
+  return 0;
+}
+
+int main(int argc, char *argv[])
+{
+  string tenant = "Redhat";
+  string logfile = "rgw_dbstore_bin.log";
+  int loglevel = 20;
+
+  DBStoreManager *dbsm;
+  DB *dbs;
+  int rc = 0, tnum = 0;
+  void *res;
+
+  pthread_attr_t attr;
+  int num_thr = 2;
+  pthread_t threads[num_thr];
+  struct thr_args t_args[num_thr];
+
+
+  cout << "loglevel  " << loglevel << "\n";
+  // format: ./dbstore-bin logfile loglevel
+  if (argc == 3) {
+       logfile = argv[1];
+       loglevel = (atoi)(argv[2]);
+       cout << "loglevel set to " << loglevel << "\n";
+  }
+
+  vector<const char*> args;
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                CODE_ENVIRONMENT_DAEMON, CINIT_FLAG_NO_MON_CONFIG, 1);
+  dbsm = new DBStoreManager(cct.get(), logfile, loglevel);
+  dbs = dbsm->getDB(tenant, true);
+
+  cout<<"No. of threads being created = "<<num_thr<<"\n";
+
+  /* Initialize thread creation attributes */
+  rc = pthread_attr_init(&attr);
+
+  if (rc != 0) {
+    cout<<" error in pthread_attr_init \n";
+    goto out;
+  }
+
+  for (tnum = 0; tnum < num_thr; tnum++) {
+    t_args[tnum].dbs = dbs;
+    t_args[tnum].thr_id = tnum;
+    rc = pthread_create((pthread_t*)&threads[tnum], &attr, &process,
+        &t_args[tnum]);
+    if (rc != 0) {
+      cout<<" error in pthread_create \n";
+      goto out;
+    }
+
+    cout<<"Created thread (thread-id:"<<tnum<<")\n";
+  }
+
+  /* Destroy the thread attributes object, since it is no
+     longer needed */
+
+  rc = pthread_attr_destroy(&attr);
+  if (rc != 0) {
+    cout<<"error in pthread_attr_destroy \n";
+  }
+
+  /* Now join with each thread, and display its returned value */
+
+  for (tnum = 0; tnum < num_thr; tnum++) {
+    rc = pthread_join(threads[tnum], &res);
+    if (rc != 0) {
+      cout<<"error in pthread_join \n";
+    } else {
+      cout<<"Joined with thread "<<tnum<<"\n";
+    }
+  }
+
+out:
+  dbsm->destroyAllHandles();
+
+  return 0;
+}
diff --git a/src/rgw/driver/dbstore/dbstore_mgr.cc b/src/rgw/driver/dbstore/dbstore_mgr.cc
new file mode 100644 (file)
index 0000000..6835f52
--- /dev/null
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "dbstore_mgr.h"
+#include "common/dbstore_log.h"
+
+#include <filesystem>
+
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+using namespace std;
+
+
+/* Given a tenant, find and return the DBStore handle.
+ * If not found and 'create' set to true, create one
+ * and return
+ */
+DB *DBStoreManager::getDB (string tenant, bool create)
+{
+  map<string, DB*>::iterator iter;
+  DB *dbs = nullptr;
+  pair<map<string, DB*>::iterator,bool> ret;
+
+  if (tenant.empty())
+    return default_db;
+
+  if (DBStoreHandles.empty())
+    goto not_found;
+
+  iter = DBStoreHandles.find(tenant);
+
+  if (iter != DBStoreHandles.end())
+    return iter->second;
+
+not_found:
+  if (!create)
+    return nullptr;
+
+  dbs = createDB(tenant);
+
+  return dbs;
+}
+
+/* Create DBStore instance */
+DB *DBStoreManager::createDB(std::string tenant) {
+  DB *dbs = nullptr;
+  pair<map<string, DB*>::iterator,bool> ret;
+  const auto& db_path = g_conf().get_val<std::string>("dbstore_db_dir");
+  const auto& db_name = g_conf().get_val<std::string>("dbstore_db_name_prefix") + "-" + tenant;
+
+  auto db_full_path = std::filesystem::path(db_path) / db_name;
+  ldout(cct, 0) << "DB initialization full db_path("<<db_full_path<<")" << dendl;
+
+  /* Create the handle */
+#ifdef SQLITE_ENABLED
+  dbs = new SQLiteDB(db_full_path.string(), cct);
+#else
+  dbs = new DB(db_full_path.string(), cct);
+#endif
+
+  /* API is DB::Initialize(string logfile, int loglevel);
+   * If none provided, by default write in to dbstore.log file
+   * created in current working directory with loglevel L_EVENT.
+   * XXX: need to align these logs to ceph location
+   */
+  if (dbs->Initialize("", -1) < 0) {
+    ldout(cct, 0) << "DB initialization failed for tenant("<<tenant<<")" << dendl;
+
+    delete dbs;
+    return nullptr;
+  }
+
+  /* XXX: Do we need lock to protect this map?
+  */
+  ret = DBStoreHandles.insert(pair<string, DB*>(tenant, dbs));
+
+  /*
+   * Its safe to check for already existing entry (just
+   * incase other thread raced and created the entry)
+   */
+  if (ret.second == false) {
+    /* Entry already created by another thread */
+    delete dbs;
+
+    dbs = ret.first->second;
+  }
+
+  return dbs;
+}
+
+void DBStoreManager::deleteDB(string tenant) {
+  map<string, DB*>::iterator iter;
+  DB *dbs = nullptr;
+
+  if (tenant.empty() || DBStoreHandles.empty())
+    return;
+
+  /* XXX: Check if we need to perform this operation under a lock */
+  iter = DBStoreHandles.find(tenant);
+
+  if (iter == DBStoreHandles.end())
+    return;
+
+  dbs = iter->second;
+
+  DBStoreHandles.erase(iter);
+  dbs->Destroy(dbs->get_def_dpp());
+  delete dbs;
+
+  return;
+}
+
+void DBStoreManager::deleteDB(DB *dbs) {
+  if (!dbs)
+    return;
+
+  (void)deleteDB(dbs->getDBname());
+}
+
+
+void DBStoreManager::destroyAllHandles(){
+  map<string, DB*>::iterator iter;
+  DB *dbs = nullptr;
+
+  if (DBStoreHandles.empty())
+    return;
+
+  for (iter = DBStoreHandles.begin(); iter != DBStoreHandles.end();
+      ++iter) {
+    dbs = iter->second;
+    dbs->Destroy(dbs->get_def_dpp());
+    delete dbs;
+  }
+
+  DBStoreHandles.clear();
+
+  return;
+}
+
+
diff --git a/src/rgw/driver/dbstore/dbstore_mgr.h b/src/rgw/driver/dbstore/dbstore_mgr.h
new file mode 100644 (file)
index 0000000..77fc3aa
--- /dev/null
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <cerrno>
+#include <cstdlib>
+#include <string>
+#include <cstdio>
+#include <iostream>
+#include <vector>
+
+#include "common/ceph_context.h"
+#include "common/dbstore.h"
+#include "sqlite/sqliteDB.h"
+
+using namespace rgw::store;
+using DB = rgw::store::DB;
+
+/* XXX: Should be a dbstore config option */
+const static std::string default_tenant = "default_ns";
+
/* Owns one DB handle per tenant, plus a default handle (for
 * default_tenant) created at construction time. */
class DBStoreManager {
private:
  std::map<std::string, DB*> DBStoreHandles;  // tenant name -> open DB handle
  DB *default_db = nullptr;  // handle returned for an empty/default tenant
  CephContext *cct;

public:
  // construct with the caller's context; immediately creates the
  // default-tenant DB handle
  DBStoreManager(CephContext *_cct): DBStoreHandles() {
    cct = _cct;
       default_db = createDB(default_tenant);
  };
  /* Redirect the provided context's log to the given logfile/loglevel
   * before creating the default DB handle */
  DBStoreManager(CephContext *_cct, std::string logfile, int loglevel): DBStoreHandles() {
    cct = _cct;
    cct->_log->set_log_file(logfile);
    cct->_log->reopen_log_file();
    cct->_conf->subsys.set_log_level(ceph_subsys_rgw, loglevel);
    default_db = createDB(default_tenant);
  };
  ~DBStoreManager() { destroyAllHandles(); };

  /* XXX: TBD based on testing
   * 1)  Lock to protect DBStoreHandles map.
   * 2) Refcount of each DBStore to protect from
   * being deleted while using it.
   */
  DB* getDB () { return default_db; };           // default-tenant handle
  DB* getDB (std::string tenant, bool create);   // lookup, optionally creating
  DB* createDB (std::string tenant);             // create + register a handle
  void deleteDB (std::string tenant);            // destroy + unregister
  void deleteDB (DB* db);                        // same, resolved by handle
  void destroyAllHandles();                      // tear down every handle
};
diff --git a/src/rgw/driver/dbstore/sqlite/CMakeLists.txt b/src/rgw/driver/dbstore/sqlite/CMakeLists.txt
new file mode 100644 (file)
index 0000000..909765e
--- /dev/null
@@ -0,0 +1,16 @@
cmake_minimum_required(VERSION 3.14.0)
project(sqlite_db)

find_package(SQLite3 REQUIRED)

set(sqlite_db_srcs
    sqliteDB.h
    sqliteDB.cc)

include_directories(${CMAKE_INCLUDE_DIR})

# NOTE(review): SQLITE_THREADSAFE is a compile-time option for building
# sqlite itself; defining it here only affects this library's own
# translation units, not the prebuilt system sqlite3 linked below —
# confirm whether this flag is still needed
set(SQLITE_COMPILE_FLAGS "-DSQLITE_THREADSAFE=1")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SQLITE_COMPILE_FLAGS}")

add_library(sqlite_db STATIC ${sqlite_db_srcs})
target_link_libraries(sqlite_db sqlite3 dbstore_lib rgw_common)
diff --git a/src/rgw/driver/dbstore/sqlite/connection.cc b/src/rgw/driver/dbstore/sqlite/connection.cc
new file mode 100644 (file)
index 0000000..143a3a0
--- /dev/null
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "connection.h"
+#include "error.h"
+
+namespace rgw::dbstore::sqlite {
+
+db_ptr open_database(const char* filename, int flags)
+{
+  sqlite3* db = nullptr;
+  const int result = ::sqlite3_open_v2(filename, &db, flags, nullptr);
+  if (result != SQLITE_OK) {
+    throw std::system_error(result, sqlite::error_category());
+  }
+  // request extended result codes
+  (void) ::sqlite3_extended_result_codes(db, 1);
+  return db_ptr{db};
+}
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/sqlite/connection.h b/src/rgw/driver/dbstore/sqlite/connection.h
new file mode 100644 (file)
index 0000000..f5cd77d
--- /dev/null
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include <sqlite3.h>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include <fmt/format.h>
+
+#include "sqlite/statement.h"
+
+class DoutPrefixProvider;
+
+namespace rgw::dbstore::sqlite {
+
+// owning sqlite3 pointer
+struct db_deleter {
+  void operator()(sqlite3* p) const { ::sqlite3_close(p); }
+};
+using db_ptr = std::unique_ptr<sqlite3, db_deleter>;
+
+
+// open the database file or throw on error
+db_ptr open_database(const char* filename, int flags);
+
+
// a single open sqlite connection plus its cache of prepared statements
struct Connection {
  db_ptr db;
  // map of statements, prepared on first use; keys are expected to be
  // string literals with static lifetime (string_view does not own)
  std::map<std::string_view, stmt_ptr> statements;

  explicit Connection(db_ptr db) : db(std::move(db)) {}
};

// sqlite connection factory for ConnectionPool
class ConnectionFactory {
  std::string uri;   // filename/URI passed to sqlite3_open_v2()
  int flags;         // sqlite3_open_v2() open flags
 public:
  ConnectionFactory(std::string uri, int flags)
      : uri(std::move(uri)), flags(flags) {}

  // open a new connection; throws std::system_error on failure.
  // dpp is unused here but part of the factory call signature.
  auto operator()(const DoutPrefixProvider* dpp)
    -> std::unique_ptr<Connection>
  {
    auto db = open_database(uri.c_str(), flags);
    return std::make_unique<Connection>(std::move(db));
  }
};
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/sqlite/error.cc b/src/rgw/driver/dbstore/sqlite/error.cc
new file mode 100644 (file)
index 0000000..5fe9eb0
--- /dev/null
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "error.h"
+
+namespace rgw::dbstore::sqlite {
+
// singleton std::error_category for sqlite result codes
const std::error_category& error_category()
{
  struct category : std::error_category {
    const char* name() const noexcept override {
      return "dbstore:sqlite";
    }
    std::string message(int ev) const override {
      // sqlite3_errstr() returns a static English description
      return ::sqlite3_errstr(ev);
    }
    std::error_condition default_error_condition(int code) const noexcept override {
      // extended result codes carry their primary code in the low 8
      // bits, so conditions built from primary codes match both
      return {code & 0xFF, category()};
    }
  };
  static category instance;
  return instance;
}
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/sqlite/error.h b/src/rgw/driver/dbstore/sqlite/error.h
new file mode 100644 (file)
index 0000000..15396d8
--- /dev/null
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <system_error>
+#include <sqlite3.h>
+
+namespace rgw::dbstore::sqlite {
+
+// error category for sqlite extended result codes:
+//   https://www.sqlite.org/rescode.html
+const std::error_category& error_category();
+
+
// sqlite exception type that carries the extended error code and message
class error : public std::runtime_error {
  std::error_code ec;
 public:
  // explicit message and error code
  error(const char* errmsg, std::error_code ec)
      : runtime_error(errmsg), ec(ec) {}
  // take the message from the connection's last error
  error(sqlite3* db, std::error_code ec) : error(::sqlite3_errmsg(db), ec) {}
  // wrap a raw int result code in this category
  error(sqlite3* db, int result) : error(db, {result, error_category()}) {}
  // take both the extended code and the message from the connection
  error(sqlite3* db) : error(db, ::sqlite3_extended_errcode(db)) {}
  std::error_code code() const { return ec; }
};
+
+
+// sqlite error conditions for primary and extended result codes
+//
+// 'primary' error_conditions will match 'primary' error_codes as well as any
+// 'extended' error_codes whose lowest 8 bits match that primary code. for
+// example, the error_condition for SQLITE_CONSTRAINT will match the error_codes
+// SQLITE_CONSTRAINT and SQLITE_CONSTRAINT_*
+enum class errc {
+  // primary result codes
+  ok = SQLITE_OK,
+  busy = SQLITE_BUSY,
+  constraint = SQLITE_CONSTRAINT,
+  row = SQLITE_ROW,
+  done = SQLITE_DONE,
+
+  // extended result codes
+  primary_key_constraint = SQLITE_CONSTRAINT_PRIMARYKEY,
+  foreign_key_constraint = SQLITE_CONSTRAINT_FOREIGNKEY,
+  unique_constraint = SQLITE_CONSTRAINT_UNIQUE,
+
+  // ..add conditions as needed
+};
+
+inline std::error_code make_error_code(errc e)
+{
+  return {static_cast<int>(e), error_category()};
+}
+
+inline std::error_condition make_error_condition(errc e)
+{
+  return {static_cast<int>(e), error_category()};
+}
+
+} // namespace rgw::dbstore::sqlite
+
+namespace std {
+
+// enable implicit conversions from sqlite::errc to std::error_condition
+template<> struct is_error_condition_enum<
+    rgw::dbstore::sqlite::errc> : public true_type {};
+
+} // namespace std
diff --git a/src/rgw/driver/dbstore/sqlite/sqliteDB.cc b/src/rgw/driver/dbstore/sqlite/sqliteDB.cc
new file mode 100644 (file)
index 0000000..b0ced45
--- /dev/null
@@ -0,0 +1,3001 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "sqliteDB.h"
+
+using namespace std;
+
+// Generate this op's SQL via Schema(params) and compile it into 'stmt' with
+// sqlite3_prepare_v2(). Sets 'ret' to 0 on success, or logs, sets ret = -1
+// and jumps to the caller's 'out' label on failure. Relies on 'out' existing
+// in the expansion context.
+#define SQL_PREPARE(dpp, params, sdb, stmt, ret, Op)   \
+  do {                                                 \
+    string schema;                                     \
+    schema = Schema(params);                   \
+    sqlite3_prepare_v2 (*sdb, schema.c_str(),  \
+        -1, &stmt , NULL);             \
+    if (!stmt) {                                       \
+      ldpp_dout(dpp, 0) <<"failed to prepare statement " \
+      <<"for Op("<<Op<<"); Errmsg -"\
+      <<sqlite3_errmsg(*sdb)<< dendl;\
+      ret = -1;                                \
+      goto out;                                \
+    }                                          \
+    ldpp_dout(dpp, 20)<<"Successfully Prepared stmt for Op("<<Op       \
+    <<") schema("<<schema<<") stmt("<<stmt<<")"<< dendl;       \
+    ret = 0;                                   \
+  } while(0);
+
+// Look up the 1-based bind-parameter index of named placeholder 'str' in
+// 'stmt'. Sets 'rc' = -1 and jumps to the caller's 'out' label if the name
+// is not found (sqlite returns 0 for unknown parameters).
+#define SQL_BIND_INDEX(dpp, stmt, index, str, sdb)     \
+  do {                                         \
+    index = sqlite3_bind_parameter_index(stmt, str);     \
+    \
+    if (index <=0)  {                               \
+      ldpp_dout(dpp, 0) <<"failed to fetch bind parameter"\
+      " index for str("<<str<<") in "   \
+      <<"stmt("<<stmt<<"); Errmsg -"    \
+      <<sqlite3_errmsg(*sdb)<< dendl;       \
+      rc = -1;                              \
+      goto out;                                     \
+    }                                               \
+    ldpp_dout(dpp, 20)<<"Bind parameter index for str("  \
+    <<str<<") in stmt("<<stmt<<") is "  \
+    <<index<< dendl;                        \
+  }while(0);
+
+// Bind a text value at 'index'. SQLITE_TRANSIENT makes sqlite copy the
+// string, so the caller's buffer need not outlive the statement.
+// On error sets 'rc' = -1 and jumps to the caller's 'out' label.
+#define SQL_BIND_TEXT(dpp, stmt, index, str, sdb)                      \
+  do {                                                         \
+    rc = sqlite3_bind_text(stmt, index, str, -1, SQLITE_TRANSIENT);    \
+    if (rc != SQLITE_OK) {                                             \
+      ldpp_dout(dpp, 0)<<"sqlite bind text failed for index("          \
+      <<index<<"), str("<<str<<") in stmt("    \
+      <<stmt<<"); Errmsg - "<<sqlite3_errmsg(*sdb) \
+      << dendl;                                \
+      rc = -1;                                 \
+      goto out;                                        \
+    }                                                  \
+    ldpp_dout(dpp, 20)<<"Bind parameter text for index("  \
+    <<index<<") in stmt("<<stmt<<") is "  \
+    <<str<< dendl;                          \
+  }while(0);
+
+// Bind an int value at 'index'; same error convention as SQL_BIND_TEXT.
+#define SQL_BIND_INT(dpp, stmt, index, num, sdb)                       \
+  do {                                                         \
+    rc = sqlite3_bind_int(stmt, index, num);           \
+    \
+    if (rc != SQLITE_OK) {                                     \
+      ldpp_dout(dpp, 0)<<"sqlite bind int failed for index("           \
+      <<index<<"), num("<<num<<") in stmt("    \
+      <<stmt<<"); Errmsg - "<<sqlite3_errmsg(*sdb) \
+      << dendl;                                \
+      rc = -1;                                 \
+      goto out;                                        \
+    }                                                  \
+    ldpp_dout(dpp, 20)<<"Bind parameter int for index("  \
+    <<index<<") in stmt("<<stmt<<") is "  \
+    <<num<< dendl;                          \
+  }while(0);
+
+// Bind a blob of 'size' bytes at 'index' (copied via SQLITE_TRANSIENT);
+// same error convention as SQL_BIND_TEXT.
+#define SQL_BIND_BLOB(dpp, stmt, index, blob, size, sdb)               \
+  do {                                                         \
+    rc = sqlite3_bind_blob(stmt, index, blob, size, SQLITE_TRANSIENT);  \
+    \
+    if (rc != SQLITE_OK) {                                     \
+      ldpp_dout(dpp, 0)<<"sqlite bind blob failed for index("          \
+      <<index<<"), blob("<<blob<<") in stmt("          \
+      <<stmt<<"); Errmsg - "<<sqlite3_errmsg(*sdb) \
+      << dendl;                                \
+      rc = -1;                                 \
+      goto out;                                        \
+    }                                                  \
+  }while(0);
+
+// ceph-encode 'param' into a bufferlist and bind it as a blob at 'index'.
+#define SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, param, sdb)            \
+  do {                                                         \
+    bufferlist b;                                              \
+    encode(param, b);                                  \
+    SQL_BIND_BLOB(dpp, stmt, index, b.c_str(), b.length(), sdb); \
+  }while(0);
+
+// Read the blob column at 'index' into (void_ptr, len). The pointer refers
+// to sqlite-owned memory valid only until the next step/reset; a NULL or
+// empty column is logged at debug level but not treated as an error.
+#define SQL_READ_BLOB(dpp, stmt, index, void_ptr, len)         \
+  do {                                                         \
+    void_ptr = NULL;                                   \
+    void_ptr = (void *)sqlite3_column_blob(stmt, index);       \
+    len = sqlite3_column_bytes(stmt, index);           \
+    \
+    if (!void_ptr || len == 0) {                               \
+      ldpp_dout(dpp, 20)<<"Null value for blob index("  \
+      <<index<<") in stmt("<<stmt<<") "<< dendl;   \
+    }                                                  \
+  }while(0);
+
+// Read the blob column at 'index' and ceph-decode it into 'param'.
+// NOTE(review): decode() of an empty bufferlist (NULL column) presumably
+// throws/asserts — confirm callers never hit that for required columns.
+#define SQL_DECODE_BLOB_PARAM(dpp, stmt, index, param, sdb)            \
+  do {                                                         \
+    bufferlist b;                                              \
+    void *blob;                                                \
+    int blob_len = 0;                                  \
+    \
+    SQL_READ_BLOB(dpp, stmt, index, blob, blob_len);           \
+    \
+    b.append(reinterpret_cast<char *>(blob), blob_len);        \
+    \
+    decode(param, b);                                  \
+  }while(0);
+
+// Full statement lifecycle for a DBOp: lazily Prepare() on first use, Bind()
+// the op's parameters, Step() through the rows with callback 'cbk', then
+// Reset() for reuse. Serialized under the DBOp mutex. Relies on 'ret',
+// 'params' and an 'out' label in the expansion context.
+#define SQL_EXECUTE(dpp, params, stmt, cbk, args...) \
+  do{                                          \
+    const std::lock_guard<std::mutex> lk(((DBOp*)(this))->mtx); \
+    if (!stmt) {                               \
+      ret = Prepare(dpp, params);              \
+    }                                  \
+    \
+    if (!stmt) {                               \
+      ldpp_dout(dpp, 0) <<"No prepared statement "<< dendl;    \
+      goto out;                        \
+    }                                  \
+    \
+    ret = Bind(dpp, params);                   \
+    if (ret) {                         \
+      ldpp_dout(dpp, 0) <<"Bind parameters failed for stmt(" <<stmt<<") "<< dendl;             \
+      goto out;                        \
+    }                                  \
+    \
+    ret = Step(dpp, params->op, stmt, cbk);            \
+    \
+    Reset(dpp, stmt);                          \
+    \
+    if (ret) {                         \
+      ldpp_dout(dpp, 0) <<"Execution failed for stmt(" <<stmt<<")"<< dendl;            \
+      goto out;                        \
+    }                                  \
+  }while(0);
+
+int SQLiteDB::InitPrepareParams(const DoutPrefixProvider *dpp,
+                                DBOpPrepareParams &p_params,
+                                DBOpParams* params)
+{
+  std::string bucket;
+
+  if (!params)
+    return -1;
+
+  if (params->user_table.empty()) {
+    params->user_table = getUserTable();
+  }
+  if (params->user_table.empty()) {
+    params->user_table = getUserTable();
+  }
+  if (params->bucket_table.empty()) {
+    params->bucket_table = getBucketTable();
+  }
+  if (params->quota_table.empty()) {
+    params->quota_table = getQuotaTable();
+  }
+  if (params->lc_entry_table.empty()) {
+    params->lc_entry_table = getLCEntryTable();
+  }
+  if (params->lc_head_table.empty()) {
+    params->lc_head_table = getLCHeadTable();
+  }
+
+  p_params.user_table = params->user_table;
+  p_params.bucket_table = params->bucket_table;
+  p_params.quota_table = params->quota_table;
+  p_params.lc_entry_table = params->lc_entry_table;
+  p_params.lc_head_table = params->lc_head_table;
+
+  p_params.op.query_str = params->op.query_str;
+
+  bucket = params->op.bucket.info.bucket.name;
+
+  if (!bucket.empty()) {
+    if (params->object_table.empty()) {
+      params->object_table = getObjectTable(bucket);
+    }
+    if (params->objectdata_table.empty()) {
+      params->objectdata_table = getObjectDataTable(bucket);
+    }
+    if (params->object_view.empty()) {
+      params->object_view = getObjectView(bucket);
+    }
+    if (params->object_trigger.empty()) {
+      params->object_trigger = getObjectTrigger(bucket);
+    }
+    p_params.object_table = params->object_table;
+    p_params.objectdata_table = params->objectdata_table;
+    p_params.object_view = params->object_view;
+  }
+
+  return 0;
+}
+
+// sqlite3_exec() row callback used by the ListAll*() debug helpers: prints
+// each column of the row as "name = value" (NULL columns print as "NULL").
+// Always returns 0 so iteration continues over every row.
+static int list_callback(void *None, int argc, char **argv, char **aname)
+{
+  int i;
+  for(i=0; i < argc; i++) {
+    string arg = argv[i] ? argv[i] : "NULL";
+    cout<<aname[i]<<" = "<<arg<<"\n";
+  }
+  return 0;
+}
+
+// Column indexes of the rows returned by the prepared Get/List statements
+// below. NOTE(review): each enum is assumed to mirror the column order of
+// the corresponding table/view schema — confirm against the schema strings
+// in dbstore before reordering anything.
+
+// column order of user rows
+enum GetUser {
+  UserID = 0,
+  Tenant,
+  NS,
+  DisplayName,
+  UserEmail,
+  AccessKeysID,
+  AccessKeysSecret,
+  AccessKeys,
+  SwiftKeys,
+  SubUsers,
+  Suspended,
+  MaxBuckets,
+  OpMask,
+  UserCaps,
+  Admin,
+  System,
+  PlacementName,
+  PlacementStorageClass,
+  PlacementTags,
+  BucketQuota,
+  TempURLKeys,
+  UserQuota,
+  TYPE,
+  MfaIDs,
+  AssumedRoleARN,
+  UserAttrs,
+  UserVersion,
+  UserVersionTag,
+};
+
+// column order of bucket rows (Bucket_ prefixes avoid clashes with GetUser)
+enum GetBucket {
+  BucketName = 0,
+  Bucket_Tenant, //Tenant
+  Marker,
+  BucketID,
+  Size,
+  SizeRounded,
+  CreationTime,
+  Count,
+  Bucket_PlacementName,
+  Bucket_PlacementStorageClass,
+  OwnerID,
+  Flags,
+  Zonegroup,
+  HasInstanceObj,
+  Quota,
+  RequesterPays,
+  HasWebsite,
+  WebsiteConf,
+  SwiftVersioning,
+  SwiftVerLocation,
+  MdsearchConfig,
+  NewBucketInstanceID,
+  ObjectLock,
+  SyncPolicyInfoGroups,
+  BucketAttrs,
+  BucketVersion,
+  BucketVersionTag,
+  Mtime,
+  Bucket_User_NS
+};
+
+// column order of object (head) rows
+enum GetObject {
+  ObjName,
+  ObjInstance,
+  ObjNS,
+  ObjBucketName,
+  ACLs,
+  IndexVer,
+  Tag,
+  ObjFlags,
+  VersionedEpoch,
+  ObjCategory,
+  Etag,
+  Owner,
+  OwnerDisplayName,
+  StorageClass,
+  Appendable,
+  ContentType,
+  IndexHashSource,
+  ObjSize,
+  AccountedSize,
+  ObjMtime,
+  Epoch,
+  ObjTag,
+  TailTag,
+  WriteTag,
+  FakeTag,
+  ShadowObj,
+  HasData,
+  IsVersioned,
+  VersionNum,
+  PGVer,
+  ZoneShortID,
+  ObjVersion,
+  ObjVersionTag,
+  ObjAttrs,
+  HeadSize,
+  MaxHeadSize,
+  ObjID,
+  TailInstance,
+  HeadPlacementRuleName,
+  HeadPlacementRuleStorageClass,
+  TailPlacementRuleName,
+  TailPlacementStorageClass,
+  ManifestPartObjs,
+  ManifestPartRules,
+  Omap,
+  IsMultipart,
+  MPPartsList,
+  HeadData,
+  Versions
+};
+
+// column order of objectdata (tail/part data) rows; the first four columns
+// share values with GetObject's ObjName/ObjInstance/ObjNS/ObjBucketName
+enum GetObjectData {
+  ObjDataName,
+  ObjDataInstance,
+  ObjDataNS,
+  ObjDataBucketName,
+  ObjDataID,
+  MultipartPartStr,
+  PartNum,
+  Offset,
+  ObjDataSize,
+  ObjDataMtime,
+  ObjData
+};
+
+// column order of lifecycle entry rows
+enum GetLCEntry {
+  LCEntryIndex,
+  LCEntryBucketName,
+  LCEntryStartTime,
+  LCEntryStatus
+};
+
+// column order of lifecycle head rows
+enum GetLCHead {
+  LCHeadIndex,
+  LCHeadMarker,
+  LCHeadStartDate
+};
+
+// Row callback for user queries: decodes one user row into op.user.
+// Plain columns are read directly; composite fields (keys, caps, quotas,
+// attrs, ...) are ceph-decoded from blob columns.
+// NOTE(review): sqlite3_column_text() returns NULL for NULL columns —
+// assigning that to std::string would be UB; presumably the schema declares
+// these columns NOT NULL/defaulted — confirm.
+static int list_user(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
+  if (!stmt)
+    return -1;
+
+  op.user.uinfo.user_id.tenant = (const char*)sqlite3_column_text(stmt, Tenant);
+  op.user.uinfo.user_id.id = (const char*)sqlite3_column_text(stmt, UserID);
+  op.user.uinfo.user_id.ns = (const char*)sqlite3_column_text(stmt, NS);
+  op.user.uinfo.display_name = (const char*)sqlite3_column_text(stmt, DisplayName); // user_name
+  op.user.uinfo.user_email = (const char*)sqlite3_column_text(stmt, UserEmail);
+
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, SwiftKeys, op.user.uinfo.swift_keys, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, SubUsers, op.user.uinfo.subusers, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, AccessKeys, op.user.uinfo.access_keys, sdb);
+
+  op.user.uinfo.suspended = sqlite3_column_int(stmt, Suspended);
+  op.user.uinfo.max_buckets = sqlite3_column_int(stmt, MaxBuckets);
+  op.user.uinfo.op_mask = sqlite3_column_int(stmt, OpMask);
+
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, UserCaps, op.user.uinfo.caps, sdb);
+
+  op.user.uinfo.admin = sqlite3_column_int(stmt, Admin);
+  op.user.uinfo.system = sqlite3_column_int(stmt, System);
+
+  op.user.uinfo.default_placement.name = (const char*)sqlite3_column_text(stmt, PlacementName);
+
+  op.user.uinfo.default_placement.storage_class = (const char*)sqlite3_column_text(stmt, PlacementStorageClass);
+
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, PlacementTags, op.user.uinfo.placement_tags, sdb);
+
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, BucketQuota, op.user.uinfo.quota.bucket_quota, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, TempURLKeys, op.user.uinfo.temp_url_keys, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, UserQuota, op.user.uinfo.quota.user_quota, sdb);
+
+  op.user.uinfo.type = sqlite3_column_int(stmt, TYPE);
+
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, MfaIDs, op.user.uinfo.mfa_ids, sdb);
+
+  op.user.uinfo.assumed_role_arn = (const char*)sqlite3_column_text(stmt, AssumedRoleARN);
+
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, UserAttrs, op.user.user_attrs, sdb);
+  op.user.user_version.ver = sqlite3_column_int(stmt, UserVersion);
+  op.user.user_version.tag = (const char*)sqlite3_column_text(stmt, UserVersionTag);
+
+  return 0;
+}
+
+// Row callback for bucket queries: decodes one bucket row into op.bucket
+// (both the list entry 'ent' and the full 'info'), then appends the entry to
+// op.bucket.list_entries so the same callback serves list operations.
+static int list_bucket(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
+  if (!stmt)
+    return -1;
+
+  op.bucket.ent.bucket.name = (const char*)sqlite3_column_text(stmt, BucketName);
+  op.bucket.ent.bucket.tenant = (const char*)sqlite3_column_text(stmt, Bucket_Tenant);
+  op.bucket.ent.bucket.marker = (const char*)sqlite3_column_text(stmt, Marker);
+  op.bucket.ent.bucket.bucket_id = (const char*)sqlite3_column_text(stmt, BucketID);
+  op.bucket.ent.size = sqlite3_column_int(stmt, Size);
+  op.bucket.ent.size_rounded = sqlite3_column_int(stmt, SizeRounded);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, CreationTime, op.bucket.ent.creation_time, sdb);
+  op.bucket.ent.count = sqlite3_column_int(stmt, Count);
+  op.bucket.ent.placement_rule.name = (const char*)sqlite3_column_text(stmt, Bucket_PlacementName);
+  op.bucket.ent.placement_rule.storage_class = (const char*)sqlite3_column_text(stmt, Bucket_PlacementStorageClass);
+
+  // mirror the shared fields from the list entry into the full bucket info
+  op.bucket.info.bucket = op.bucket.ent.bucket;
+  op.bucket.info.placement_rule = op.bucket.ent.placement_rule;
+  op.bucket.info.creation_time = op.bucket.ent.creation_time;
+
+  op.bucket.info.owner.id = (const char*)sqlite3_column_text(stmt, OwnerID);
+  op.bucket.info.owner.tenant = op.bucket.ent.bucket.tenant;
+
+  // only the GetBucket query joins in the owning user's namespace column
+  if (op.name == "GetBucket") {
+    op.bucket.info.owner.ns = (const char*)sqlite3_column_text(stmt, Bucket_User_NS);
+  }
+
+  op.bucket.info.flags = sqlite3_column_int(stmt, Flags);
+  op.bucket.info.zonegroup = (const char*)sqlite3_column_text(stmt, Zonegroup);
+  op.bucket.info.has_instance_obj = sqlite3_column_int(stmt, HasInstanceObj);
+
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, Quota, op.bucket.info.quota, sdb);
+  op.bucket.info.requester_pays = sqlite3_column_int(stmt, RequesterPays);
+  op.bucket.info.has_website = sqlite3_column_int(stmt, HasWebsite);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, WebsiteConf, op.bucket.info.website_conf, sdb);
+  op.bucket.info.swift_versioning = sqlite3_column_int(stmt, SwiftVersioning);
+  op.bucket.info.swift_ver_location = (const char*)sqlite3_column_text(stmt, SwiftVerLocation);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, MdsearchConfig, op.bucket.info.mdsearch_config, sdb);
+  op.bucket.info.new_bucket_instance_id = (const char*)sqlite3_column_text(stmt, NewBucketInstanceID);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjectLock, op.bucket.info.obj_lock, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, SyncPolicyInfoGroups, op.bucket.info.sync_policy, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, BucketAttrs, op.bucket.bucket_attrs, sdb);
+  op.bucket.bucket_version.ver = sqlite3_column_int(stmt, BucketVersion);
+  op.bucket.bucket_version.tag = (const char*)sqlite3_column_text(stmt, BucketVersionTag);
+
+  /* Read bucket version into info.objv_tracker.read_ver. No need
+   * to set write_ver as its not used anywhere. Still keeping its
+   * value same as read_ver */
+  op.bucket.info.objv_tracker.read_version = op.bucket.bucket_version;
+  op.bucket.info.objv_tracker.write_version = op.bucket.bucket_version;
+
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, Mtime, op.bucket.mtime, sdb);
+
+  op.bucket.list_entries.push_back(op.bucket.ent);
+
+  return 0;
+}
+
+// Row callback for object (head) queries: decodes one object row into
+// op.obj (key, state, manifest, attrs, multipart info), then builds a
+// bucket-index dir entry from the same fields and appends it to
+// op.obj.list_entries so the callback also serves object listings.
+static int list_object(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
+  if (!stmt)
+    return -1;
+
+  //cout<<sqlite3_column_text(stmt, 0)<<", ";
+  //cout<<sqlite3_column_text(stmt, 1) << "\n";
+
+  op.obj.state.exists = true;
+  op.obj.state.obj.key.name = (const char*)sqlite3_column_text(stmt, ObjName);
+  op.bucket.info.bucket.name = (const char*)sqlite3_column_text(stmt, ObjBucketName);
+  op.obj.state.obj.key.instance = (const char*)sqlite3_column_text(stmt, ObjInstance);
+  op.obj.state.obj.key.ns = (const char*)sqlite3_column_text(stmt, ObjNS);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ACLs, op.obj.acls, sdb);
+  op.obj.index_ver = sqlite3_column_int(stmt, IndexVer);
+  op.obj.tag = (const char*)sqlite3_column_text(stmt, Tag);
+  op.obj.flags = sqlite3_column_int(stmt, ObjFlags); 
+  op.obj.versioned_epoch = sqlite3_column_int(stmt, VersionedEpoch);
+  op.obj.category = (RGWObjCategory)sqlite3_column_int(stmt, ObjCategory); 
+  op.obj.etag = (const char*)sqlite3_column_text(stmt, Etag);
+  op.obj.owner = (const char*)sqlite3_column_text(stmt, Owner);
+  op.obj.owner_display_name = (const char*)sqlite3_column_text(stmt, OwnerDisplayName);
+  op.obj.storage_class = (const char*)sqlite3_column_text(stmt, StorageClass);
+  op.obj.appendable = sqlite3_column_int(stmt, Appendable); 
+  op.obj.content_type = (const char*)sqlite3_column_text(stmt, ContentType);
+  op.obj.state.obj.index_hash_source = (const char*)sqlite3_column_text(stmt, IndexHashSource);
+  op.obj.state.size = sqlite3_column_int(stmt, ObjSize); 
+  op.obj.state.accounted_size = sqlite3_column_int(stmt, AccountedSize); 
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjMtime, op.obj.state.mtime, sdb);
+  op.obj.state.epoch = sqlite3_column_int(stmt, Epoch);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjTag, op.obj.state.obj_tag, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, TailTag, op.obj.state.tail_tag, sdb);
+  op.obj.state.write_tag = (const char*)sqlite3_column_text(stmt, WriteTag);
+  op.obj.state.fake_tag = sqlite3_column_int(stmt, FakeTag);
+  op.obj.state.shadow_obj = (const char*)sqlite3_column_text(stmt, ShadowObj);
+  op.obj.state.has_data = sqlite3_column_int(stmt, HasData); 
+  op.obj.is_versioned = sqlite3_column_int(stmt, IsVersioned); 
+  op.obj.version_num = sqlite3_column_int(stmt, VersionNum); 
+  op.obj.state.pg_ver = sqlite3_column_int(stmt, PGVer); 
+  op.obj.state.zone_short_id = sqlite3_column_int(stmt, ZoneShortID); 
+  op.obj.state.objv_tracker.read_version.ver = sqlite3_column_int(stmt, ObjVersion); 
+  op.obj.state.objv_tracker.read_version.tag = (const char*)sqlite3_column_text(stmt, ObjVersionTag);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjAttrs, op.obj.state.attrset, sdb);
+  op.obj.head_size = sqlite3_column_int(stmt, HeadSize); 
+  op.obj.max_head_size = sqlite3_column_int(stmt, MaxHeadSize); 
+  op.obj.obj_id = (const char*)sqlite3_column_text(stmt, ObjID);
+  op.obj.tail_instance = (const char*)sqlite3_column_text(stmt, TailInstance);
+  op.obj.head_placement_rule.name = (const char*)sqlite3_column_text(stmt, HeadPlacementRuleName);
+  op.obj.head_placement_rule.storage_class = (const char*)sqlite3_column_text(stmt, HeadPlacementRuleStorageClass);
+  op.obj.tail_placement.placement_rule.name = (const char*)sqlite3_column_text(stmt, TailPlacementRuleName);
+  op.obj.tail_placement.placement_rule.storage_class = (const char*)sqlite3_column_text(stmt, TailPlacementStorageClass);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ManifestPartObjs, op.obj.objs, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ManifestPartRules, op.obj.rules, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, Omap, op.obj.omap, sdb);
+  op.obj.is_multipart = sqlite3_column_int(stmt, IsMultipart);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, MPPartsList, op.obj.mp_parts, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, HeadData, op.obj.head_data, sdb);
+  op.obj.state.data = op.obj.head_data;
+
+  // mirror the row into a dir entry for listing results
+  rgw_bucket_dir_entry dent;
+  dent.key.name = op.obj.state.obj.key.name;
+  dent.key.instance = op.obj.state.obj.key.instance;
+  dent.tag = op.obj.tag;
+  dent.flags = op.obj.flags;
+  dent.versioned_epoch = op.obj.versioned_epoch;
+  dent.index_ver = op.obj.index_ver;
+  dent.exists = true;
+  dent.meta.category = op.obj.category;
+  dent.meta.size = op.obj.state.size;
+  dent.meta.accounted_size = op.obj.state.accounted_size;
+  dent.meta.mtime = op.obj.state.mtime;
+  dent.meta.etag = op.obj.etag;
+  dent.meta.owner = op.obj.owner;
+  dent.meta.owner_display_name = op.obj.owner_display_name;
+  dent.meta.content_type = op.obj.content_type;
+  dent.meta.storage_class = op.obj.storage_class;
+  dent.meta.appendable = op.obj.appendable;
+
+  op.obj.list_entries.push_back(dent);
+  return 0;
+}
+
+static int get_objectdata(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
+  if (!stmt)
+    return -1;
+
+  op.obj.state.obj.key.name = (const char*)sqlite3_column_text(stmt, ObjName);
+  op.bucket.info.bucket.name = (const char*)sqlite3_column_text(stmt, ObjBucketName);
+  op.obj.state.obj.key.instance = (const char*)sqlite3_column_text(stmt, ObjInstance);
+  op.obj.state.obj.key.ns = (const char*)sqlite3_column_text(stmt, ObjNS);
+  op.obj.obj_id = (const char*)sqlite3_column_text(stmt, ObjDataID);
+  op.obj_data.part_num = sqlite3_column_int(stmt, PartNum);
+  op.obj_data.offset = sqlite3_column_int(stmt, Offset);
+  op.obj_data.size = sqlite3_column_int(stmt, ObjDataSize);
+  op.obj_data.multipart_part_str = (const char*)sqlite3_column_text(stmt, MultipartPartStr);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjDataMtime, op.obj.state.mtime, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjData, op.obj_data.data, sdb);
+
+  return 0;
+}
+
+// Row callback for lifecycle-entry queries: decodes one LC entry row and
+// appends it to op.lc_entry.list_entries for list operations.
+static int list_lc_entry(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
+  if (!stmt)
+    return -1;
+
+  op.lc_entry.index = (const char*)sqlite3_column_text(stmt, LCEntryIndex);
+  op.lc_entry.entry.set_bucket((const char*)sqlite3_column_text(stmt, LCEntryBucketName));
+  op.lc_entry.entry.set_start_time(sqlite3_column_int(stmt, LCEntryStartTime));
+  op.lc_entry.entry.set_status(sqlite3_column_int(stmt, LCEntryStatus));
+  op.lc_entry.list_entries.push_back(op.lc_entry.entry);
+
+  return 0;
+}
+
+// Row callback for lifecycle-head queries: decodes the LC head row.
+// The start date is stored as a ceph-encoded blob holding an int64.
+static int list_lc_head(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
+  if (!stmt)
+    return -1;
+
+  int64_t start_date;
+
+  op.lc_head.index = (const char*)sqlite3_column_text(stmt, LCHeadIndex);
+  op.lc_head.head.set_marker((const char*)sqlite3_column_text(stmt, LCHeadMarker));
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, LCHeadStartDate, start_date, sdb);
+  op.lc_head.head.get_start_date() = start_date;
+
+  return 0;
+}
+
+// Create the core tables (best-effort; failures are logged inside) and
+// instantiate the prepared-statement op objects that serve all user,
+// bucket and lifecycle operations. Always returns 0.
+int SQLiteDB::InitializeDBOps(const DoutPrefixProvider *dpp)
+{
+  (void)createTables(dpp);  // ops are built even if table creation fails
+  dbops.InsertUser = make_shared<SQLInsertUser>(&this->db, this->getDBname(), cct);
+  dbops.RemoveUser = make_shared<SQLRemoveUser>(&this->db, this->getDBname(), cct);
+  dbops.GetUser = make_shared<SQLGetUser>(&this->db, this->getDBname(), cct);
+  dbops.InsertBucket = make_shared<SQLInsertBucket>(&this->db, this->getDBname(), cct);
+  dbops.UpdateBucket = make_shared<SQLUpdateBucket>(&this->db, this->getDBname(), cct);
+  dbops.RemoveBucket = make_shared<SQLRemoveBucket>(&this->db, this->getDBname(), cct);
+  dbops.GetBucket = make_shared<SQLGetBucket>(&this->db, this->getDBname(), cct);
+  dbops.ListUserBuckets = make_shared<SQLListUserBuckets>(&this->db, this->getDBname(), cct);
+  dbops.InsertLCEntry = make_shared<SQLInsertLCEntry>(&this->db, this->getDBname(), cct);
+  dbops.RemoveLCEntry = make_shared<SQLRemoveLCEntry>(&this->db, this->getDBname(), cct);
+  dbops.GetLCEntry = make_shared<SQLGetLCEntry>(&this->db, this->getDBname(), cct);
+  dbops.ListLCEntries = make_shared<SQLListLCEntries>(&this->db, this->getDBname(), cct);
+  dbops.InsertLCHead = make_shared<SQLInsertLCHead>(&this->db, this->getDBname(), cct);
+  dbops.RemoveLCHead = make_shared<SQLRemoveLCHead>(&this->db, this->getDBname(), cct);
+  dbops.GetLCHead = make_shared<SQLGetLCHead>(&this->db, this->getDBname(), cct);
+
+  return 0;
+}
+
+void *SQLiteDB::openDB(const DoutPrefixProvider *dpp)
+{
+  string dbname;
+  int rc = 0;
+
+  dbname = getDBfile();
+  if (dbname.empty()) {
+    ldpp_dout(dpp, 0)<<"dbname is NULL" << dendl;
+    goto out;
+  }
+
+  rc = sqlite3_open_v2(dbname.c_str(), (sqlite3**)&db,
+      SQLITE_OPEN_READWRITE |
+      SQLITE_OPEN_CREATE |
+      SQLITE_OPEN_FULLMUTEX,
+      NULL);
+
+  if (rc) {
+    ldpp_dout(dpp, 0) <<"Cant open "<<dbname<<"; Errmsg - "\
+      <<sqlite3_errmsg((sqlite3*)db) <<  dendl;
+  } else {
+    ldpp_dout(dpp, 0) <<"Opened database("<<dbname<<") successfully" <<  dendl;
+  }
+
+  exec(dpp, "PRAGMA foreign_keys=ON", NULL);
+
+out:
+  return db;
+}
+
+// Close the underlying sqlite handle, if open, and clear it.
+// Always returns 0 (the sqlite3_close() result is intentionally ignored).
+int SQLiteDB::closeDB(const DoutPrefixProvider *dpp)
+{
+  if (db)
+    sqlite3_close((sqlite3 *)db);
+
+  db = NULL;
+
+  return 0;
+}
+
+// Clear all parameter bindings and reset 'stmt' so it can be re-executed.
+// Returns the sqlite3_reset() result code, or -1 when stmt is null.
+int SQLiteDB::Reset(const DoutPrefixProvider *dpp, sqlite3_stmt *stmt)
+{
+  int ret = -1;
+
+  if (!stmt) {
+    return -1;
+  }
+  sqlite3_clear_bindings(stmt);
+  ret = sqlite3_reset(stmt);
+
+  return ret;
+}
+
+// Drive 'stmt' to completion: step repeatedly, invoking 'cbk' (if any) once
+// per returned row, until SQLITE_DONE. Returns 0 on completion, -1 on a
+// step error or null stmt.
+// NOTE(review): the callback's return value is ignored — a failing row
+// decode does not abort the iteration; confirm that is intended.
+int SQLiteDB::Step(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt,
+    int (*cbk)(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt))
+{
+  int ret = -1;
+
+  if (!stmt) {
+    return -1;
+  }
+
+again:
+  ret = sqlite3_step(stmt);
+
+  if ((ret != SQLITE_DONE) && (ret != SQLITE_ROW)) {
+    ldpp_dout(dpp, 0)<<"sqlite step failed for stmt("<<stmt \
+      <<"); Errmsg - "<<sqlite3_errmsg((sqlite3*)db) << dendl;
+    return -1;
+  } else if (ret == SQLITE_ROW) {
+    if (cbk) {
+      (*cbk)(dpp, op, stmt);
+    } else {
+    }
+    goto again;
+  }
+
+  ldpp_dout(dpp, 20)<<"sqlite step successfully executed for stmt(" \
+    <<stmt<<")  ret = " << ret << dendl;
+
+  return 0;
+}
+
+// Run one or more SQL statements in 'schema' via sqlite3_exec(), invoking
+// 'callback' per result row when provided. Returns 0 on success, -1 when
+// the db is not open or the statement fails (error message is logged and
+// freed).
+int SQLiteDB::exec(const DoutPrefixProvider *dpp, const char *schema,
+    int (*callback)(void*,int,char**,char**))
+{
+  int ret = -1;
+  char *errmsg = NULL;
+
+  if (!db)
+    goto out;
+
+  ret = sqlite3_exec((sqlite3*)db, schema, callback, 0, &errmsg);
+  if (ret != SQLITE_OK) {
+    ldpp_dout(dpp, 0) <<"sqlite exec failed for schema("<<schema \
+      <<"); Errmsg - "<<errmsg <<  dendl;
+    sqlite3_free(errmsg);
+    goto out;
+  }
+  ret = 0;
+  ldpp_dout(dpp, 10) <<"sqlite exec successfully processed for schema(" \
+    <<schema<<")" <<  dendl;
+out:
+  return ret;
+}
+
+// Create the core (non-per-bucket) tables: User, Bucket and Quota.
+// Returns 0 when all three are created, non-zero otherwise.
+int SQLiteDB::createTables(const DoutPrefixProvider *dpp)
+{
+  int ret = -1;
+  int cu = 0, cb = 0, cq = 0;
+  DBOpParams params = {};
+
+  params.user_table = getUserTable();
+  params.bucket_table = getBucketTable();
+
+  if ((cu = createUserTable(dpp, &params)))
+    goto out;
+
+  if ((cb = createBucketTable(dpp, &params)))
+    goto out;
+
+  if ((cq = createQuotaTable(dpp, &params)))
+    goto out;
+
+  ret = 0;
+out:
+  if (ret) {
+    // NOTE(review): this cleanup deletes the table whose creation *failed*
+    // (cu/cb are non-zero only for the failing call), leaving any tables
+    // created earlier in place — and cq is never consulted. If rollback of
+    // the successful creations is intended, the conditions are inverted;
+    // but blindly dropping tables could also destroy pre-existing data,
+    // so confirm intent before changing this.
+    if (cu)
+      DeleteUserTable(dpp, &params);
+    if (cb)
+      DeleteBucketTable(dpp, &params);
+    ldpp_dout(dpp, 0)<<"Creation of tables failed" << dendl;
+  }
+
+  return ret;
+}
+
+// Create the User table from its generated schema.
+// Returns 0 on success, non-zero on sqlite error.
+int SQLiteDB::createUserTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("User", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret)
+    ldpp_dout(dpp, 0)<<"CreateUserTable failed" << dendl;
+  else // bug fix: the success message used to print even on failure
+    ldpp_dout(dpp, 20)<<"CreateUserTable suceeded" << dendl;
+
+  return ret;
+}
+
+// Create the Bucket table from its generated schema.
+// Returns 0 on success, non-zero on sqlite error.
+int SQLiteDB::createBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("Bucket", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret)
+    ldpp_dout(dpp, 0)<<"CreateBucketTable failed " << dendl;
+  else // bug fix: the success message used to print even on failure
+    ldpp_dout(dpp, 20)<<"CreateBucketTable suceeded " << dendl;
+
+  return ret;
+}
+
+// Create the per-bucket Object table (params carries the resolved name).
+// Returns 0 on success, non-zero on sqlite error.
+int SQLiteDB::createObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("Object", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret)
+    ldpp_dout(dpp, 0)<<"CreateObjectTable failed " << dendl;
+  else // bug fix: the success message used to print even on failure
+    ldpp_dout(dpp, 20)<<"CreateObjectTable suceeded " << dendl;
+
+  return ret;
+}
+
+// Create the per-bucket Object table trigger.
+// Returns 0 on success, non-zero on sqlite error.
+int SQLiteDB::createObjectTableTrigger(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("ObjectTrigger", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret)
+    ldpp_dout(dpp, 0)<<"CreateObjectTableTrigger failed " << dendl;
+  else // bug fix: the success message used to print even on failure
+    ldpp_dout(dpp, 20)<<"CreateObjectTableTrigger suceeded " << dendl;
+
+  return ret;
+}
+
+// Create the per-bucket Object view.
+// Returns 0 on success, non-zero on sqlite error.
+int SQLiteDB::createObjectView(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("ObjectView", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret)
+    ldpp_dout(dpp, 0)<<"CreateObjectView failed " << dendl;
+  else // bug fix: the success message used to print even on failure
+    ldpp_dout(dpp, 20)<<"CreateObjectView suceeded " << dendl;
+
+  return ret;
+}
+
+// Create the Quota table from its generated schema.
+// Returns 0 on success, non-zero on sqlite error.
+int SQLiteDB::createQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("Quota", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret)
+    ldpp_dout(dpp, 0)<<"CreateQuotaTable failed " << dendl;
+  else // bug fix: the success message used to print even on failure
+    ldpp_dout(dpp, 20)<<"CreateQuotaTable suceeded " << dendl;
+
+  return ret;
+}
+
+// Create the per-bucket ObjectData (tail/part data) table.
+// Returns 0 on success, non-zero on sqlite error.
+int SQLiteDB::createObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("ObjectData", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret)
+    ldpp_dout(dpp, 0)<<"CreateObjectDataTable failed " << dendl;
+  else // bug fix: the success message used to print even on failure
+    ldpp_dout(dpp, 20)<<"CreateObjectDataTable suceeded " << dendl;
+
+  return ret;
+}
+
+int SQLiteDB::createLCTables(const DoutPrefixProvider *dpp)
+{
+  int ret = -1;
+  string schema;
+  DBOpParams params = {};
+
+  params.lc_entry_table = getLCEntryTable();
+  params.lc_head_table = getLCHeadTable();
+  params.bucket_table = getBucketTable();
+
+  schema = CreateTableSchema("LCEntry", &params);
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateLCEntryTable failed" << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20)<<"CreateLCEntryTable suceeded" << dendl;
+
+  schema = CreateTableSchema("LCHead", &params);
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateLCHeadTable failed" << dendl;
+    (void)DeleteLCEntryTable(dpp, &params);
+  }
+  ldpp_dout(dpp, 20)<<"CreateLCHeadTable suceeded" << dendl;
+
+  return ret;
+}
+
+int SQLiteDB::DeleteUserTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->user_table);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret)
+    ldpp_dout(dpp, 0)<<"DeleteUserTable failed " << dendl;
+
+  ldpp_dout(dpp, 20)<<"DeleteUserTable suceeded " << dendl;
+
+  return ret;
+}
+
+int SQLiteDB::DeleteBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->bucket_table);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret)
+    ldpp_dout(dpp, 0)<<"DeletebucketTable failed " << dendl;
+
+  ldpp_dout(dpp, 20)<<"DeletebucketTable suceeded " << dendl;
+
+  return ret;
+}
+
+int SQLiteDB::DeleteObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->object_table);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret)
+    ldpp_dout(dpp, 0)<<"DeleteObjectTable failed " << dendl;
+
+  ldpp_dout(dpp, 20)<<"DeleteObjectTable suceeded " << dendl;
+
+  return ret;
+}
+
+/*
+ * Drop the per-bucket object-data table named in
+ * params->objectdata_table.
+ *
+ * @return 0 on success, non-zero error code from exec() on failure
+ */
+int SQLiteDB::DeleteObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->objectdata_table);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"DeleteObjectDataTable failed " << dendl;
+    /* Bug fix: return on failure instead of falling through to the
+     * success log below. */
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20)<<"DeleteObjectDataTable succeeded " << dendl;
+
+  return ret;
+}
+
+/*
+ * Drop the quota table named in params->quota_table.
+ *
+ * @return 0 on success, non-zero error code from exec() on failure
+ */
+int SQLiteDB::DeleteQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->quota_table);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"DeleteQuotaTable failed " << dendl;
+    /* Bug fix: return on failure instead of falling through to the
+     * success log below. */
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20)<<"DeleteQuotaTable succeeded " << dendl;
+
+  return ret;
+}
+
+/*
+ * Drop the lifecycle-entry table named in params->lc_entry_table.
+ * Also used by createLCTables() to roll back a half-created pair.
+ *
+ * @return 0 on success, non-zero error code from exec() on failure
+ */
+int SQLiteDB::DeleteLCEntryTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->lc_entry_table);
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"DeleteLCEntryTable failed " << dendl;
+    /* Bug fix: return on failure instead of falling through to the
+     * success log below. */
+    return ret;
+  }
+  ldpp_dout(dpp, 20)<<"DeleteLCEntryTable succeeded " << dendl;
+
+  return ret;
+}
+
+/*
+ * Drop the lifecycle-head table named in params->lc_head_table.
+ *
+ * @return 0 on success, non-zero error code from exec() on failure
+ */
+int SQLiteDB::DeleteLCHeadTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->lc_head_table);
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"DeleteLCHeadTable failed " << dendl;
+    /* Bug fix: return on failure instead of falling through to the
+     * success log below. */
+    return ret;
+  }
+  ldpp_dout(dpp, 20)<<"DeleteLCHeadTable succeeded " << dendl;
+
+  return ret;
+}
+
+/*
+ * List every row of the user table through list_callback.
+ *
+ * @return 0 on success, non-zero error code from exec() on failure
+ */
+int SQLiteDB::ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = ListTableSchema(params->user_table);
+  ret = exec(dpp, schema.c_str(), &list_callback);
+  if (ret) {
+    /* Bug fix: log the correct operation name (was the copy-pasted
+     * "GetUsertable") and return instead of falling through to the
+     * success log. */
+    ldpp_dout(dpp, 0)<<"ListAllUsers failed " << dendl;
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20)<<"ListAllUsers succeeded " << dendl;
+
+  return ret;
+}
+
+/*
+ * List every row of the bucket table through list_callback.
+ *
+ * @return 0 on success, non-zero error code from exec() on failure
+ */
+int SQLiteDB::ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = ListTableSchema(params->bucket_table);
+
+  ret = exec(dpp, schema.c_str(), &list_callback);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"Listbuckettable failed " << dendl;
+    /* Bug fix: return on failure instead of falling through to the
+     * success log below. */
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20)<<"ListbucketTable succeeded " << dendl;
+
+  return ret;
+}
+
+int SQLiteDB::ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+  map<string, class ObjectOp*>::iterator iter;
+  map<string, class ObjectOp*> objectmap;
+  string bucket;
+
+  objectmap = getObjectMap();
+
+  if (objectmap.empty())
+    ldpp_dout(dpp, 20)<<"objectmap empty " << dendl;
+
+  for (iter = objectmap.begin(); iter != objectmap.end(); ++iter) {
+    bucket = iter->first;
+    params->object_table = getObjectTable(bucket);
+    schema = ListTableSchema(params->object_table);
+
+    ret = exec(dpp, schema.c_str(), &list_callback);
+    if (ret)
+      ldpp_dout(dpp, 0)<<"ListObjecttable failed " << dendl;
+
+    ldpp_dout(dpp, 20)<<"ListObjectTable suceeded " << dendl;
+  }
+
+  return ret;
+}
+
+/*
+ * Wire up the SQL implementation for every per-object DB operation.
+ * Each handler shares the same sqlite connection handle (sdb), the
+ * given database name, and the Ceph context (cct).
+ *
+ * Always returns 0.
+ */
+int SQLObjectOp::InitializeObjectOps(string db_name, const DoutPrefixProvider *dpp)
+{
+  PutObject = make_shared<SQLPutObject>(sdb, db_name, cct);
+  DeleteObject = make_shared<SQLDeleteObject>(sdb, db_name, cct);
+  GetObject = make_shared<SQLGetObject>(sdb, db_name, cct);
+  UpdateObject = make_shared<SQLUpdateObject>(sdb, db_name, cct);
+  ListBucketObjects = make_shared<SQLListBucketObjects>(sdb, db_name, cct);
+  ListVersionedObjects = make_shared<SQLListVersionedObjects>(sdb, db_name, cct);
+  PutObjectData = make_shared<SQLPutObjectData>(sdb, db_name, cct);
+  UpdateObjectData = make_shared<SQLUpdateObjectData>(sdb, db_name, cct);
+  GetObjectData = make_shared<SQLGetObjectData>(sdb, db_name, cct);
+  DeleteObjectData = make_shared<SQLDeleteObjectData>(sdb, db_name, cct);
+  DeleteStaleObjectData = make_shared<SQLDeleteStaleObjectData>(sdb, db_name, cct);
+
+  return 0;
+}
+
+/*
+ * Compile the parameterized INSERT statement for the user table into
+ * 'stmt'.
+ *
+ * @return 0 on success; -1 if no db handle is open; otherwise the rc
+ *         produced by preparation (SQL_PREPARE is a macro that appears
+ *         to set 'ret' and jump to 'out' on failure -- confirm against
+ *         its definition).
+ */
+int SQLInsertUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLInsertUser - no db" << dendl;
+    goto out;
+  }
+
+  // Fill table names / placeholder names from the runtime params.
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertUser");
+out:
+  return ret;
+}
+
+/*
+ * Bind every user column of the prepared INSERT to the values held in
+ * params->op.user.  SQL_BIND_INDEX (macro) resolves a named
+ * placeholder to its positional index; SQL_BIND_TEXT/SQL_BIND_INT bind
+ * scalars and SQL_ENCODE_BLOB_PARAM serializes a structure into a blob
+ * column.  The macros appear to set 'rc' and jump to 'out' on error --
+ * confirm against their definitions.
+ *
+ * @return 0 on success, non-zero bind error otherwise
+ */
+int SQLInsertUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.tenant, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.tenant.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.ns, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.ns.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.display_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.display_name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_email, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_email.c_str(), sdb);
+
+  // Only the first access key (map iteration order) also gets its own
+  // id/secret text columns; the full map is stored as a blob below.
+  if (!params->op.user.uinfo.access_keys.empty()) {
+    string access_key;
+    string key;
+    map<string, RGWAccessKey>::const_iterator it =
+      params->op.user.uinfo.access_keys.begin();
+    const RGWAccessKey& k = it->second;
+    access_key = k.id;
+    key = k.key;
+
+    SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys_id, sdb);
+    SQL_BIND_TEXT(dpp, stmt, index, access_key.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys_secret, sdb);
+    SQL_BIND_TEXT(dpp, stmt, index, key.c_str(), sdb);
+
+  }
+  // Composite fields are serialized and stored as blob columns.
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.access_keys, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.swift_keys, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.swift_keys, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.subusers, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.subusers, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.suspended, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.suspended, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.max_buckets, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.max_buckets, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.op_mask, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.op_mask, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_caps, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.caps, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.admin, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.admin, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.system, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.system, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.default_placement.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_storage_class, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.default_placement.storage_class.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_tags, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.placement_tags, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.bucket_quota, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.quota.bucket_quota, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.temp_url_keys, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.temp_url_keys, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_quota, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.quota.user_quota, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.type, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.type, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.mfa_ids, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.mfa_ids, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.assumed_role_arn, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.assumed_role_arn.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_attrs, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.user_attrs, sdb);
+
+  // Object version (ver + tag) for optimistic concurrency.
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_ver, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.user_version.ver, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_ver_tag, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.user_version.tag.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+/*
+ * Step the prepared/bound INSERT statement (no row callback).
+ * SQL_EXECUTE is a macro; it appears to set 'ret' and jump to 'out' on
+ * error -- confirm against its definition.
+ */
+int SQLInsertUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/*
+ * Compile the parameterized DELETE statement for the user table into
+ * 'stmt'.
+ *
+ * @return 0 on success; -1 if no db handle is open; otherwise the rc
+ *         from preparation (SQL_PREPARE jumps to 'out' on failure).
+ */
+int SQLRemoveUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLRemoveUser - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveUser");
+out:
+  return ret;
+}
+
+/*
+ * Bind the user id (the DELETE's only parameter) to the prepared
+ * statement.  The SQL_BIND_* macros jump to 'out' with 'rc' set on
+ * error.
+ */
+int SQLRemoveUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+/*
+ * Step the prepared/bound DELETE statement (no row callback).
+ */
+int SQLRemoveUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/*
+ * Compile the SELECT for the user table.  The lookup key is chosen by
+ * params->op.query_str, and each variant keeps its own prepared
+ * statement so they can coexist:
+ *   "email"      -> email_stmt
+ *   "access_key" -> ak_stmt
+ *   "user_id"    -> userid_stmt
+ *   anything else (default: lookup by user id) -> stmt
+ *
+ * @return 0 on success; -1 if no db handle; otherwise the rc from
+ *         preparation (SQL_PREPARE jumps to 'out' on failure).
+ */
+int SQLGetUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLGetUser - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  if (params->op.query_str == "email") { 
+    SQL_PREPARE(dpp, p_params, sdb, email_stmt, ret, "PrepareGetUser");
+  } else if (params->op.query_str == "access_key") { 
+    SQL_PREPARE(dpp, p_params, sdb, ak_stmt, ret, "PrepareGetUser");
+  } else if (params->op.query_str == "user_id") { 
+    SQL_PREPARE(dpp, p_params, sdb, userid_stmt, ret, "PrepareGetUser");
+  } else { // by default by userid
+    SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetUser");
+  }
+out:
+  return ret;
+}
+
+/*
+ * Bind the lookup key to whichever prepared statement matches
+ * params->op.query_str (see Prepare for the mapping).  For
+ * "access_key" only the first key in the access_keys map is used.
+ *
+ * NOTE(review): when query_str == "access_key" and access_keys is
+ * empty, nothing is bound and the statement runs with an unbound
+ * (NULL) parameter -- presumably matching no row; confirm intended.
+ */
+int SQLGetUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.query_str == "email") { 
+    SQL_BIND_INDEX(dpp, email_stmt, index, p_params.op.user.user_email, sdb);
+    SQL_BIND_TEXT(dpp, email_stmt, index, params->op.user.uinfo.user_email.c_str(), sdb);
+  } else if (params->op.query_str == "access_key") { 
+    if (!params->op.user.uinfo.access_keys.empty()) {
+      string access_key;
+      map<string, RGWAccessKey>::const_iterator it =
+        params->op.user.uinfo.access_keys.begin();
+      const RGWAccessKey& k = it->second;
+      access_key = k.id;
+
+      SQL_BIND_INDEX(dpp, ak_stmt, index, p_params.op.user.access_keys_id, sdb);
+      SQL_BIND_TEXT(dpp, ak_stmt, index, access_key.c_str(), sdb);
+    }
+  } else if (params->op.query_str == "user_id") { 
+    SQL_BIND_INDEX(dpp, userid_stmt, index, p_params.op.user.user_id, sdb);
+    SQL_BIND_TEXT(dpp, userid_stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+  } else { // by default by userid
+    SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
+    SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+  }
+
+out:
+  return rc;
+}
+
+/*
+ * Step the statement selected by params->op.query_str, delivering each
+ * result row to the list_user callback.
+ */
+int SQLGetUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  if (params->op.query_str == "email") { 
+    SQL_EXECUTE(dpp, params, email_stmt, list_user);
+  } else if (params->op.query_str == "access_key") { 
+    SQL_EXECUTE(dpp, params, ak_stmt, list_user);
+  } else if (params->op.query_str == "user_id") { 
+    SQL_EXECUTE(dpp, params, userid_stmt, list_user);
+  } else { // by default by userid
+    SQL_EXECUTE(dpp, params, stmt, list_user);
+  }
+
+out:
+  return ret;
+}
+
+/*
+ * Compile the parameterized INSERT statement for the bucket table into
+ * 'stmt'.
+ *
+ * @return 0 on success; -1 if no db handle is open; otherwise the rc
+ *         from preparation (SQL_PREPARE jumps to 'out' on failure).
+ */
+int SQLInsertBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLInsertBucket - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertBucket");
+
+out:
+  return ret;
+}
+
+/*
+ * Bind every bucket column of the prepared INSERT from
+ * params->op.bucket (bucket info, usage entry, attrs and version).
+ * Scalars are bound as text/int; composite structures are serialized
+ * into blob columns via SQL_ENCODE_BLOB_PARAM.  The SQL_* macros jump
+ * to 'out' with 'rc' set on error.
+ */
+int SQLInsertBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  // user_id here is copied as OwnerID in the bucket table.
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.tenant, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.tenant.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.marker, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.marker.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.bucket_id.c_str(), sdb);
+
+  // Usage accounting comes from the dir entry (ent), not from info.
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.size, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.size, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.size_rounded, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.size_rounded, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.creation_time, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.creation_time, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.count, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.count, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.placement_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.placement_rule.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.placement_storage_class, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.placement_rule.storage_class.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.flags, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.flags, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.zonegroup, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.zonegroup.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.has_instance_obj, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.has_instance_obj, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.quota, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.quota, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.requester_pays, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.requester_pays, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.has_website, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.has_website, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.website_conf, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.website_conf, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.swift_versioning, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.swift_versioning, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.swift_ver_location, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.swift_ver_location.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.mdsearch_config, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.mdsearch_config, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.new_bucket_instance_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.new_bucket_instance_id.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.obj_lock, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.obj_lock, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.sync_policy_info_groups, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.sync_policy, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_attrs, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.bucket_attrs, sdb);
+
+  // Object version (ver + tag) for optimistic concurrency.
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_ver, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.bucket_version.ver, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_ver_tag, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.bucket_version.tag.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.mtime, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.mtime, sdb);
+
+out:
+  return rc;
+}
+
+/*
+ * Run the prepared bucket INSERT, register the bucket's object-op
+ * handler, and create the bucket's object/object-data tables plus the
+ * object-table trigger.
+ *
+ * NOTE(review): ObPtr is a raw 'new'; ownership presumably transfers
+ * to the object map via objectmapInsert -- confirm, otherwise this
+ * leaks when insertion fails.  Also, SQL_EXECUTE appears to jump to
+ * 'out' on error, in which case the table-creation steps below are
+ * skipped -- confirm against the macro definition.
+ */
+int SQLInsertBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  class SQLObjectOp *ObPtr = NULL;
+  string bucket_name = params->op.bucket.info.bucket.name;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  ObPtr = new SQLObjectOp(sdb, ctx());
+
+  objectmapInsert(dpp, bucket_name, ObPtr);
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+
+  /* Once Bucket is inserted created corresponding object(&data) tables
+   */
+  InitPrepareParams(dpp, p_params, params);
+
+  // Failures creating the per-bucket tables are deliberately ignored
+  // (best effort); the bucket row itself was already inserted.
+  (void)createObjectTable(dpp, params);
+  (void)createObjectDataTable(dpp, params);
+  (void)createObjectTableTrigger(dpp, params);
+out:
+  return ret;
+}
+
+/*
+ * Compile the UPDATE statement for the bucket table.  Three variants
+ * exist, selected by params->op.query_str, each kept in its own
+ * prepared statement:
+ *   "attrs" -> attrs_stmt, "owner" -> owner_stmt, "info" -> info_stmt.
+ * Any other query_str is rejected.
+ *
+ * @return 0 on success; -1 if no db handle or the query_str is
+ *         invalid; otherwise the rc from preparation.
+ */
+int SQLUpdateBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  if (params->op.query_str == "attrs") { 
+    SQL_PREPARE(dpp, p_params, sdb, attrs_stmt, ret, "PrepareUpdateBucket");
+  } else if (params->op.query_str == "owner") { 
+    SQL_PREPARE(dpp, p_params, sdb, owner_stmt, ret, "PrepareUpdateBucket");
+  } else if (params->op.query_str == "info") { 
+    SQL_PREPARE(dpp, p_params, sdb, info_stmt, ret, "PrepareUpdateBucket");
+  } else {
+    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" <<
+      params->op.query_str << "" << dendl;
+    goto out;
+  }
+
+out:
+  return ret;
+}
+
+/*
+ * Bind the columns of whichever UPDATE variant params->op.query_str
+ * selects ("attrs"/"owner"/"info"; see Prepare).  The local 'stmt'
+ * points at the chosen member statement.  The variant-specific binds
+ * come first, followed by binds common to all three variants
+ * (user_id, bucket_name, bucket_ver, mtime).  The SQL_* macros jump
+ * to 'out' with 'rc' set on error.
+ */
+int SQLUpdateBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+  sqlite3_stmt** stmt = NULL; // Prepared statement
+
+  /* All below fields for attrs */
+  if (params->op.query_str == "attrs") { 
+    stmt = &attrs_stmt;
+  } else if (params->op.query_str == "owner") { 
+    stmt = &owner_stmt;
+  } else if (params->op.query_str == "info") { 
+    stmt = &info_stmt;
+  } else {
+    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" <<
+      params->op.query_str << "" << dendl;
+    goto out;
+  }
+
+  if (params->op.query_str == "attrs") { 
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_attrs, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.bucket_attrs, sdb);
+  } else if (params->op.query_str == "owner") { 
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.creation_time, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.creation_time, sdb);
+  } else if (params->op.query_str == "info") { 
+    // "info" updates nearly every bucket-info column.
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.tenant, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.tenant.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.marker, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.marker.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_id, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.bucket_id.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.creation_time, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.creation_time, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.count, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.ent.count, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.placement_name, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.placement_rule.name.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.placement_storage_class, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.placement_rule.storage_class.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.flags, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.flags, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.zonegroup, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.zonegroup.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.has_instance_obj, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.has_instance_obj, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.quota, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.quota, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.requester_pays, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.requester_pays, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.has_website, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.has_website, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.website_conf, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.website_conf, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.swift_versioning, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.swift_versioning, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.swift_ver_location, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.swift_ver_location.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.mdsearch_config, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.mdsearch_config, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.new_bucket_instance_id, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.new_bucket_instance_id.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.obj_lock, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.obj_lock, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.sync_policy_info_groups, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.sync_policy, sdb);
+  }
+
+  // Binds common to all three variants (WHERE key + version + mtime).
+  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.user.user_id, sdb);
+  SQL_BIND_TEXT(dpp, *stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_ver, sdb);
+  SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.bucket_version.ver, sdb);
+
+  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.mtime, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.mtime, sdb);
+
+out:
+  return rc;
+}
+
+/*
+ * Step the UPDATE variant selected by params->op.query_str
+ * ("attrs"/"owner"/"info"); any other query_str returns -1.
+ */
+int SQLUpdateBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  sqlite3_stmt** stmt = NULL; // Prepared statement
+
+  if (params->op.query_str == "attrs") { 
+    stmt = &attrs_stmt;
+  } else if (params->op.query_str == "owner") { 
+    stmt = &owner_stmt;
+  } else if (params->op.query_str == "info") { 
+    stmt = &info_stmt;
+  } else {
+    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" <<
+      params->op.query_str << "" << dendl;
+    goto out;
+  }
+
+  SQL_EXECUTE(dpp, params, *stmt, NULL);
+out:
+  return ret;
+}
+
+/*
+ * Compile the parameterized DELETE statement for the bucket table into
+ * 'stmt'.
+ *
+ * @return 0 on success; -1 if no db handle is open; otherwise the rc
+ *         from preparation (SQL_PREPARE jumps to 'out' on failure).
+ */
+int SQLRemoveBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLRemoveBucket - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveBucket");
+
+out:
+  return ret;
+}
+
+/*
+ * Bind the bucket name (the DELETE's only parameter) to the prepared
+ * statement.  The SQL_BIND_* macros jump to 'out' with 'rc' set on
+ * error.
+ */
+int SQLRemoveBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+/*
+ * Drop the bucket's entry from the in-memory object map, then step the
+ * prepared DELETE (no row callback).
+ *
+ * NOTE(review): the object-map entry is removed before the SQL delete
+ * runs; if SQL_EXECUTE fails the map and table are out of sync --
+ * confirm this is acceptable.
+ */
+int SQLRemoveBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  objectmapDelete(dpp, params->op.bucket.info.bucket.name);
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/*
+ * Compile the SELECT statement for fetching a bucket row into 'stmt'.
+ *
+ * @return 0 on success; -1 if no db handle is open; otherwise the rc
+ *         from preparation (SQL_PREPARE jumps to 'out' on failure).
+ */
+int SQLGetBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLGetBucket - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetBucket");
+
+out:
+  return ret;
+}
+
+/*
+ * Bind the bucket name (the SELECT's only parameter) to the prepared
+ * statement.  The SQL_BIND_* macros jump to 'out' with 'rc' set on
+ * error.
+ */
+int SQLGetBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+/*
+ * Step the bucket SELECT, delivering the row to list_bucket, and
+ * (re)register the bucket's object-op handler in the object map.
+ *
+ * NOTE(review): ObPtr is a raw 'new'; ownership presumably transfers
+ * to the object map via objectmapInsert -- confirm, otherwise this
+ * leaks when the entry already exists or insertion fails.
+ */
+int SQLGetBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  class SQLObjectOp *ObPtr = NULL;
+
+  params->op.name = "GetBucket";
+
+  ObPtr = new SQLObjectOp(sdb, ctx());
+
+  /* For the case when the  server restarts, need to reinsert objectmap*/
+  objectmapInsert(dpp, params->op.bucket.info.bucket.name, ObPtr);
+  SQL_EXECUTE(dpp, params, stmt, list_bucket);
+out:
+  return ret;
+}
+
+/*
+ * Compile the SELECT for listing buckets.  Two variants exist,
+ * selected by params->op.query_str: "all" lists every bucket
+ * (all_stmt); anything else lists a single user's buckets (stmt).
+ *
+ * @return 0 on success; -1 if no db handle is open; otherwise the rc
+ *         from preparation (SQL_PREPARE jumps to 'out' on failure).
+ */
+int SQLListUserBuckets::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLListUserBuckets - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  if (params->op.query_str == "all") { 
+    SQL_PREPARE(dpp, p_params, sdb, all_stmt, ret, "PrepareListUserBuckets");
+  }else {
+    SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListUserBuckets");
+  }
+
+out:
+  return ret;
+}
+
+/*
+ * Bind the list parameters to the variant selected by
+ * params->op.query_str (see Prepare).  The owner's user_id is bound
+ * only for the per-user variant; the pagination marker (min_marker)
+ * and page size (list_max_count) are bound for both.  The SQL_* macros
+ * jump to 'out' with 'rc' set on error.
+ */
+int SQLListUserBuckets::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+  sqlite3_stmt** pstmt = NULL; // Prepared statement
+
+  if (params->op.query_str == "all") { 
+    pstmt = &all_stmt;
+  } else { 
+    pstmt = &stmt;
+  }
+
+  if (params->op.query_str != "all") { 
+    SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.user.user_id, sdb);
+    SQL_BIND_TEXT(dpp, *pstmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+  }
+
+  SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.bucket.min_marker, sdb);
+  SQL_BIND_TEXT(dpp, *pstmt, index, params->op.bucket.min_marker.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.list_max_count, sdb);
+  SQL_BIND_INT(dpp, *pstmt, index, params->op.list_max_count, sdb);
+
+out:
+  return rc;
+}
+
+/*
+ * Step the list SELECT ("all" or per-user; see Prepare), delivering
+ * each row to the list_bucket callback.
+ */
+int SQLListUserBuckets::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  if (params->op.query_str == "all") { 
+    SQL_EXECUTE(dpp, params, all_stmt, list_bucket);
+  } else {
+    SQL_EXECUTE(dpp, params, stmt, list_bucket);
+  }
+out:
+  return ret;
+}
+
+/*
+ * Compile the parameterized INSERT/PUT statement for the per-bucket
+ * object table into 'stmt'.
+ *
+ * @return 0 on success; -1 if no db handle is open; otherwise the rc
+ *         from preparation (SQL_PREPARE jumps to 'out' on failure).
+ */
+int SQLPutObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLPutObject - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PreparePutObject");
+
+out:
+  return ret;
+}
+
+int SQLPutObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  int VersionNum = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.obj.state.obj.key.instance.empty()) {
+    params->op.obj.state.obj.key.instance = "null";
+  }
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_ns, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.acls, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.acls, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.index_ver, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.index_ver, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tag, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tag.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.flags, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.flags, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.versioned_epoch, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.versioned_epoch, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_category, sdb);
+  SQL_BIND_INT(dpp, stmt, index, (uint8_t)(params->op.obj.category), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.etag, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.etag.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.owner, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.owner.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.owner_display_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.owner_display_name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.storage_class, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.storage_class.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.appendable, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.appendable, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.content_type, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.content_type.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.index_hash_source, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.index_hash_source.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_size, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.size, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.accounted_size, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.accounted_size, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.epoch, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.epoch, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_tag, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.obj_tag, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_tag, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.tail_tag, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.write_tag, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.write_tag.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.fake_tag, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.fake_tag, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.shadow_obj, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.shadow_obj.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.has_data, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.has_data, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.is_versioned, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.is_versioned, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.version_num, sdb);
+  SQL_BIND_INT(dpp, stmt, index, VersionNum, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.pg_ver, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.pg_ver, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.zone_short_id, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.zone_short_id, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_version, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.objv_tracker.read_version.ver, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_version_tag, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.objv_tracker.read_version.tag.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_attrs, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.attrset, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_size, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.head_size, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.max_head_size, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.max_head_size, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_instance, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_instance.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_placement_rule_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.head_placement_rule.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_placement_storage_class, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.head_placement_rule.storage_class.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_placement_rule_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_placement.placement_rule.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_placement_storage_class, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_placement.placement_rule.storage_class.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.manifest_part_objs, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.objs, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.manifest_part_rules, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.rules, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.omap, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.omap, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.is_multipart, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj.is_multipart, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mp_parts, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.mp_parts, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_data, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.head_data, sdb);
+
+out:
+  return rc;
+}
+
+/* Execute the prepared PutObject (insert/replace) statement.
+ * SQL_EXECUTE (macro) steps the statement with no row callback (NULL),
+ * sets 'ret' and jumps to 'out' on completion or error. */
+int SQLPutObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/* Compile the DeleteObject SQL into 'stmt'.
+ * Returns -1 when the sqlite handle ('sdb') is not open.  SQL_PREPARE
+ * (macro) fills 'stmt', sets 'ret', and jumps to 'out' on failure. */
+int SQLDeleteObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLDeleteObject - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteObject");
+
+out:
+  return ret;
+}
+
+/* Bind (bucket, object name, object instance) into the DeleteObject stmt.
+ * An empty key instance is normalized to the literal "null" before binding.
+ * SQL_BIND_INDEX resolves the named parameter's position into 'index';
+ * the following SQL_BIND_TEXT binds the value at that position.  The
+ * macros jump to 'out' on error ('rc' is presumably set inside the macro
+ * bodies, which are not visible in this chunk). */
+int SQLDeleteObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.obj.state.obj.key.instance.empty()) {
+    params->op.obj.state.obj.key.instance = "null";
+  }
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+out:
+  return rc;
+}
+
+/* Execute the prepared DeleteObject statement; no row callback (NULL).
+ * SQL_EXECUTE sets 'ret' and jumps to 'out'. */
+int SQLDeleteObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/* Compile the GetObject SQL into 'stmt'.
+ * Returns -1 when the sqlite handle is not open; SQL_PREPARE sets 'ret'
+ * and jumps to 'out' on failure. */
+int SQLGetObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLGetObject - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetObject");
+
+out:
+  return ret;
+}
+
+/* Bind the lookup key (bucket, object name, object instance) into the
+ * GetObject stmt.  An empty key instance is normalized to "null" so the
+ * lookup matches how non-versioned rows are written.  Bind macros jump to
+ * 'out' on error. */
+int SQLGetObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.obj.state.obj.key.instance.empty()) {
+    params->op.obj.state.obj.key.instance = "null";
+  }
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+/* Execute the prepared GetObject statement; result rows are delivered to
+ * the 'list_object' callback.  SQL_EXECUTE sets 'ret' and jumps to 'out'. */
+int SQLGetObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, list_object);
+out:
+  return ret;
+}
+
+/* Compile one of the four UpdateObject statement variants, selected by
+ * op.query_str:
+ *   "omap"  -> omap_stmt   (omap blob update)
+ *   "attrs" -> attrs_stmt  (attrset blob update)
+ *   "meta"  -> meta_stmt   (full metadata-row update)
+ *   "mp"    -> mp_stmt     (multipart parts update)
+ * Any other query_str is rejected with -1.  The per-variant SQL text is
+ * presumably chosen inside InitPrepareParams/SQL_PREPARE based on the same
+ * query_str (not visible here).  The unused 'copy'/'bucket_name' locals
+ * look vestigial. */
+int SQLUpdateObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+  struct DBOpParams copy = *params;
+  string bucket_name;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLUpdateObject - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  if (params->op.query_str == "omap") {
+    SQL_PREPARE(dpp, p_params, sdb, omap_stmt, ret, "PrepareUpdateObject");
+  } else if (params->op.query_str == "attrs") {
+    SQL_PREPARE(dpp, p_params, sdb, attrs_stmt, ret, "PrepareUpdateObject");
+  } else if (params->op.query_str == "meta") {
+    SQL_PREPARE(dpp, p_params, sdb, meta_stmt, ret, "PrepareUpdateObject");
+  } else if (params->op.query_str == "mp") {
+    SQL_PREPARE(dpp, p_params, sdb, mp_stmt, ret, "PrepareUpdateObject");
+  } else {
+    ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" <<
+      params->op.query_str << dendl;
+    goto out;
+  }
+
+out:
+  return ret;
+}
+
+/* Bind values for the UpdateObject variant selected by op.query_str
+ * ("omap" / "attrs" / "meta" / "mp"), mirroring the statement choice made
+ * in SQLUpdateObject::Prepare.  The common key columns (bucket, object
+ * name, instance) plus mtime are bound for every variant; after that only
+ * the columns belonging to the selected variant are bound.  Bind order
+ * must match the parameter order of the corresponding prepared SQL.
+ * NOTE: the local 'stmt' (pointer to one of the four member statements)
+ * shadows the single-statement member of the same name used elsewhere.
+ * All SQL_BIND_*/SQL_ENCODE_BLOB_PARAM macros jump to 'out' on error. */
+int SQLUpdateObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+  sqlite3_stmt** stmt = NULL; // Prepared statement
+
+  /* All below fields for attrs */
+  if (params->op.query_str == "omap") { 
+    stmt = &omap_stmt;
+  } else if (params->op.query_str == "attrs") { 
+    stmt = &attrs_stmt;
+  } else if (params->op.query_str == "meta") { 
+    stmt = &meta_stmt;
+  } else if (params->op.query_str == "mp") { 
+    stmt = &mp_stmt;
+  } else {
+    ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" <<
+      params->op.query_str << dendl;
+    goto out;
+  }
+
+  /* Empty instance is stored as the literal "null". */
+  if (params->op.obj.state.obj.key.instance.empty()) {
+    params->op.obj.state.obj.key.instance = "null";
+  }
+
+  /* Key columns + mtime, common to all four variants. */
+  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_name, sdb);
+  SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_instance, sdb);
+  SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mtime, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.mtime, sdb);
+
+  /* "omap": only the omap blob. */
+  if (params->op.query_str == "omap") { 
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.omap, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.omap, sdb);
+  }
+  /* "attrs": only the attrset blob. */
+  if (params->op.query_str == "attrs") { 
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_attrs, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.attrset, sdb);
+  }
+  /* "mp": only the multipart-parts blob. */
+  if (params->op.query_str == "mp") { 
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mp_parts, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.mp_parts, sdb);
+  }
+  /* "meta": rewrite of the whole metadata row. */
+  if (params->op.query_str == "meta") { 
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_ns, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.acls, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.acls, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.index_ver, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.index_ver, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tag, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tag.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.flags, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.flags, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.versioned_epoch, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.versioned_epoch, sdb);
+
+    /* Category enum stored as its integer value. */
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_category, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, (uint8_t)(params->op.obj.category), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.etag, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.etag.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.owner, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.owner.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.owner_display_name, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.owner_display_name.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.storage_class, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.storage_class.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.appendable, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.appendable, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.content_type, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.content_type.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.index_hash_source, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.index_hash_source.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_size, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.size, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.accounted_size, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.accounted_size, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.epoch, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.epoch, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_tag, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.obj_tag, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_tag, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.tail_tag, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.write_tag, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.write_tag.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.fake_tag, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.fake_tag, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.shadow_obj, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.shadow_obj.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.has_data, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.has_data, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.is_versioned, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.is_versioned, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.version_num, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.version_num, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.pg_ver, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.pg_ver, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.zone_short_id, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.zone_short_id, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_version, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.objv_tracker.read_version.ver, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_version_tag, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.objv_tracker.read_version.tag.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_attrs, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.attrset, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_size, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.head_size, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.max_head_size, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.max_head_size, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_id, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.obj_id.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_instance, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_instance.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_placement_rule_name, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.head_placement_rule.name.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_placement_storage_class, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.head_placement_rule.storage_class.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_placement_rule_name, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_placement.placement_rule.name.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_placement_storage_class, sdb);
+    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_placement.placement_rule.storage_class.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.manifest_part_objs, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.objs, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.manifest_part_rules, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.rules, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.omap, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.omap, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.is_multipart, sdb);
+    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.is_multipart, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mp_parts, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.mp_parts, sdb);
+
+    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_data, sdb);
+    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.head_data, sdb);
+  }
+
+out:
+  return rc;
+}
+
+/* Execute whichever UpdateObject variant op.query_str selects (same
+ * dispatch as Prepare/Bind).  No row callback; SQL_EXECUTE sets 'ret' and
+ * jumps to 'out'.  An unrecognized query_str returns -1. */
+int SQLUpdateObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  sqlite3_stmt** stmt = NULL; // Prepared statement
+
+  if (params->op.query_str == "omap") { 
+    stmt = &omap_stmt;
+  } else if (params->op.query_str == "attrs") { 
+    stmt = &attrs_stmt;
+  } else if (params->op.query_str == "meta") { 
+    stmt = &meta_stmt;
+  } else if (params->op.query_str == "mp") { 
+    stmt = &mp_stmt;
+  } else {
+    ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" <<
+      params->op.query_str << dendl;
+    goto out;
+  }
+
+  SQL_EXECUTE(dpp, params, *stmt, NULL);
+out:
+  return ret;
+}
+
+/* Compile the ListBucketObjects SQL into 'stmt'.  Returns -1 when the
+ * sqlite handle is not open; SQL_PREPARE sets 'ret' on failure. */
+int SQLListBucketObjects::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLListBucketObjects - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListBucketObjects");
+
+out:
+  return ret;
+}
+
+/* Bind the listing parameters: bucket name, pagination marker
+ * (min_marker), name prefix filter, and the maximum row count.  An empty
+ * key instance is normalized to "null" (instance itself is not bound
+ * here).  Bind macros jump to 'out' on error. */
+int SQLListBucketObjects::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.obj.state.obj.key.instance.empty()) {
+    params->op.obj.state.obj.key.instance = "null";
+  }
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.min_marker, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.min_marker.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.prefix, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.prefix.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb);
+
+out:
+  return rc;
+}
+
+/* Execute the listing; each result row is handed to 'list_object'. */
+int SQLListBucketObjects::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, list_object);
+out:
+  return ret;
+}
+
+/* Compile the ListVersionedObjects SQL into 'stmt'.  Returns -1 when the
+ * sqlite handle is not open; SQL_PREPARE sets 'ret' on failure. */
+int SQLListVersionedObjects::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLListVersionedObjects - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListVersionedObjects");
+
+out:
+  return ret;
+}
+
+/* Bind the version-listing parameters: bucket, object name, and the max
+ * row count.  All versions of a single key are selected, so the instance
+ * is not bound (though an empty one is still normalized to "null").
+ * Bind macros jump to 'out' on error. */
+int SQLListVersionedObjects::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.obj.state.obj.key.instance.empty()) {
+    params->op.obj.state.obj.key.instance = "null";
+  }
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb);
+
+out:
+  return rc;
+}
+
+/* Execute the version listing; rows go to the 'list_object' callback. */
+int SQLListVersionedObjects::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, list_object);
+out:
+  return ret;
+}
+
+/* Compile the PutObjectData SQL (raw data-chunk insert) into 'stmt'.
+ * Returns -1 when the sqlite handle is not open. */
+int SQLPutObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLPutObjectData - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PreparePutObjectData");
+
+out:
+  return ret;
+}
+
+/* Bind one data chunk row: object key (name/instance/ns), bucket, obj_id,
+ * part number, byte offset, the data blob itself, its size, the multipart
+ * part string, and mtime.  An empty key instance is normalized to "null".
+ * Bind macros jump to 'out' on error. */
+int SQLPutObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.obj.state.obj.key.instance.empty()) {
+    params->op.obj.state.obj.key.instance = "null";
+  }
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_ns, sdb);
+
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.part_num, sdb);
+
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.part_num, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.offset, sdb);
+
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.offset, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.data, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj_data.data, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.size, sdb);
+
+  SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.size, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.multipart_part_str, sdb);
+
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj_data.multipart_part_str.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
+
+out:
+  return rc;
+}
+
+/* Execute the prepared data-chunk insert; no row callback. */
+int SQLPutObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/* Compile the UpdateObjectData SQL into 'stmt'.  Returns -1 when the
+ * sqlite handle is not open. */
+int SQLUpdateObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLUpdateObjectData - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareUpdateObjectData");
+
+out:
+  return ret;
+}
+
+/* Bind the data-row key (object name, instance, bucket, obj_id) and the
+ * new mtime for the UpdateObjectData statement.  An empty key instance is
+ * normalized to "null".  Bind macros jump to 'out' on error. */
+int SQLUpdateObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.obj.state.obj.key.instance.empty()) {
+    params->op.obj.state.obj.key.instance = "null";
+  }
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
+
+out:
+  return rc;
+}
+
+/* Execute the prepared UpdateObjectData statement; no row callback. */
+int SQLUpdateObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/* Compile the GetObjectData SQL into 'stmt'.  Returns -1 when the sqlite
+ * handle is not open. */
+int SQLGetObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLGetObjectData - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetObjectData");
+
+out:
+  return ret;
+}
+
+/* Bind the data-row key (bucket, object name, instance, obj_id) for the
+ * GetObjectData statement.  An empty key instance is normalized to
+ * "null".  Bind macros jump to 'out' on error. */
+int SQLGetObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.obj.state.obj.key.instance.empty()) {
+    params->op.obj.state.obj.key.instance = "null";
+  }
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+/* Execute the data fetch; result rows are handed to 'get_objectdata'. */
+int SQLGetObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, get_objectdata);
+out:
+  return ret;
+}
+
+/* Compile the DeleteObjectData SQL into 'stmt'.  Returns -1 when the
+ * sqlite handle is not open. */
+int SQLDeleteObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLDeleteObjectData - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteObjectData");
+
+out:
+  return ret;
+}
+
+/* Bind the data-row key (bucket, object name, instance, obj_id) for the
+ * DeleteObjectData statement.  An empty key instance is normalized to
+ * "null".  Bind macros jump to 'out' on error. */
+int SQLDeleteObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.obj.state.obj.key.instance.empty()) {
+    params->op.obj.state.obj.key.instance = "null";
+  }
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+/* Execute the prepared DeleteObjectData statement; no row callback. */
+int SQLDeleteObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/* Compile the DeleteStaleObjectData SQL (garbage-collect orphaned data
+ * rows) into 'stmt'.  Returns -1 when the sqlite handle is not open. */
+int SQLDeleteStaleObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLDeleteStaleObjectData - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteStaleObjectData");
+
+out:
+  return ret;
+}
+
+/* Bind only the mtime cutoff (encoded as a blob); presumably the SQL
+ * deletes data rows older than this timestamp - statement text not
+ * visible here.  Bind macros jump to 'out' on error. */
+int SQLDeleteStaleObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
+
+out:
+  return rc;
+}
+
+/* Execute the stale-data delete; no row callback. */
+int SQLDeleteStaleObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/* Compile the InsertLCEntry SQL (lifecycle-processing entry) into 'stmt'.
+ * Returns -1 when the sqlite handle is not open. */
+int SQLInsertLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLInsertLCEntry - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertLCEntry");
+
+out:
+  return ret;
+}
+
+/* Bind a lifecycle entry: LC shard index, bucket name, processing status,
+ * and start time (the latter two via the rgw::sal LCEntry accessors).
+ * Bind macros jump to 'out' on error. */
+int SQLInsertLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.status, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_status(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.start_time, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_start_time(), sdb);
+
+out:
+  return rc;
+}
+
+/* Execute the prepared LC-entry insert; no row callback. */
+int SQLInsertLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/* Compile the RemoveLCEntry SQL into 'stmt'.  Returns -1 when the sqlite
+ * handle is not open. */
+int SQLRemoveLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLRemoveLCEntry - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveLCEntry");
+
+out:
+  return ret;
+}
+
+/* Bind the LC-entry key (shard index + bucket name) for removal.
+ * Bind macros jump to 'out' on error. */
+int SQLRemoveLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
+
+out:
+  return rc;
+}
+
+// Run the prepared LC-entry DELETE.  SQL_EXECUTE sets `ret`; no result-row
+// callback is needed (NULL).
+int SQLRemoveLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+// Compile the LC-entry SELECT.  Two distinct prepared statements are kept:
+// `next_stmt` for the "get_next_entry" query (next entry after a marker)
+// and `stmt` for an exact-key lookup; query_str selects which one.
+int SQLGetLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  sqlite3_stmt** pstmt = NULL; // Prepared statement
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLGetLCEntry - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  if (params->op.query_str == "get_next_entry") {
+    pstmt = &next_stmt;
+  } else {
+    pstmt = &stmt;
+  }
+  SQL_PREPARE(dpp, p_params, sdb, *pstmt, ret, "PrepareGetLCEntry");
+
+out:
+  return ret;
+}
+
+// Bind the (index, bucket name) key to whichever prepared statement the
+// query_str selects (see Prepare).  SQL_BIND_* macros set `rc` on error.
+int SQLGetLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+  sqlite3_stmt** pstmt = NULL; // Prepared statement
+
+  if (params->op.query_str == "get_next_entry") {
+    pstmt = &next_stmt;
+  } else {
+    pstmt = &stmt;
+  }
+  SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.lc_entry.index, sdb);
+  SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.index.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.lc_entry.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
+
+out:
+  return rc;
+}
+
+// Step the selected prepared statement; matching rows are delivered to the
+// caller through the `list_lc_entry` callback.  SQL_EXECUTE sets `ret`.
+int SQLGetLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  sqlite3_stmt** pstmt = NULL; // Prepared statement
+
+  if (params->op.query_str == "get_next_entry") {
+    pstmt = &next_stmt;
+  } else {
+    pstmt = &stmt;
+  }
+
+  SQL_EXECUTE(dpp, params, *pstmt, list_lc_entry);
+out:
+  return ret;
+}
+
+// Compile the LC-entry listing SELECT into `stmt`.  Requires an open
+// handle in `*sdb`; SQL_PREPARE sets `ret`.
+int SQLListLCEntries::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLListLCEntries - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListLCEntries");
+
+out:
+  return ret;
+}
+
+// Bind the listing parameters: shard index, pagination marker
+// (min_marker), and the maximum number of rows to return.
+int SQLListLCEntries::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.min_marker, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.min_marker.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb);
+
+out:
+  return rc;
+}
+
+// Run the listing query; each row is handed to the `list_lc_entry`
+// callback.  SQL_EXECUTE sets `ret`.
+int SQLListLCEntries::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, list_lc_entry);
+out:
+  return ret;
+}
+
+// Compile the LC-head INSERT into `stmt`.  Requires an open handle in
+// `*sdb`; SQL_PREPARE sets `ret`.
+int SQLInsertLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLInsertLCHead - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertLCHead");
+
+out:
+  return ret;
+}
+
+// Bind the LC head fields: shard index, current marker, and start_date.
+// start_date is widened to int64 and stored through the blob-encoding
+// helper rather than as a plain integer column.
+int SQLInsertLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.marker, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.head.get_marker().c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.start_date, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, static_cast<int64_t>(params->op.lc_head.head.start_date), sdb);
+
+out:
+  return rc;
+}
+
+// Run the prepared LC-head INSERT.  SQL_EXECUTE sets `ret`; no result-row
+// callback is needed (NULL).
+int SQLInsertLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+// Compile the LC-head DELETE into `stmt`.  Requires an open handle in
+// `*sdb`; SQL_PREPARE sets `ret`.
+int SQLRemoveLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLRemoveLCHead - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveLCHead");
+
+out:
+  return ret;
+}
+
+// Bind the shard index identifying the LC head row to delete.
+int SQLRemoveLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+// Run the prepared LC-head DELETE.  SQL_EXECUTE sets `ret`; no result-row
+// callback is needed (NULL).
+int SQLRemoveLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+// Compile the LC-head SELECT into `stmt`.  Requires an open handle in
+// `*sdb`; SQL_PREPARE sets `ret`.
+int SQLGetLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLGetLCHead - no db" << dendl;
+    goto out;
+  }
+
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetLCHead");
+
+out:
+  return ret;
+}
+
+// Bind the shard index of the LC head row to fetch.
+int SQLGetLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+// Fetch the LC head row; the result is filled in through the
+// `list_lc_head` callback.  The output field is reset first so a miss
+// leaves a default-constructed head rather than stale data.
+int SQLGetLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  // clear the params before fetching the entry
+  params->op.lc_head.head = {};
+  SQL_EXECUTE(dpp, params, stmt, list_lc_head);
+out:
+  return ret;
+}
diff --git a/src/rgw/driver/dbstore/sqlite/sqliteDB.h b/src/rgw/driver/dbstore/sqlite/sqliteDB.h
new file mode 100644 (file)
index 0000000..4f65144
--- /dev/null
@@ -0,0 +1,554 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef SQLITE_DB_H
+#define SQLITE_DB_H
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string>
+#include <sqlite3.h>
+#include "rgw/driver/dbstore/common/dbstore.h"
+
+using namespace rgw::store;
+
+/* SQLite implementation of the dbstore DB backend.  Holds the sqlite3
+ * connection (stashed in the base class's `db` as a void*) and provides
+ * table create/delete plus statement prepare/step/reset helpers. */
+class SQLiteDB : public DB, virtual public DBOp {
+  private:
+    sqlite3_mutex *mutex = NULL;
+
+  protected:
+    CephContext *cct;
+
+  public:
+    sqlite3_stmt *stmt = NULL;
+    DBOpPrepareParams PrepareParams;
+
+    SQLiteDB(sqlite3 *dbi, std::string db_name, CephContext *_cct) : DB(db_name, _cct), cct(_cct) {
+      db = (void*)dbi;
+    }
+    SQLiteDB(std::string db_name, CephContext *_cct) : DB(db_name, _cct), cct(_cct) {
+    }
+    ~SQLiteDB() {}
+
+    // Largest blob this connection accepts.  Note SQLITE_LIMIT_LENGTH is a
+    // limit *category* id (it expands to 0 in sqlite3.h), not a size, so it
+    // must not be returned directly; query the live connection instead.
+    // A negative newVal leaves the limit unchanged and returns its value.
+    uint64_t get_blob_limit() override {
+      return db ? sqlite3_limit((sqlite3 *)db, SQLITE_LIMIT_LENGTH, -1) : 0;
+    }
+    void *openDB(const DoutPrefixProvider *dpp) override;
+    int closeDB(const DoutPrefixProvider *dpp) override;
+    int InitializeDBOps(const DoutPrefixProvider *dpp) override;
+
+    int InitPrepareParams(const DoutPrefixProvider *dpp, DBOpPrepareParams &p_params,
+                          DBOpParams* params) override;
+
+    int exec(const DoutPrefixProvider *dpp, const char *schema,
+        int (*callback)(void*,int,char**,char**));
+    int Step(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt,
+        int (*cbk)(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt));
+    int Reset(const DoutPrefixProvider *dpp, sqlite3_stmt *stmt);
+    /* default value matches with sqliteDB style */
+
+    int createTables(const DoutPrefixProvider *dpp) override;
+    int createBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int createUserTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int createObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int createObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int createObjectView(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int createObjectTableTrigger(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int createQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    void populate_object_params(const DoutPrefixProvider *dpp,
+                                struct DBOpPrepareParams& p_params,
+                                struct DBOpParams* params, bool data);
+
+    int createLCTables(const DoutPrefixProvider *dpp) override;
+
+    int DeleteBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int DeleteUserTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int DeleteObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int DeleteObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int DeleteQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int DeleteLCEntryTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int DeleteLCHeadTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+
+    int ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params) override;
+    int ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params) override;
+    int ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params) override;
+};
+
+// Factory/holder for the per-object SQL ops; borrows the connection
+// handle pointer (`sdb`) — it does not own or close it.
+class SQLObjectOp : public ObjectOp {
+  private:
+    sqlite3 **sdb = NULL;
+    CephContext *cct;
+
+  public:
+    SQLObjectOp(sqlite3 **sdbi, CephContext *_cct) : sdb(sdbi), cct(_cct) {};
+    ~SQLObjectOp() {}
+
+    int InitializeObjectOps(std::string db_name, const DoutPrefixProvider *dpp);
+};
+
+// Prepared-statement op that inserts a user record; finalizes its
+// statement on destruction.
+class SQLInsertUser : public SQLiteDB, public InsertUserOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLInsertUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLInsertUser() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that removes a user record.
+class SQLRemoveUser : public SQLiteDB, public RemoveUserOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLRemoveUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLRemoveUser() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// User lookup op.  Keeps one prepared statement per supported key:
+// primary key, email, access key id, and user id; all are finalized on
+// destruction.
+class SQLGetUser : public SQLiteDB, public GetUserOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+    sqlite3_stmt *email_stmt = NULL; // Prepared statement to query by useremail
+    sqlite3_stmt *ak_stmt = NULL; // Prepared statement to query by access_key_id
+    sqlite3_stmt *userid_stmt = NULL; // Prepared statement to query by user_id
+
+  public:
+    SQLGetUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLGetUser() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+      if (email_stmt)
+        sqlite3_finalize(email_stmt);
+      if (ak_stmt)
+        sqlite3_finalize(ak_stmt);
+      if (userid_stmt)
+        sqlite3_finalize(userid_stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that inserts a bucket record.
+class SQLInsertBucket : public SQLiteDB, public InsertBucketOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLInsertBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLInsertBucket() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Bucket UPDATE op with one prepared statement per update class:
+// info, attrs, and owner.
+class SQLUpdateBucket : public SQLiteDB, public UpdateBucketOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *info_stmt = NULL; // Prepared statement
+    sqlite3_stmt *attrs_stmt = NULL; // Prepared statement
+    sqlite3_stmt *owner_stmt = NULL; // Prepared statement
+
+  public:
+    SQLUpdateBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLUpdateBucket() {
+      if (info_stmt)
+        sqlite3_finalize(info_stmt);
+      if (attrs_stmt)
+        sqlite3_finalize(attrs_stmt);
+      if (owner_stmt)
+        sqlite3_finalize(owner_stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that removes a bucket record.
+class SQLRemoveBucket : public SQLiteDB, public RemoveBucketOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLRemoveBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLRemoveBucket() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that fetches a bucket record.
+class SQLGetBucket : public SQLiteDB, public GetBucketOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLGetBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLGetBucket() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Bucket listing op: `stmt` lists one user's buckets, `all_stmt` is the
+// variant used to list across all users.
+class SQLListUserBuckets : public SQLiteDB, public ListUserBucketsOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+    sqlite3_stmt *all_stmt = NULL; // Prepared statement
+
+  public:
+    SQLListUserBuckets(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLListUserBuckets() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+      if (all_stmt)
+        sqlite3_finalize(all_stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that writes an object record.  The second ctor
+// takes a typed sqlite3** directly (used by SQLObjectOp).
+class SQLPutObject : public SQLiteDB, public PutObjectOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLPutObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLPutObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLPutObject() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that deletes an object record.
+class SQLDeleteObject : public SQLiteDB, public DeleteObjectOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLDeleteObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLDeleteObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLDeleteObject() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that fetches an object record.
+class SQLGetObject : public SQLiteDB, public GetObjectOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLGetObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLGetObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLGetObject() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Object UPDATE op with one prepared statement per update class:
+// omap, attrs, meta, and multipart (mp).
+class SQLUpdateObject : public SQLiteDB, public UpdateObjectOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *omap_stmt = NULL; // Prepared statement
+    sqlite3_stmt *attrs_stmt = NULL; // Prepared statement
+    sqlite3_stmt *meta_stmt = NULL; // Prepared statement
+    sqlite3_stmt *mp_stmt = NULL; // Prepared statement
+
+  public:
+    SQLUpdateObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLUpdateObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLUpdateObject() {
+      if (omap_stmt)
+        sqlite3_finalize(omap_stmt);
+      if (attrs_stmt)
+        sqlite3_finalize(attrs_stmt);
+      if (meta_stmt)
+        sqlite3_finalize(meta_stmt);
+      if (mp_stmt) // fix: mp_stmt was not finalized, leaking the statement
+        sqlite3_finalize(mp_stmt);
+    }
+
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that lists the objects of a bucket.
+class SQLListBucketObjects : public SQLiteDB, public ListBucketObjectsOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLListBucketObjects(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLListBucketObjects(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLListBucketObjects() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that lists the versions of an object.
+class SQLListVersionedObjects : public SQLiteDB, public ListVersionedObjectsOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLListVersionedObjects(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLListVersionedObjects(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLListVersionedObjects() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that writes a chunk of object data.
+class SQLPutObjectData : public SQLiteDB, public PutObjectDataOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLPutObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLPutObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLPutObjectData() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that updates existing object data rows.
+class SQLUpdateObjectData : public SQLiteDB, public UpdateObjectDataOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLUpdateObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLUpdateObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLUpdateObjectData() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that reads object data rows.
+class SQLGetObjectData : public SQLiteDB, public GetObjectDataOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLGetObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLGetObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLGetObjectData() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that deletes object data rows.
+class SQLDeleteObjectData : public SQLiteDB, public DeleteObjectDataOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLDeleteObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLDeleteObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLDeleteObjectData() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that garbage-collects stale object data rows.
+class SQLDeleteStaleObjectData : public SQLiteDB, public DeleteStaleObjectDataOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLDeleteStaleObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    SQLDeleteStaleObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+    ~SQLDeleteStaleObjectData() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that inserts a lifecycle (LC) entry.
+class SQLInsertLCEntry : public SQLiteDB, public InsertLCEntryOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLInsertLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLInsertLCEntry() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that removes a lifecycle (LC) entry.
+class SQLRemoveLCEntry : public SQLiteDB, public RemoveLCEntryOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLRemoveLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLRemoveLCEntry() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// LC entry lookup op: `stmt` is the exact-key lookup, `next_stmt` the
+// "get_next_entry" (next-after-marker) variant.
+class SQLGetLCEntry : public SQLiteDB, public GetLCEntryOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+    sqlite3_stmt *next_stmt = NULL; // Prepared statement
+
+  public:
+    SQLGetLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLGetLCEntry() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+      if (next_stmt)
+        sqlite3_finalize(next_stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that lists lifecycle (LC) entries.
+class SQLListLCEntries : public SQLiteDB, public ListLCEntriesOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLListLCEntries(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLListLCEntries() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that inserts/updates a lifecycle (LC) head row.
+class SQLInsertLCHead : public SQLiteDB, public InsertLCHeadOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLInsertLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLInsertLCHead() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that removes a lifecycle (LC) head row.
+class SQLRemoveLCHead : public SQLiteDB, public RemoveLCHeadOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLRemoveLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLRemoveLCHead() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// Prepared-statement op that fetches a lifecycle (LC) head row.
+class SQLGetLCHead : public SQLiteDB, public GetLCHeadOp {
+  private:
+    sqlite3 **sdb = NULL;
+    sqlite3_stmt *stmt = NULL; // Prepared statement
+
+  public:
+    SQLGetLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+    ~SQLGetLCHead() {
+      if (stmt)
+        sqlite3_finalize(stmt);
+    }
+    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+#endif
diff --git a/src/rgw/driver/dbstore/sqlite/statement.cc b/src/rgw/driver/dbstore/sqlite/statement.cc
new file mode 100644 (file)
index 0000000..dcf7dba
--- /dev/null
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "error.h"
+#include "statement.h"
+
+#define dout_subsys ceph_subsys_rgw_dbstore
+
+namespace rgw::dbstore::sqlite {
+
+// owning pointer to arbitrary memory allocated and returned by sqlite3
+// (e.g. sqlite3_expanded_sql() strings, sqlite3_exec() error messages);
+// released with sqlite3_free() rather than operator delete
+struct sqlite_deleter {
+  template <typename T>
+  void operator()(T* p) { ::sqlite3_free(p); }
+};
+template <typename T>
+using sqlite_ptr = std::unique_ptr<T, sqlite_deleter>;
+
+
+// compile 'sql' into a prepared statement on 'db'.
+// throws sqlite::error (carrying sqlite3_errmsg text) on failure.
+stmt_ptr prepare_statement(const DoutPrefixProvider* dpp,
+                           sqlite3* db, std::string_view sql)
+{
+  sqlite3_stmt* stmt = nullptr;
+  int result = ::sqlite3_prepare_v2(db, sql.data(), sql.size(), &stmt, nullptr);
+  auto ec = std::error_code{result, sqlite::error_category()};
+  if (ec != sqlite::errc::ok) {
+    const char* errmsg = ::sqlite3_errmsg(db);
+    ldpp_dout(dpp, 1) << "preparation failed: " << errmsg
+        << " (" << ec << ")\nstatement: " << sql << dendl;
+    throw sqlite::error(errmsg, ec);
+  }
+  return stmt_ptr{stmt};
+}
+
+// resolve a named parameter (e.g. ":name") to its 1-based bind index.
+// throws sqlite::error if the statement declares no such parameter
+// (sqlite3_bind_parameter_index() returns 0 in that case).
+static int bind_index(const DoutPrefixProvider* dpp,
+                      const stmt_binding& stmt, const char* name)
+{
+  const int index = ::sqlite3_bind_parameter_index(stmt.get(), name);
+  if (index <= 0) {
+    ldpp_dout(dpp, 1) << "binding failed on parameter name="
+        << name << dendl;
+    sqlite3* db = ::sqlite3_db_handle(stmt.get());
+    throw sqlite::error(db);
+  }
+  return index;
+}
+
+// bind an input string to the named parameter. bound with SQLITE_STATIC,
+// so sqlite does not copy the bytes: 'value' must outlive statement
+// execution — assumed guaranteed by callers; TODO confirm.
+// throws sqlite::error on failure.
+void bind_text(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
+               const char* name, std::string_view value)
+{
+  const int index = bind_index(dpp, stmt, name);
+
+  int result = ::sqlite3_bind_text(stmt.get(), index, value.data(),
+                                   value.size(), SQLITE_STATIC);
+  auto ec = std::error_code{result, sqlite::error_category()};
+  if (ec != sqlite::errc::ok) {
+    ldpp_dout(dpp, 1) << "binding failed on parameter name="
+        << name << " value=" << value << dendl;
+    sqlite3* db = ::sqlite3_db_handle(stmt.get());
+    throw sqlite::error(db, ec);
+  }
+}
+
+// bind an input integer to the named parameter.
+// throws sqlite::error on failure.
+void bind_int(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
+              const char* name, int value)
+{
+  const int index = bind_index(dpp, stmt, name);
+
+  int result = ::sqlite3_bind_int(stmt.get(), index, value);
+  auto ec = std::error_code{result, sqlite::error_category()};
+  if (ec != sqlite::errc::ok) {
+    ldpp_dout(dpp, 1) << "binding failed on parameter name="
+        << name << " value=" << value << dendl;
+    sqlite3* db = ::sqlite3_db_handle(stmt.get());
+    throw sqlite::error(db, ec);
+  }
+}
+
+void eval0(const DoutPrefixProvider* dpp, const stmt_execution& stmt)
+{
+  sqlite_ptr<char> sql;
+  if (dpp->get_cct()->_conf->subsys.should_gather<dout_subsys, 20>()) {
+    sql.reset(::sqlite3_expanded_sql(stmt.get()));
+  }
+
+  const int result = ::sqlite3_step(stmt.get());
+  auto ec = std::error_code{result, sqlite::error_category()};
+  sqlite3* db = ::sqlite3_db_handle(stmt.get());
+
+  if (ec != sqlite::errc::done) {
+    const char* errmsg = ::sqlite3_errmsg(db);
+    ldpp_dout(dpp, 20) << "evaluation failed: " << errmsg
+        << " (" << ec << ")\nstatement: " << sql.get() << dendl;
+    throw sqlite::error(errmsg, ec);
+  }
+  ldpp_dout(dpp, 20) << "evaluation succeeded: " << sql.get() << dendl;
+}
+
+void eval1(const DoutPrefixProvider* dpp, const stmt_execution& stmt)
+{
+  sqlite_ptr<char> sql;
+  if (dpp->get_cct()->_conf->subsys.should_gather<dout_subsys, 20>()) {
+    sql.reset(::sqlite3_expanded_sql(stmt.get()));
+  }
+
+  const int result = ::sqlite3_step(stmt.get());
+  auto ec = std::error_code{result, sqlite::error_category()};
+  if (ec != sqlite::errc::row) {
+    sqlite3* db = ::sqlite3_db_handle(stmt.get());
+    const char* errmsg = ::sqlite3_errmsg(db);
+    ldpp_dout(dpp, 1) << "evaluation failed: " << errmsg << " (" << ec
+        << ")\nstatement: " << sql.get() << dendl;
+    throw sqlite::error(errmsg, ec);
+  }
+  ldpp_dout(dpp, 20) << "evaluation succeeded: " << sql.get() << dendl;
+}
+
+// return the given column of the current result row as an integer
+int column_int(const stmt_execution& stmt, int column)
+{
+  return ::sqlite3_column_int(stmt.get(), column);
+}
+
+// return the given column of the current result row as text;
+// a NULL column value yields an empty string
+std::string column_text(const stmt_execution& stmt, int column)
+{
+  const unsigned char* text = ::sqlite3_column_text(stmt.get(), column);
+  if (text == nullptr) {
+    return {};
+  }
+  const auto size = static_cast<std::size_t>(
+      ::sqlite3_column_bytes(stmt.get(), column));
+  return std::string{reinterpret_cast<const char*>(text), size};
+}
+
+auto read_text_rows(const DoutPrefixProvider* dpp,
+                    const stmt_execution& stmt,
+                    std::span<std::string> entries)
+  -> std::span<std::string>
+{
+  sqlite_ptr<char> sql;
+  if (dpp->get_cct()->_conf->subsys.should_gather<dout_subsys, 20>()) {
+    sql.reset(::sqlite3_expanded_sql(stmt.get()));
+  }
+
+  std::size_t count = 0;
+  while (count < entries.size()) {
+    const int result = ::sqlite3_step(stmt.get());
+    auto ec = std::error_code{result, sqlite::error_category()};
+    if (ec == sqlite::errc::done) {
+      break;
+    }
+    if (ec != sqlite::errc::row) {
+      sqlite3* db = ::sqlite3_db_handle(stmt.get());
+      const char* errmsg = ::sqlite3_errmsg(db);
+      ldpp_dout(dpp, 1) << "evaluation failed: " << errmsg << " (" << ec
+          << ")\nstatement: " << sql.get() << dendl;
+      throw sqlite::error(errmsg, ec);
+    }
+    entries[count] = column_text(stmt, 0);
+    ++count;
+  }
+  ldpp_dout(dpp, 20) << "statement evaluation produced " << count
+      << " results: " << sql.get() << dendl;
+
+  return entries.first(count);
+}
+
+// run a raw query without preparing a statement; the optional 'callback'
+// is invoked per result row with 'arg'. throws sqlite::error on failure.
+void execute(const DoutPrefixProvider* dpp, sqlite3* db, const char* query,
+             sqlite3_callback callback, void* arg)
+{
+  char* errmsg = nullptr;
+  const int result = ::sqlite3_exec(db, query, callback, arg, &errmsg);
+  auto ec = std::error_code{result, sqlite::error_category()};
+  auto ptr = sqlite_ptr<char>{errmsg}; // free on destruction
+  if (ec != sqlite::errc::ok) {
+    // errmsg is set by sqlite3_exec on failure — presumably non-null here;
+    // TODO confirm against the sqlite3_exec contract for OOM cases
+    ldpp_dout(dpp, 1) << "query execution failed: " << errmsg << " (" << ec
+        << ")\nquery: " << query << dendl;
+    throw sqlite::error(errmsg, ec);
+  }
+  ldpp_dout(dpp, 20) << "query execution succeeded: " << query << dendl;
+}
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/sqlite/statement.h b/src/rgw/driver/dbstore/sqlite/statement.h
new file mode 100644 (file)
index 0000000..98b4acf
--- /dev/null
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include <span>
+#include <string>
+
+#include <sqlite3.h>
+
+class DoutPrefixProvider;
+
+namespace rgw::dbstore::sqlite {
+
+// owning sqlite3_stmt pointer; finalizes (destroys) the statement on reset
+struct stmt_deleter {
+  void operator()(sqlite3_stmt* p) const { ::sqlite3_finalize(p); }
+};
+using stmt_ptr = std::unique_ptr<sqlite3_stmt, stmt_deleter>;
+
+// non-owning sqlite3_stmt pointer that clears binding state on destruction
+// (scoped "unbind" guard around a prepared statement owned elsewhere)
+struct stmt_binding_deleter {
+  void operator()(sqlite3_stmt* p) const { ::sqlite3_clear_bindings(p); }
+};
+using stmt_binding = std::unique_ptr<sqlite3_stmt, stmt_binding_deleter>;
+
+// non-owning sqlite3_stmt pointer that clears execution state on destruction
+// (scoped "reset" guard so the statement can be re-executed later)
+struct stmt_execution_deleter {
+  void operator()(sqlite3_stmt* p) const { ::sqlite3_reset(p); }
+};
+using stmt_execution = std::unique_ptr<sqlite3_stmt, stmt_execution_deleter>;
+
+
+// prepare the sql statement or throw on error
+stmt_ptr prepare_statement(const DoutPrefixProvider* dpp,
+                           sqlite3* db, std::string_view sql);
+
+// bind an input string for the given parameter name
+void bind_text(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
+               const char* name, std::string_view value);
+
+// bind an input integer for the given parameter name
+void bind_int(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
+              const char* name, int value);
+
+// evaluate a prepared statement, expecting no result rows
+void eval0(const DoutPrefixProvider* dpp, const stmt_execution& stmt);
+
+// evaluate a prepared statement, expecting a single result row
+void eval1(const DoutPrefixProvider* dpp, const stmt_execution& stmt);
+
+// return the given column as an integer
+int column_int(const stmt_execution& stmt, int column);
+
+// return the given column as text, or an empty string on NULL
+std::string column_text(const stmt_execution& stmt, int column);
+
+// read the text column from each result row into the given entries, and return
+// the sub-span of entries that contain results
+auto read_text_rows(const DoutPrefixProvider* dpp,
+                    const stmt_execution& stmt,
+                    std::span<std::string> entries)
+  -> std::span<std::string>;
+
+// execute a raw query without preparing a statement. the optional callback
+// can be used to read results
+void execute(const DoutPrefixProvider* dpp, sqlite3* db, const char* query,
+             sqlite3_callback callback, void* arg);
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/tests/CMakeLists.txt b/src/rgw/driver/dbstore/tests/CMakeLists.txt
new file mode 100644 (file)
index 0000000..4e60dcf
--- /dev/null
@@ -0,0 +1,17 @@
+# build configuration for the dbstore backend unit tests
+cmake_minimum_required(VERSION 3.14.0)
+project(dbstore-tests)
+
+# every test target below links against gtest
+set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} gtest)
+
+set(dbstore_tests_srcs
+    dbstore_tests.cc)
+
+include_directories(${CMAKE_INCLUDE_DIR})
+
+# low-level DB op tests (dbstore_tests.cc)
+add_executable(unittest_dbstore_tests ${dbstore_tests_srcs})
+target_link_libraries(unittest_dbstore_tests ${CMAKE_LINK_LIBRARIES})
+add_ceph_unittest(unittest_dbstore_tests)
+
+# DBStoreManager lifecycle tests (dbstore_mgr_tests.cc)
+add_executable(unittest_dbstore_mgr_tests dbstore_mgr_tests.cc)
+target_link_libraries(unittest_dbstore_mgr_tests dbstore gtest_main)
+add_ceph_unittest(unittest_dbstore_mgr_tests)
diff --git a/src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc b/src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc
new file mode 100644 (file)
index 0000000..02ecd9f
--- /dev/null
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "rgw/driver/dbstore/dbstore_mgr.h"
+
+#include <filesystem>
+#include <gtest/gtest.h>
+#include <memory>
+
+using namespace rgw;
+namespace fs = std::filesystem;
+const static std::string TEST_DIR = "rgw_dbstore_tests";
+
+// return true if mainStr ends with the suffix toMatch
+// (an empty toMatch always matches)
+bool endsWith(const std::string &mainStr, const std::string &toMatch)
+{
+    if (mainStr.size() < toMatch.size())
+        return false;
+    const auto offset = mainStr.size() - toMatch.size();
+    return mainStr.compare(offset, toMatch.size(), toMatch) == 0;
+}
+
+// gtest fixture: builds a CephContext, runs each test inside a fresh
+// temp directory (TEST_DIR under the system temp path), and provides
+// helpers to derive the expected on-disk DB paths from the conf values.
+class TestDBStoreManager : public ::testing::Test {
+protected:
+  void SetUp() override {
+    ctx_ = std::make_shared<CephContext>(CEPH_ENTITY_TYPE_CLIENT);
+    // tests rely on the global context pointer
+    g_ceph_context = ctx_.get();
+    fs::current_path(fs::temp_directory_path());
+    fs::create_directory(TEST_DIR);
+  }
+
+  void TearDown() override {
+    // remove everything the test created under the temp dir
+    fs::current_path(fs::temp_directory_path());
+    fs::remove_all(TEST_DIR);
+  }
+
+  std::string getTestDir() const {
+    auto test_dir = fs::temp_directory_path() / TEST_DIR;
+    return test_dir.string();
+  }
+
+  // expected full path of the sqlite file for 'tenant' under 'base_dir',
+  // built from the dbstore_db_dir / dbstore_db_name_prefix conf values
+  fs::path getDBFullPath(const std::string & base_dir,
+                         const std::string & tenant) const {
+    auto db_path = ctx_->_conf.get_val<std::string>("dbstore_db_dir");
+    const auto& db_name = ctx_->_conf.get_val<std::string>("dbstore_db_name_prefix") + "-" + tenant + ".db";
+
+    auto db_full_path = std::filesystem::path(db_path) / db_name;
+    auto db_full_path_test = fs::path(base_dir) / db_full_path;
+    return db_full_path_test;
+  }
+
+  // expected DB name (without the ".db" suffix) for 'tenant'
+  std::string getDBTenant(const std::string & base_dir,
+                          const std::string & tenant) const {
+    auto db_name = ctx_->_conf.get_val<std::string>("dbstore_db_name_prefix");
+    db_name += "-" + tenant;
+    auto db_full_path = fs::path(base_dir) /  db_name;
+    return db_full_path.string();
+  }
+
+  std::string getDBTenant(const std::string & tenant = default_tenant) const {
+    return getDBTenant(getTestDir(), tenant);
+  }
+
+  fs::path getDBFullPath(const std::string & tenant) const {
+    return getDBFullPath(getTestDir(), tenant);
+  }
+
+  fs::path getLogFilePath(const std::string & log_file) {
+    return fs::temp_directory_path() / log_file;
+  }
+
+  std::shared_ptr<CephContext> getContext() const {
+    return ctx_;
+  }
+
+ private:
+    std::shared_ptr<CephContext> ctx_;
+};
+
+// constructing a DBStoreManager should create the default tenant's DB file
+TEST_F(TestDBStoreManager, BasicInstantiateUsingDBDir) {
+  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+  EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant)));
+  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+  EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant)));
+}
+
+// dbstore_db_name_prefix should be reflected in the created DB filename
+TEST_F(TestDBStoreManager, DBNamePrefix) {
+  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+  std::string prefix = "testprefix";
+  getContext()->_conf.set_val("dbstore_db_name_prefix", prefix);
+
+  EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant)));
+  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+  EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant)));
+
+  // check that the database name contains the given prefix
+  std::string expected_db_name = prefix + "-" + default_tenant + ".db";
+  EXPECT_TRUE(endsWith(getDBFullPath(default_tenant), expected_db_name));
+}
+
+// the (cct, logfile, loglevel) constructor should also create the default DB
+TEST_F(TestDBStoreManager, BasicInstantiateSecondConstructor) {
+  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+  EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant)));
+  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get(), getLogFilePath("test.log").string(), 10);
+  EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant)));
+}
+
+// getDB() for the default tenant should report the expected DB name
+TEST_F(TestDBStoreManager, TestDBName) {
+  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+  auto db = dbstore_mgr->getDB(default_tenant, false);
+  ASSERT_NE(nullptr, db);
+  EXPECT_EQ(getDBTenant(), db->getDBname());
+}
+
+
+// an empty tenant string should resolve to the default tenant's DB
+TEST_F(TestDBStoreManager, TestDBNameDefaultDB) {
+  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+  // passing an empty tenant should return the default_db
+  auto db = dbstore_mgr->getDB("", false);
+  ASSERT_NE(nullptr, db);
+  EXPECT_EQ(getDBTenant(), db->getDBname());
+}
+
+// an unknown tenant with create=false should yield no DB handle
+TEST_F(TestDBStoreManager, TestDBBadTenant) {
+  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+  auto db = dbstore_mgr->getDB("does-not-exist", false);
+  ASSERT_EQ(nullptr, db);
+}
+
+// getDB() with create=true should create and return a DB for a new tenant
+TEST_F(TestDBStoreManager, TestGetNewDB) {
+  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+
+  auto new_tenant_path = "new_tenant";
+  auto db = dbstore_mgr->getDB(new_tenant_path, true);
+  ASSERT_NE(nullptr, db);
+  EXPECT_EQ(getDBTenant(new_tenant_path), db->getDBname());
+}
+
+// after deleteDB(), getDB() with create=false should no longer find the DB
+TEST_F(TestDBStoreManager, TestDelete) {
+  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+  dbstore_mgr->deleteDB(default_tenant);
+  auto db = dbstore_mgr->getDB(default_tenant, false);
+  ASSERT_EQ(nullptr, db);
+}
diff --git a/src/rgw/driver/dbstore/tests/dbstore_tests.cc b/src/rgw/driver/dbstore/tests/dbstore_tests.cc
new file mode 100644 (file)
index 0000000..e87002f
--- /dev/null
@@ -0,0 +1,1424 @@
+#include "gtest/gtest.h"
+#include <iostream>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dbstore.h>
+#include <sqliteDB.h>
+#include "rgw_common.h"
+
+using namespace std;
+using DB = rgw::store::DB;
+
+// arguments handed to global_init() in Environment::SetUp();
+// presumably populated by main() before RUN_ALL_TESTS — not visible here
+vector<const char*> args;
+
+namespace gtest {
+  // global environment handle; assumed to be assigned by main() — TODO confirm
+  class Environment* env;
+
+  // test-wide environment: initializes a CephContext and, for the SQLite
+  // backend, a DB instance shared by every DBStoreTest below
+  class Environment : public ::testing::Environment {
+    public:
+      Environment(): tenant("default_ns"), db(nullptr),
+      db_type("SQLite"), ret(-1) {}
+
+      Environment(string tenantname, string db_typename): 
+        tenant(tenantname), db(nullptr),
+        db_type(db_typename), ret(-1) {}
+
+      virtual ~Environment() {}
+
+      void SetUp() override {
+        cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+            CODE_ENVIRONMENT_DAEMON,
+            CINIT_FLAG_NO_DEFAULT_CONFIG_FILE | CINIT_FLAG_NO_MON_CONFIG | CINIT_FLAG_NO_DAEMON_ACTIONS);
+        // only the SQLite backend is wired up here
+        if (!db_type.compare("SQLite")) {
+          db = new SQLiteDB(tenant, cct.get());
+          ASSERT_TRUE(db != nullptr);
+          ret = db->Initialize(logfile, loglevel);
+          ASSERT_GE(ret, 0);
+        }
+      }
+
+      void TearDown() override {
+        if (!db)
+          return;
+        // drop the backing store, then release the DB object
+        db->Destroy(db->get_def_dpp());
+        delete db;
+      }
+
+      string tenant;
+      DB *db;
+      string db_type;
+      int ret;
+      string logfile = "rgw_dbstore_tests.log";
+      int loglevel = 30;
+      boost::intrusive_ptr<CephContext> cct;
+  };
+}
+
+// fixed timestamp reused as bucket mtime so later reads can assert equality
+ceph::real_time bucket_mtime = real_clock::now();
+// list-buckets pagination marker carried between test cases
+string marker1;
+
+// RGWGetDataCB implementation that records the buffer/offset/length it is
+// handed, so tests can inspect the data returned by a read operation.
+class DBGetDataCB : public RGWGetDataCB {
+  public:
+    bufferlist data_bl;        // last buffer received
+    off_t data_ofs, data_len;  // offset and length of that buffer
+
+    // mark the virtual override explicitly (was missing `override`)
+    int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
+      data_bl = bl;
+      data_ofs = bl_ofs;
+      data_len = bl_len;
+      return 0;
+    }
+};
+
+namespace {
+
+  // per-test fixture: grabs the shared DB handle from the global gtest
+  // environment and seeds GlobalParams with the well-known user/bucket/
+  // object identifiers that the tests below rely on
+  class DBStoreTest : public ::testing::Test {
+    protected:
+      int ret;
+      DB *db = nullptr;
+      string user1 = "user1";
+      string user_id1 = "user_id1";
+      string bucket1 = "bucket1";
+      string object1 = "object1";
+      string data = "Hello World";
+      DBOpParams GlobalParams = {};
+      const DoutPrefixProvider *dpp;
+
+      DBStoreTest() {}
+      void SetUp() {
+        db = gtest::env->db;
+        ASSERT_TRUE(db != nullptr);
+        dpp = db->get_def_dpp();
+        ASSERT_TRUE(dpp != nullptr);
+
+        GlobalParams.op.user.uinfo.display_name = user1;
+        GlobalParams.op.user.uinfo.user_id.id = user_id1;
+        GlobalParams.op.bucket.info.bucket.name = bucket1;
+        GlobalParams.op.obj.state.obj.bucket = GlobalParams.op.bucket.info.bucket;
+        GlobalParams.op.obj.state.obj.key.name = object1;
+        GlobalParams.op.obj.state.obj.key.instance = "inst1";
+        GlobalParams.op.obj.obj_id = "obj_id1";
+        GlobalParams.op.obj_data.part_num = 0;
+
+        /* As of now InitializeParams doesnt do anything
+         * special based on fop. Hence its okay to do
+         * global initialization once.
+         */
+        ret = db->InitializeParams(dpp, &GlobalParams);
+        ASSERT_EQ(ret, 0);
+      }
+
+      void TearDown() {
+      }
+
+      // helper: write object metadata (mtime, owner, head data, ACL attr)
+      // through the DB::Object::Write path; returns the write_meta result
+      int write_object(const DoutPrefixProvider *dpp, DBOpParams params) {
+        DB::Object op_target(db, params.op.bucket.info,
+                             params.op.obj.state.obj);
+        DB::Object::Write write_op(&op_target);
+        map<string, bufferlist> setattrs;
+        ret = write_op.prepare(dpp);
+        if (ret)
+          return ret;
+
+        write_op.meta.mtime = &bucket_mtime;
+        write_op.meta.category = RGWObjCategory::Main;
+        write_op.meta.owner = params.op.user.uinfo.user_id;
+
+        bufferlist b1 = params.op.obj.head_data;
+        write_op.meta.data = &b1;
+
+        bufferlist b2;
+        encode("ACL", b2);
+        setattrs[RGW_ATTR_ACL] = b2;
+
+        ret = write_op.write_meta(0, params.op.obj.state.size, b1.length()+1, setattrs);
+        return ret;
+      }
+  };
+}
+
+// seed a user row with known values; later Get/Store user tests depend on it
+TEST_F(DBStoreTest, InsertUser) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  params.op.user.uinfo.user_id.tenant = "tenant";
+  params.op.user.uinfo.user_email = "user1@dbstore.com";
+  params.op.user.uinfo.suspended = 123;
+  params.op.user.uinfo.max_buckets = 456;
+  params.op.user.uinfo.assumed_role_arn = "role";
+  params.op.user.uinfo.placement_tags.push_back("tags");
+  RGWAccessKey k1("id1", "key1");
+  RGWAccessKey k2("id2", "key2");
+  params.op.user.uinfo.access_keys["id1"] = k1;
+  params.op.user.uinfo.access_keys["id2"] = k2;
+  params.op.user.user_version.ver = 1;    
+  params.op.user.user_version.tag = "UserTAG";    
+
+  ret = db->ProcessOp(dpp, "InsertUser", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+// read back the user inserted by InsertUser and verify each field round-trips
+TEST_F(DBStoreTest, GetUser) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  ret = db->ProcessOp(dpp, "GetUser", &params);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(params.op.user.uinfo.user_id.tenant, "tenant");
+  ASSERT_EQ(params.op.user.uinfo.user_email, "user1@dbstore.com");
+  ASSERT_EQ(params.op.user.uinfo.user_id.id, "user_id1");
+  ASSERT_EQ(params.op.user.uinfo.suspended, 123);
+  ASSERT_EQ(params.op.user.uinfo.max_buckets, 456);
+  ASSERT_EQ(params.op.user.uinfo.assumed_role_arn, "role");
+  ASSERT_EQ(params.op.user.uinfo.placement_tags.back(), "tags");
+  RGWAccessKey k;
+  map<string, RGWAccessKey>::iterator it2 = params.op.user.uinfo.access_keys.begin();
+  k = it2->second;
+  ASSERT_EQ(k.id, "id1");
+  ASSERT_EQ(k.key, "key1");
+  it2++;
+  k = it2->second;
+  ASSERT_EQ(k.id, "id2");
+  ASSERT_EQ(k.key, "key2");
+
+}
+
+// same user lookup, but driven through the query_str="email" path
+TEST_F(DBStoreTest, GetUserQuery) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  params.op.query_str = "email";
+  params.op.user.uinfo.user_email = "user1@dbstore.com";
+
+  ret = db->ProcessOp(dpp, "GetUser", &params);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(params.op.user.uinfo.user_id.tenant, "tenant");
+  ASSERT_EQ(params.op.user.uinfo.user_email, "user1@dbstore.com");
+  ASSERT_EQ(params.op.user.uinfo.user_id.id, "user_id1");
+  ASSERT_EQ(params.op.user.uinfo.suspended, 123);
+  ASSERT_EQ(params.op.user.uinfo.max_buckets, 456);
+  ASSERT_EQ(params.op.user.uinfo.assumed_role_arn, "role");
+  ASSERT_EQ(params.op.user.uinfo.placement_tags.back(), "tags");
+  RGWAccessKey k;
+  map<string, RGWAccessKey>::iterator it2 = params.op.user.uinfo.access_keys.begin();
+  k = it2->second;
+  ASSERT_EQ(k.id, "id1");
+  ASSERT_EQ(k.key, "key1");
+  it2++;
+  k = it2->second;
+  ASSERT_EQ(k.id, "id2");
+  ASSERT_EQ(k.key, "key2");
+
+}
+
+// user lookup through the higher-level get_user() API, keyed by email
+TEST_F(DBStoreTest, GetUserQueryByEmail) {
+  int ret = -1;
+  RGWUserInfo uinfo;
+  string email = "user1@dbstore.com";
+  map<std::string, bufferlist> attrs;
+  RGWObjVersionTracker objv;
+
+  ret = db->get_user(dpp, "email", email, uinfo, &attrs, &objv);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(uinfo.user_id.tenant, "tenant");
+  ASSERT_EQ(uinfo.user_email, "user1@dbstore.com");
+  ASSERT_EQ(uinfo.user_id.id, "user_id1");
+  ASSERT_EQ(uinfo.suspended, 123);
+  ASSERT_EQ(uinfo.max_buckets, 456);
+  ASSERT_EQ(uinfo.assumed_role_arn, "role");
+  ASSERT_EQ(uinfo.placement_tags.back(), "tags");
+  RGWAccessKey k;
+  map<string, RGWAccessKey>::iterator it2 = uinfo.access_keys.begin();
+  k = it2->second;
+  ASSERT_EQ(k.id, "id1");
+  ASSERT_EQ(k.key, "key1");
+  it2++;
+  k = it2->second;
+  ASSERT_EQ(k.id, "id2");
+  ASSERT_EQ(k.key, "key2");
+  ASSERT_EQ(objv.read_version.ver, 1);
+}
+
+// user lookup through get_user() keyed by access key; attrs/objv not requested
+TEST_F(DBStoreTest, GetUserQueryByAccessKey) {
+  int ret = -1;
+  RGWUserInfo uinfo;
+  string key = "id1";
+
+  ret = db->get_user(dpp, "access_key", key, uinfo, nullptr, nullptr);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(uinfo.user_id.tenant, "tenant");
+  ASSERT_EQ(uinfo.user_email, "user1@dbstore.com");
+  ASSERT_EQ(uinfo.user_id.id, "user_id1");
+  ASSERT_EQ(uinfo.suspended, 123);
+  ASSERT_EQ(uinfo.max_buckets, 456);
+  ASSERT_EQ(uinfo.assumed_role_arn, "role");
+  ASSERT_EQ(uinfo.placement_tags.back(), "tags");
+  RGWAccessKey k;
+  map<string, RGWAccessKey>::iterator it2 = uinfo.access_keys.begin();
+  k = it2->second;
+  ASSERT_EQ(k.id, "id1");
+  ASSERT_EQ(k.key, "key1");
+  it2++;
+  k = it2->second;
+  ASSERT_EQ(k.id, "id2");
+  ASSERT_EQ(k.key, "key2");
+}
+
+// exercise store_user(): non-exclusive create, version-mismatch rejection
+// (ECANCELED), exclusive update with matching version, and a plain update
+// that bumps the version
+TEST_F(DBStoreTest, StoreUser) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  RGWUserInfo uinfo, old_uinfo;
+  map<std::string, bufferlist> attrs;
+  RGWObjVersionTracker objv_tracker;
+
+  bufferlist attr1, attr2;
+  encode("attrs1", attr1);
+  attrs["attr1"] = attr1;
+  encode("attrs2", attr2);
+  attrs["attr2"] = attr2;
+
+  uinfo.user_id.id = "user_id2";
+  uinfo.user_id.tenant = "tenant";
+  uinfo.user_email = "user2@dbstore.com";
+  uinfo.suspended = 123;
+  uinfo.max_buckets = 456;
+  uinfo.assumed_role_arn = "role";
+  uinfo.placement_tags.push_back("tags");
+  RGWAccessKey k1("id1", "key1");
+  RGWAccessKey k2("id2", "key2");
+  uinfo.access_keys["id1"] = k1;
+  uinfo.access_keys["id2"] = k2;
+
+  /* non exclusive create..should create new one */
+  ret = db->store_user(dpp, uinfo, false, &attrs, &objv_tracker, &old_uinfo);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(old_uinfo.user_email, "");
+  ASSERT_EQ(objv_tracker.read_version.ver, 1);
+  ASSERT_EQ(objv_tracker.read_version.tag, "UserTAG");
+
+  /* invalid version number */
+  objv_tracker.read_version.ver = 4;
+  ret = db->store_user(dpp, uinfo, true, &attrs, &objv_tracker, &old_uinfo);
+  ASSERT_EQ(ret, -125); /* returns ECANCELED */
+  ASSERT_EQ(old_uinfo.user_id.id, uinfo.user_id.id);
+  ASSERT_EQ(old_uinfo.user_email, uinfo.user_email);
+
+  /* exclusive create..should not create new one */
+  uinfo.user_email = "user2_new@dbstore.com";
+  objv_tracker.read_version.ver = 1;
+  ret = db->store_user(dpp, uinfo, true, &attrs, &objv_tracker, &old_uinfo);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(old_uinfo.user_email, "user2@dbstore.com");
+  ASSERT_EQ(objv_tracker.read_version.ver, 1);
+
+  ret = db->store_user(dpp, uinfo, false, &attrs, &objv_tracker, &old_uinfo);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(old_uinfo.user_email, "user2@dbstore.com");
+  ASSERT_EQ(objv_tracker.read_version.ver, 2);
+  ASSERT_EQ(objv_tracker.read_version.tag, "UserTAG");
+}
+
+// fetch the user written by StoreUser by user_id and verify fields,
+// version (bumped to 2 by the final update) and both stored attrs
+TEST_F(DBStoreTest, GetUserQueryByUserID) {
+  int ret = -1;
+  RGWUserInfo uinfo;
+  map<std::string, bufferlist> attrs;
+  RGWObjVersionTracker objv;
+
+  uinfo.user_id.tenant = "tenant";
+  uinfo.user_id.id = "user_id2";
+
+  ret = db->get_user(dpp, "user_id", "user_id2", uinfo, &attrs, &objv);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(uinfo.user_id.tenant, "tenant");
+  ASSERT_EQ(uinfo.user_email, "user2_new@dbstore.com");
+  ASSERT_EQ(uinfo.user_id.id, "user_id2");
+  ASSERT_EQ(uinfo.suspended, 123);
+  ASSERT_EQ(uinfo.max_buckets, 456);
+  ASSERT_EQ(uinfo.assumed_role_arn, "role");
+  ASSERT_EQ(uinfo.placement_tags.back(), "tags");
+  RGWAccessKey k;
+  map<string, RGWAccessKey>::iterator it = uinfo.access_keys.begin();
+  k = it->second;
+  ASSERT_EQ(k.id, "id1");
+  ASSERT_EQ(k.key, "key1");
+  it++;
+  k = it->second;
+  ASSERT_EQ(k.id, "id2");
+  ASSERT_EQ(k.key, "key2");
+
+  ASSERT_EQ(objv.read_version.ver, 2);
+
+  bufferlist k1, k2;
+  string attr;
+  map<std::string, bufferlist>::iterator it2 = attrs.begin();
+  k1 = it2->second;
+  decode(attr, k1);
+  ASSERT_EQ(attr, "attrs1");
+  it2++;
+  k2 = it2->second;
+  decode(attr, k2);
+  ASSERT_EQ(attr, "attrs2");
+}
+
+// smoke test: listing all users should succeed (result contents not checked)
+TEST_F(DBStoreTest, ListAllUsers) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  ret = db->ListAllUsers(dpp, &params);
+  ASSERT_EQ(ret, 0);
+}
+
+// seed bucket1 with known marker/size/version values for the bucket tests
+TEST_F(DBStoreTest, InsertBucket) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  params.op.bucket.info.bucket.name = "bucket1";
+  params.op.bucket.info.bucket.tenant = "tenant";
+  params.op.bucket.info.bucket.marker = "marker1";
+
+  params.op.bucket.ent.size = 1024;
+
+  params.op.bucket.info.has_instance_obj = false;
+  params.op.bucket.bucket_version.ver = 1;
+  params.op.bucket.bucket_version.tag = "read_tag";
+
+  params.op.bucket.mtime = bucket_mtime;
+
+  ret = db->ProcessOp(dpp, "InsertBucket", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+// update_bucket("attrs"): wrong objv is rejected with ECANCELED,
+// matching objv succeeds and bumps the version
+TEST_F(DBStoreTest, UpdateBucketAttrs) {
+  int ret = -1;
+  RGWBucketInfo info;
+  map<std::string, bufferlist> attrs;
+  RGWObjVersionTracker objv;
+
+  bufferlist aclbl, aclbl2;
+  encode("attrs1", aclbl);
+  attrs["attr1"] = aclbl;
+  encode("attrs2", aclbl2);
+  attrs["attr2"] = aclbl2;
+
+  info.bucket.name = "bucket1";
+
+  /* invalid version number */
+  objv.read_version.ver = 4;
+  ret = db->update_bucket(dpp, "attrs", info, false, nullptr, &attrs, &bucket_mtime, &objv);
+  ASSERT_EQ(ret, -125); /* returns ECANCELED */
+
+  /* right version number */
+  objv.read_version.ver = 1;
+  ret = db->update_bucket(dpp, "attrs", info, false, nullptr, &attrs, &bucket_mtime, &objv);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(objv.read_version.ver, 2);
+}
+
+// update_bucket("info"): re-read bucket1, change its marker, and verify
+// the stored version advances to 3
+TEST_F(DBStoreTest, UpdateBucketInfo) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  RGWBucketInfo info;
+
+  params.op.bucket.info.bucket.name = "bucket1";
+
+  ret = db->ProcessOp(dpp, "GetBucket", &params);
+  ASSERT_EQ(ret, 0);
+
+  info = params.op.bucket.info;
+
+  info.bucket.marker = "marker2";
+  ret = db->update_bucket(dpp, "info", info, false, nullptr, nullptr, &bucket_mtime, nullptr);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(info.objv_tracker.read_version.ver, 3);
+}
+
+// read bucket1 back and verify the state accumulated by the previous
+// Insert/UpdateAttrs/UpdateInfo tests (marker2, ver 3, both attrs)
+TEST_F(DBStoreTest, GetBucket) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  params.op.bucket.info.bucket.name = "bucket1";
+  ret = db->ProcessOp(dpp, "GetBucket", &params);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(params.op.bucket.info.bucket.name, "bucket1");
+  ASSERT_EQ(params.op.bucket.info.bucket.tenant, "tenant");
+  ASSERT_EQ(params.op.bucket.info.bucket.marker, "marker2");
+  ASSERT_EQ(params.op.bucket.ent.size, 1024);
+  ASSERT_EQ(params.op.bucket.ent.bucket.name, "bucket1");
+  ASSERT_EQ(params.op.bucket.ent.bucket.tenant, "tenant");
+  ASSERT_EQ(params.op.bucket.info.has_instance_obj, false);
+  ASSERT_EQ(params.op.bucket.info.objv_tracker.read_version.ver, 3);
+  ASSERT_EQ(params.op.bucket.info.objv_tracker.read_version.tag, "read_tag");
+  ASSERT_EQ(params.op.bucket.mtime, bucket_mtime);
+  ASSERT_EQ(params.op.bucket.info.owner.id, "user_id1");
+  bufferlist k, k2;
+  string acl;
+  map<std::string, bufferlist>::iterator it2 = params.op.bucket.bucket_attrs.begin();
+  k = it2->second;
+  decode(acl, k);
+  ASSERT_EQ(acl, "attrs1");
+  it2++;
+  k2 = it2->second;
+  decode(acl, k2);
+  ASSERT_EQ(acl, "attrs2");
+}
+
+TEST_F(DBStoreTest, CreateBucket) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  RGWBucketInfo info;
+  RGWUserInfo owner;
+  rgw_bucket bucket;
+  obj_version objv;
+  rgw_placement_rule rule;
+  map<std::string, bufferlist> attrs;
+
+  owner.user_id.id = "user_id1";
+  bucket.name = "bucket1";
+  bucket.tenant = "tenant";
+
+  objv.ver = 2;
+  objv.tag = "write_tag";
+
+  rule.name = "rule1";
+  rule.storage_class = "sc1";
+
+  ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
+      attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
+      null_yield, false);
+  ASSERT_EQ(ret, 0);
+  bucket.name = "bucket2";
+  ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
+      attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
+      null_yield, false);
+  ASSERT_EQ(ret, 0);
+  bucket.name = "bucket3";
+  ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
+      attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
+      null_yield, false);
+  ASSERT_EQ(ret, 0);
+  bucket.name = "bucket4";
+  ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
+      attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
+      null_yield, false);
+  ASSERT_EQ(ret, 0);
+  bucket.name = "bucket5";
+  ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
+      attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
+      null_yield, false);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, GetBucketQueryByName) {
+  // Fetches "bucket2" by name and verifies all the fields stored by the
+  // CreateBucket test (owner, objv, zonegroup, creation time, placement).
+  int ret = -1;
+  RGWBucketInfo binfo;
+  binfo.bucket.name = "bucket2";
+  rgw::sal::Attrs attrs;
+  ceph::real_time mtime;
+  obj_version objv;
+
+  ret = db->get_bucket_info(dpp, "name", "", binfo, &attrs, &mtime, &objv);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(binfo.bucket.name, "bucket2");
+  ASSERT_EQ(binfo.bucket.tenant, "tenant");
+  ASSERT_EQ(binfo.owner.id, "user_id1");
+  ASSERT_EQ(binfo.objv_tracker.read_version.ver, 2);
+  ASSERT_EQ(binfo.objv_tracker.read_version.tag, "write_tag");
+  ASSERT_EQ(binfo.zonegroup, "zid");
+  ASSERT_EQ(binfo.creation_time, bucket_mtime);
+  ASSERT_EQ(binfo.placement_rule.name, "rule1");
+  ASSERT_EQ(binfo.placement_rule.storage_class, "sc1");
+  ASSERT_EQ(objv.ver, 2);
+  ASSERT_EQ(objv.tag, "write_tag");
+
+  // Save the store-generated marker for later tests (fixture-level state).
+  marker1 = binfo.bucket.marker;
+}
+
+TEST_F(DBStoreTest, ListUserBuckets) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  rgw_user owner;
+  int max = 2;
+  bool need_stats = true;
+  bool is_truncated = false;
+  RGWUserBuckets ulist;
+
+  owner.id = "user_id1";
+
+  marker1 = "";
+  do {
+    is_truncated = false;
+    ret = db->list_buckets(dpp, "", owner, marker1, "", max, need_stats, &ulist,
+          &is_truncated);
+    ASSERT_EQ(ret, 0);
+
+    cout << "marker1 :" << marker1 << "\n";
+
+    cout << "is_truncated :" << is_truncated << "\n";
+
+    for (const auto& ent: ulist.get_buckets()) {
+      RGWBucketEnt e = ent.second;
+      cout << "###################### \n";
+      cout << "ent.bucket.id : " << e.bucket.name << "\n";
+      cout << "ent.bucket.marker : " << e.bucket.marker << "\n";
+      cout << "ent.bucket.bucket_id : " << e.bucket.bucket_id << "\n";
+      cout << "ent.size : " << e.size << "\n";
+      cout << "ent.rule.name : " << e.placement_rule.name << "\n";
+
+      marker1 = e.bucket.name;
+    }
+    ulist.clear();
+  } while(is_truncated);
+}
+
+TEST_F(DBStoreTest, BucketChown) {
+  // Changes the owner of "bucket5" to user_id2; the bucket's object
+  // version is expected to be bumped to 3 by the update.
+  int ret = -1;
+  RGWBucketInfo info;
+  rgw_user user;
+  user.id = "user_id2";
+
+  info.bucket.name = "bucket5";
+
+  ret = db->update_bucket(dpp, "owner", info, false, &user, nullptr, &bucket_mtime, nullptr);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(info.objv_tracker.read_version.ver, 3);
+}
+
+TEST_F(DBStoreTest, ListAllBuckets) {
+  // Dumps every bucket row via the debug ListAllBuckets op; checks only
+  // that the operation succeeds.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  ret = db->ListAllBuckets(dpp, &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, ListAllBuckets2) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  rgw_user owner;
+  int max = 2;
+  bool need_stats = true;
+  bool is_truncated = false;
+  RGWUserBuckets ulist;
+
+  marker1 = "";
+  do {
+    is_truncated = false;
+    ret = db->list_buckets(dpp, "all", owner, marker1, "", max, need_stats, &ulist,
+          &is_truncated);
+    ASSERT_EQ(ret, 0);
+
+    cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n";
+    cout << "ownerID : " << owner.id << "\n";
+    cout << "marker1 :" << marker1 << "\n";
+
+    cout << "is_truncated :" << is_truncated << "\n";
+
+    for (const auto& ent: ulist.get_buckets()) {
+      RGWBucketEnt e = ent.second;
+      cout << "###################### \n";
+      cout << "ent.bucket.id : " << e.bucket.name << "\n";
+      cout << "ent.bucket.marker : " << e.bucket.marker << "\n";
+      cout << "ent.bucket.bucket_id : " << e.bucket.bucket_id << "\n";
+      cout << "ent.size : " << e.size << "\n";
+      cout << "ent.rule.name : " << e.placement_rule.name << "\n";
+
+      marker1 = e.bucket.name;
+    }
+    ulist.clear();
+  } while(is_truncated);
+}
+
+TEST_F(DBStoreTest, RemoveBucketAPI) {
+  // Removes "bucket5" through the high-level remove_bucket() API.
+  RGWBucketInfo binfo;
+  binfo.bucket.name = "bucket5";
+
+  const int rc = db->remove_bucket(dpp, binfo);
+  ASSERT_EQ(rc, 0);
+}
+
+TEST_F(DBStoreTest, RemoveUserAPI) {
+  // Removes user_id2, first proving that a stale object-version causes the
+  // removal to be rejected.
+  int ret = -1;
+  RGWUserInfo uinfo;
+  RGWObjVersionTracker objv;
+
+  uinfo.user_id.tenant = "tenant";
+  uinfo.user_id.id = "user_id2";
+
+  /* invalid version number...should fail */
+  objv.read_version.ver = 4;
+  ret = db->remove_user(dpp, uinfo, &objv);
+  ASSERT_EQ(ret, -ECANCELED);  // symbolic errno instead of magic -125
+
+  /* correct version - removal succeeds */
+  objv.read_version.ver = 2;
+  ret = db->remove_user(dpp, uinfo, &objv);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, PutObject) {
+  // Inserts the default object plus "object2"/"object3"; their head data,
+  // category and sizes are checked by the Get*/Read*/Delete* tests below.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  params.op.obj.category = RGWObjCategory::Main;
+  params.op.obj.storage_class = "STANDARD";
+  bufferlist b1;
+  encode("HELLO WORLD", b1);
+  cout<<"XXXXXXXXX Insert b1.length " << b1.length() << "\n";
+  params.op.obj.head_data = b1;
+  params.op.obj.state.size = 12;
+  params.op.obj.state.is_olh = false;
+  ret = db->ProcessOp(dpp, "PutObject", &params);
+  ASSERT_EQ(ret, 0);
+
+  /* Insert another objects */
+  params.op.obj.state.obj.key.name = "object2";
+  params.op.obj.state.obj.key.instance = "inst2";
+  ret = db->ProcessOp(dpp, "PutObject", &params);
+  ASSERT_EQ(ret, 0);
+
+  params.op.obj.state.obj.key.name = "object3";
+  params.op.obj.state.obj.key.instance = "inst3";
+  ret = db->ProcessOp(dpp, "PutObject", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, ListAllObjects) {
+  // Dumps every object row via the debug ListAllObjects op; success only.
+  struct DBOpParams op_params = GlobalParams;
+
+  const int rc = db->ListAllObjects(dpp, &op_params);
+  ASSERT_GE(rc, 0);
+}
+
+TEST_F(DBStoreTest, GetObject) {
+  // Reads back the default object written by PutObject and verifies the
+  // stored category, storage class, head data and size.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  ret = db->ProcessOp(dpp, "GetObject", &params);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(params.op.obj.category, RGWObjCategory::Main);
+  ASSERT_EQ(params.op.obj.storage_class, "STANDARD");
+  string data;
+  decode(data, params.op.obj.head_data);
+  ASSERT_EQ(data, "HELLO WORLD");
+  ASSERT_EQ(params.op.obj.state.size, 12);
+  cout << "versionNum :" << params.op.obj.version_num << "\n";
+}
+
+TEST_F(DBStoreTest, GetObjectState) {
+  // Fetches object2's state both via the explicit get_obj_state() call and
+  // via the cached get_state() wrapper; both must agree.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  RGWObjState* s;
+
+  params.op.obj.state.obj.key.name = "object2";
+  params.op.obj.state.obj.key.instance = "inst2";
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  ret = op_target.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
+      false, &s);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(s->size, 12);
+  ASSERT_EQ(s->is_olh, false);
+  cout << "versionNum :" << params.op.obj.version_num << "\n";
+
+  /* Recheck with get_state API */
+  ret = op_target.get_state(dpp, &s, false);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(s->size, 12);
+  ASSERT_EQ(s->is_olh, false);
+  cout << "versionNum :" << params.op.obj.version_num << "\n";
+}
+
+TEST_F(DBStoreTest, ObjAttrs) {
+  // Round-trips object xattrs: set three attrs, read them back, remove one
+  // and verify only the removed key disappears.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  map<string, bufferlist> setattrs;
+  map<string, bufferlist> rmattrs;
+  map<string, bufferlist> readattrs;
+
+  bufferlist b1, b2, b3;
+  encode("ACL", b1);
+  setattrs[RGW_ATTR_ACL] = b1;
+  encode("LC", b2);
+  setattrs[RGW_ATTR_LC] = b2;
+  encode("ETAG", b3);
+  setattrs[RGW_ATTR_ETAG] = b3;
+
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  /* Set some attrs */
+  ret = op_target.set_attrs(dpp, setattrs, nullptr);
+  ASSERT_EQ(ret, 0);
+
+  /* read those attrs */
+  DB::Object::Read read_op(&op_target);
+  read_op.params.attrs = &readattrs;
+  ret = read_op.prepare(dpp);
+  ASSERT_EQ(ret, 0);
+
+  string val;
+  decode(val, readattrs[RGW_ATTR_ACL]);
+  ASSERT_EQ(val, "ACL");
+  decode(val, readattrs[RGW_ATTR_LC]);
+  ASSERT_EQ(val, "LC");
+  decode(val, readattrs[RGW_ATTR_ETAG]);
+  ASSERT_EQ(val, "ETAG");
+
+  /* Remove some attrs */
+  rmattrs[RGW_ATTR_ACL] = b1;
+  map<string, bufferlist> empty;
+  ret = op_target.set_attrs(dpp, empty, &rmattrs);
+  ASSERT_EQ(ret, 0);
+
+  /* read those attrs */
+  ret = read_op.prepare(dpp);
+  ASSERT_EQ(ret, 0);
+
+  // the removed ACL attr must be gone; the other two survive
+  ASSERT_EQ(readattrs.count(RGW_ATTR_ACL), 0);
+  decode(val, readattrs[RGW_ATTR_LC]);
+  ASSERT_EQ(val, "LC");
+  decode(val, readattrs[RGW_ATTR_ETAG]);
+  ASSERT_EQ(val, "ETAG");
+}
+
+TEST_F(DBStoreTest, WriteObject) {
+  // Overwrites object3's head data via the fixture's write_object() helper;
+  // ReadObject below verifies the new contents and size.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  params.op.obj.state.obj.key.name = "object3";
+  params.op.obj.state.obj.key.instance = "inst3";
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  bufferlist b1;
+  encode("HELLO WORLD - Object3", b1);
+  params.op.obj.head_data = b1;
+  params.op.obj.state.size = 22;
+
+  ret = write_object(dpp, params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, ReadObject) {
+  // Reads object3 (written by WriteObject) through the Read op: prepare()
+  // fills attrs/size, read() returns the head data.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  map<string, bufferlist> readattrs;
+  params.op.obj.state.obj.key.name = "object3";
+  params.op.obj.state.obj.key.instance = "inst3";
+  uint64_t obj_size;
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+  DB::Object::Read read_op(&op_target);
+  read_op.params.attrs = &readattrs;
+  read_op.params.obj_size = &obj_size;
+  ret = read_op.prepare(dpp);
+  ASSERT_EQ(ret, 0);
+
+  bufferlist bl;
+  // read() returns the number of bytes read (25 requested)
+  ret = read_op.read(0, 25, bl, dpp);
+  cout<<"XXXXXXXXX Insert bl.length " << bl.length() << "\n";
+  ASSERT_EQ(ret, 25);
+
+  string data;
+  decode(data, bl);
+  ASSERT_EQ(data, "HELLO WORLD - Object3");
+  ASSERT_EQ(obj_size, 22);
+}
+
+TEST_F(DBStoreTest, IterateObject) {
+  // Iterates over the first 15 bytes of the default object, collecting the
+  // chunks through the DBGetDataCB callback.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  map<string, bufferlist> readattrs;
+  uint64_t obj_size;
+  DBGetDataCB cb;
+
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+  DB::Object::Read read_op(&op_target);
+  read_op.params.attrs = &readattrs;
+  read_op.params.obj_size = &obj_size;
+  ret = read_op.prepare(dpp);
+  ASSERT_EQ(ret, 0);
+
+  bufferlist bl;
+  ret = read_op.iterate(dpp, 0, 15, &cb);
+  ASSERT_EQ(ret, 0);
+  string data;
+  decode(data, cb.data_bl);
+  cout << "XXXXXXXXXX iterate data is " << data << ", bl_ofs = " << cb.data_ofs << ", bl_len = " << cb.data_len << "\n";
+  ASSERT_EQ(data, "HELLO WORLD");
+  ASSERT_EQ(cb.data_ofs, 0);
+  ASSERT_EQ(cb.data_len, 15);
+}
+
+TEST_F(DBStoreTest, ListBucketObjects) {
+  // Pages through the bucket's objects two at a time using the Bucket::List
+  // op, advancing the marker from get_next_marker() each page.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  
+  int max = 2;
+  bool is_truncated = false;
+  rgw_obj_key marker1;
+  DB::Bucket target(db, params.op.bucket.info);
+  DB::Bucket::List list_op(&target);
+
+  vector<rgw_bucket_dir_entry> dir_list;
+
+  marker1.name = "";
+  do {
+    is_truncated = false;
+    list_op.params.marker = marker1;
+    ret = list_op.list_objects(dpp, max, &dir_list, nullptr, &is_truncated);
+    ASSERT_EQ(ret, 0);
+
+    cout << "marker1 :" << marker1.name << "\n";
+
+    cout << "is_truncated :" << is_truncated << "\n";
+
+    for (const auto& ent: dir_list) {
+      cls_rgw_obj_key key = ent.key;
+      cout << "###################### \n";
+      cout << "key.name : " << key.name << "\n";
+      cout << "key.instance : " << key.instance << "\n";
+
+      marker1 = list_op.get_next_marker();
+    }
+    dir_list.clear();
+  } while(is_truncated);
+}
+
+TEST_F(DBStoreTest, DeleteObj) {
+  // Deletes object2 and verifies that a subsequent state lookup fails with
+  // -ENOENT.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  RGWObjState *s;
+
+  /* delete object2 */
+  params.op.obj.state.obj.key.name = "object2";
+  params.op.obj.state.obj.key.instance = "inst2";
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  DB::Object::Delete delete_op(&op_target);
+  ret = delete_op.delete_obj(dpp);
+  ASSERT_EQ(ret, 0);
+
+  /* Should return ENOENT */
+  ret = op_target.get_state(dpp, &s, false);
+  ASSERT_EQ(ret, -ENOENT);  // symbolic errno instead of magic -2
+}
+
+TEST_F(DBStoreTest, WriteVersionedObject) {
+  // Writes three versions (inst1..inst3) of "object1"; the List/Read/Delete
+  // VersionedObject tests below depend on these exact instances and sizes.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::string instances[] = {"inst1", "inst2", "inst3"};
+  bufferlist b1;
+
+  params.op.obj.flags |= rgw_bucket_dir_entry::FLAG_CURRENT;
+  params.op.obj.state.obj.key.name = "object1";
+
+  /* Write versioned objects */
+  DB::Object op_target(db, params.op.bucket.info, params.op.obj.state.obj);
+  DB::Object::Write write_op(&op_target);
+
+  /* Version1 */
+  params.op.obj.state.obj.key.instance = instances[0];
+  encode("HELLO WORLD", b1);
+  params.op.obj.head_data = b1;
+  params.op.obj.state.size = 12;
+  ret = write_object(dpp, params);
+  ASSERT_EQ(ret, 0);
+
+  /* Version2 */
+  params.op.obj.state.obj.key.instance = instances[1];
+  b1.clear();
+  encode("HELLO WORLD ABC", b1);
+  params.op.obj.head_data = b1;
+  params.op.obj.state.size = 16;
+  ret = write_object(dpp, params);
+  ASSERT_EQ(ret, 0);
+
+  /* Version3 */
+  params.op.obj.state.obj.key.instance = instances[2];
+  b1.clear();
+  encode("HELLO WORLD A", b1);
+  params.op.obj.head_data = b1;
+  params.op.obj.state.size = 14;
+  ret = write_object(dpp, params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, ListVersionedObject) {
+  // Lists the versions written by WriteVersionedObject and verifies they
+  // come back newest-first (inst3, inst2, inst1).
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::string instances[] = {"inst1", "inst2", "inst3"};
+
+  /* list versioned objects */
+  params.op.obj.state.obj.key.instance.clear();
+  params.op.list_max_count = MAX_VERSIONED_OBJECTS;
+  ret = db->ProcessOp(dpp, "ListVersionedObjects", &params);
+  ASSERT_EQ(ret, 0);
+
+  int i = 2;  // newest version is listed first
+  for (const auto& ent: params.op.obj.list_entries) {
+    ASSERT_EQ(ent.key.instance, instances[i]);
+    i--;
+  }
+}
+
+TEST_F(DBStoreTest, ReadVersionedObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::string instances[] = {"inst1", "inst2", "inst3"};
+  std::string data;
+
+  /* read object.. should fetch latest version */
+  RGWObjState* s;
+  params = GlobalParams;
+  params.op.obj.state.obj.key.instance.clear();
+  DB::Object op_target2(db, params.op.bucket.info, params.op.obj.state.obj);
+  ret = op_target2.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
+                                 true, &s);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(s->obj.key.instance, instances[2]);
+  decode(data, s->data);
+  ASSERT_EQ(data, "HELLO WORLD A");
+  ASSERT_EQ(s->size, 14);
+
+  /* read a particular non-current version */
+  params.op.obj.state.obj.key.instance = instances[1];
+  DB::Object op_target3(db, params.op.bucket.info, params.op.obj.state.obj);
+  ret = op_target3.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
+                                 true, &s);
+  ASSERT_EQ(ret, 0);
+  decode(data, s->data);
+  ASSERT_EQ(data, "HELLO WORLD ABC");
+  ASSERT_EQ(s->size, 16);
+}
+
+TEST_F(DBStoreTest, DeleteVersionedObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::string instances[] = {"inst1", "inst2", "inst3"};
+  std::string data;
+  std::string dm_instance;
+  int i = 0;
+
+  /* Delete object..should create delete marker */
+  params.op.obj.state.obj.key.instance.clear();
+  DB::Object op_target(db, params.op.bucket.info, params.op.obj.state.obj);
+  DB::Object::Delete delete_op(&op_target);
+  delete_op.params.versioning_status |= BUCKET_VERSIONED;
+
+  ret = delete_op.delete_obj(dpp);
+  ASSERT_EQ(ret, 0);
+
+  /* list versioned objects */
+  params = GlobalParams;
+  params.op.obj.state.obj.key.instance.clear();
+  params.op.list_max_count = MAX_VERSIONED_OBJECTS;
+  ret = db->ProcessOp(dpp, "ListVersionedObjects", &params);
+
+  i = 3;
+  for (auto ent: params.op.obj.list_entries) {
+    string is_delete_marker = (ent.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER)? "true" : "false";
+    cout << "ent.name: " << ent.key.name << ". ent.instance: " << ent.key.instance << " is_delete_marker = " << is_delete_marker << "\n";
+
+    if (i == 3) {
+      ASSERT_EQ(is_delete_marker, "true");
+      dm_instance = ent.key.instance;
+    } else {
+      ASSERT_EQ(is_delete_marker, "false");
+      ASSERT_EQ(ent.key.instance, instances[i]);
+    }
+
+    i--;
+  }
+
+  /* read object.. should return -ENOENT */
+  RGWObjState* s;
+  params = GlobalParams;
+  params.op.obj.state.obj.key.instance.clear();
+  DB::Object op_target2(db, params.op.bucket.info, params.op.obj.state.obj);
+  ret = op_target2.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
+                                 true, &s);
+  ASSERT_EQ(ret, -ENOENT);
+
+  /* Delete delete marker..should be able to read object now */ 
+  params.op.obj.state.obj.key.instance = dm_instance;
+  DB::Object op_target3(db, params.op.bucket.info, params.op.obj.state.obj);
+  DB::Object::Delete delete_op2(&op_target3);
+  delete_op2.params.versioning_status |= BUCKET_VERSIONED;
+
+  ret = delete_op2.delete_obj(dpp);
+  ASSERT_EQ(ret, 0);
+
+  /* read object.. should fetch latest version */
+  params = GlobalParams;
+  params.op.obj.state.obj.key.instance.clear();
+  DB::Object op_target4(db, params.op.bucket.info, params.op.obj.state.obj);
+  ret = op_target4.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
+                                 true, &s);
+  ASSERT_EQ(s->obj.key.instance, instances[2]);
+  decode(data, s->data);
+  ASSERT_EQ(data, "HELLO WORLD A");
+  ASSERT_EQ(s->size, 14);
+
+  /* delete latest version using version-id. Next version should get promoted */
+  params.op.obj.state.obj.key.instance = instances[2];
+  DB::Object op_target5(db, params.op.bucket.info, params.op.obj.state.obj);
+  DB::Object::Delete delete_op3(&op_target5);
+  delete_op3.params.versioning_status |= BUCKET_VERSIONED;
+
+  ret = delete_op3.delete_obj(dpp);
+  ASSERT_EQ(ret, 0);
+
+  /* list versioned objects..only two versions should be present
+   * with second version marked as CURRENT */
+  params = GlobalParams;
+  params.op.obj.state.obj.key.instance.clear();
+  params.op.list_max_count = MAX_VERSIONED_OBJECTS;
+  ret = db->ProcessOp(dpp, "ListVersionedObjects", &params);
+
+  i = 1;
+  for (auto ent: params.op.obj.list_entries) {
+
+    if (i == 1) {
+      dm_instance = ent.key.instance;
+    } else {
+      ASSERT_EQ(ent.key.instance, instances[i]);
+    }
+
+    i--;
+  }
+
+}
+
+TEST_F(DBStoreTest, ObjectOmapSetVal) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  string val = "part1_val";
+  bufferlist bl;
+  encode(val, bl);
+  ret = op_target.obj_omap_set_val_by_key(dpp, "part1", bl, false);
+  ASSERT_EQ(ret, 0);
+
+  val = "part2_val";
+  bl.clear();
+  encode(val, bl);
+  ret = op_target.obj_omap_set_val_by_key(dpp, "part2", bl, false);
+  ASSERT_EQ(ret, 0);
+
+  val = "part3_val";
+  bl.clear();
+  encode(val, bl);
+  ret = op_target.obj_omap_set_val_by_key(dpp, "part3", bl, false);
+  ASSERT_EQ(ret, 0);
+
+  val = "part4_val";
+  bl.clear();
+  encode(val, bl);
+  ret = op_target.obj_omap_set_val_by_key(dpp, "part4", bl, false);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, ObjectOmapGetValsByKeys) {
+  // Fetches a subset of the omap keys set by ObjectOmapSetVal and verifies
+  // exactly those two values come back.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::set<std::string> keys;
+  std::map<std::string, bufferlist> vals;
+
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  keys.insert("part2");
+  keys.insert("part4");
+
+  ret = op_target.obj_omap_get_vals_by_keys(dpp, "", keys, &vals);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(vals.size(), 2);
+
+  string val;
+  decode(val, vals["part2"]);
+  ASSERT_EQ(val, "part2_val");
+  decode(val, vals["part4"]);
+  ASSERT_EQ(val, "part4_val");
+}
+
+TEST_F(DBStoreTest, ObjectOmapGetAll) {
+  // Fetches the full omap and verifies all four key/value pairs written by
+  // ObjectOmapSetVal are present.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::map<std::string, bufferlist> vals;
+
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  ret = op_target.obj_omap_get_all(dpp, &vals);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(vals.size(), 4);
+
+  string val;
+  decode(val, vals["part1"]);
+  ASSERT_EQ(val, "part1_val");
+  decode(val, vals["part2"]);
+  ASSERT_EQ(val, "part2_val");
+  decode(val, vals["part3"]);
+  ASSERT_EQ(val, "part3_val");
+  decode(val, vals["part4"]);
+  ASSERT_EQ(val, "part4_val");
+}
+
+TEST_F(DBStoreTest, ObjectOmapGetVals) {
+  // Fetches omap values starting at marker "part3"; only part3/part4 are
+  // expected in the result.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::set<std::string> keys;
+  std::map<std::string, bufferlist> vals;
+  bool pmore;
+
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  ret = op_target.obj_omap_get_vals(dpp, "part3", 10, &vals, &pmore);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(vals.size(), 2);
+
+  string val;
+  decode(val, vals["part3"]);
+  ASSERT_EQ(val, "part3_val");
+  decode(val, vals["part4"]);
+  ASSERT_EQ(val, "part4_val");
+}
+
+TEST_F(DBStoreTest, PutObjectData) {
+  // Inserts one object-data (tail) row for the default object; the values
+  // are verified by GetObjectData after UpdateObjectData bumps the mtime.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  params.op.obj_data.part_num = 1;
+  params.op.obj_data.offset = 10;
+  params.op.obj_data.multipart_part_str = "2";
+  bufferlist b1;
+  encode("HELLO WORLD", b1);
+  params.op.obj_data.data = b1;
+  params.op.obj_data.size = 12;
+  params.op.obj.state.mtime = real_clock::now();
+  ret = db->ProcessOp(dpp, "PutObjectData", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, UpdateObjectData) {
+  // Rewrites the object's mtime to the fixed bucket_mtime so that
+  // GetObjectData can assert on a deterministic value.
+  struct DBOpParams op_params = GlobalParams;
+
+  op_params.op.obj.state.mtime = bucket_mtime;
+  const int rc = db->ProcessOp(dpp, "UpdateObjectData", &op_params);
+  ASSERT_EQ(rc, 0);
+}
+
+TEST_F(DBStoreTest, GetObjectData) {
+  // Reads back the tail row written by PutObjectData and verifies the part
+  // number, offset, multipart string, mtime (from UpdateObjectData) and data.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  params.op.obj.state.obj.key.instance = "inst1";
+  params.op.obj.state.obj.key.name = "object1";
+  ret = db->ProcessOp(dpp, "GetObjectData", &params);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(params.op.obj_data.part_num, 1);
+  ASSERT_EQ(params.op.obj_data.offset, 10);
+  ASSERT_EQ(params.op.obj_data.multipart_part_str, "2");
+  ASSERT_EQ(params.op.obj.state.obj.key.instance, "inst1");
+  ASSERT_EQ(params.op.obj.state.obj.key.name, "object1");
+  ASSERT_EQ(params.op.obj.state.mtime, bucket_mtime);
+  string data;
+  decode(data, params.op.obj_data.data);
+  ASSERT_EQ(data, "HELLO WORLD");
+}
+
+TEST_F(DBStoreTest, DeleteObjectData) {
+  // Removes the object's tail-data rows; success only.
+  struct DBOpParams op_params = GlobalParams;
+
+  const int rc = db->ProcessOp(dpp, "DeleteObjectData", &op_params);
+  ASSERT_EQ(rc, 0);
+}
+
+TEST_F(DBStoreTest, DeleteObject) {
+  // Removes the default object's head row; success only.
+  struct DBOpParams op_params = GlobalParams;
+
+  const int rc = db->ProcessOp(dpp, "DeleteObject", &op_params);
+  ASSERT_EQ(rc, 0);
+}
+
+TEST_F(DBStoreTest, LCTables) {
+  // Creates the lifecycle (LC) tables used by the LCHead/LCEntry tests.
+  int ret = -1;
+
+  ret = db->createLCTables(dpp);
+  ASSERT_GE(ret, 0);
+}
+
+TEST_F(DBStoreTest, LCHead) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::string index1 = "bucket1";
+  std::string index2 = "bucket2";
+  time_t lc_time = ceph_clock_now();
+  std::unique_ptr<rgw::sal::Lifecycle::LCHead> head;
+  std::string ents[] = {"entry1", "entry2", "entry3"};
+  rgw::sal::StoreLifecycle::StoreLCHead head1(lc_time, 0, ents[0]);
+  rgw::sal::StoreLifecycle::StoreLCHead head2(lc_time, 0, ents[1]);
+  rgw::sal::StoreLifecycle::StoreLCHead head3(lc_time, 0, ents[2]);
+
+  ret = db->put_head(index1, head1);
+  ASSERT_EQ(ret, 0);
+  ret = db->put_head(index2, head2);
+  ASSERT_EQ(ret, 0);
+
+  ret = db->get_head(index1, &head);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(head->get_marker(), "entry1");
+
+  ret = db->get_head(index2, &head);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(head->get_marker(), "entry2");
+
+  // update index1
+  ret = db->put_head(index1, head3);
+  ASSERT_EQ(ret, 0);
+  ret = db->get_head(index1, &head);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(head->get_marker(), "entry3");
+
+}
+TEST_F(DBStoreTest, LCEntry) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  uint64_t lc_time = ceph_clock_now();
+  std::string index1 = "lcindex1";
+  std::string index2 = "lcindex2";
+  typedef enum {lc_uninitial = 1, lc_complete} status;
+  std::string ents[] = {"bucket1", "bucket2", "bucket3", "bucket4"};
+  std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry;
+  rgw::sal::StoreLifecycle::StoreLCEntry entry1(ents[0], lc_time, lc_uninitial);
+  rgw::sal::StoreLifecycle::StoreLCEntry entry2(ents[1], lc_time, lc_uninitial);
+  rgw::sal::StoreLifecycle::StoreLCEntry entry3(ents[2], lc_time, lc_uninitial);
+  rgw::sal::StoreLifecycle::StoreLCEntry entry4(ents[3], lc_time, lc_uninitial);
+
+  vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> lc_entries;
+
+  ret = db->set_entry(index1, entry1);
+  ASSERT_EQ(ret, 0);
+  ret = db->set_entry(index1, entry2);
+  ASSERT_EQ(ret, 0);
+  ret = db->set_entry(index1, entry3);
+  ASSERT_EQ(ret, 0);
+  ret = db->set_entry(index2, entry4);
+  ASSERT_EQ(ret, 0);
+
+  // get entry index1, entry1
+  ret = db->get_entry(index1, ents[0], &entry); 
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(entry->get_status(), lc_uninitial);
+  ASSERT_EQ(entry->get_start_time(), lc_time);
+
+  // get next entry index1, entry2
+  ret = db->get_next_entry(index1, ents[1], &entry); 
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(entry->get_bucket(), ents[2]);
+  ASSERT_EQ(entry->get_status(), lc_uninitial);
+  ASSERT_EQ(entry->get_start_time(), lc_time);
+
+  // update entry4 to entry5
+  entry4.status = lc_complete;
+  ret = db->set_entry(index2, entry4);
+  ASSERT_EQ(ret, 0);
+  ret = db->get_entry(index2, ents[3], &entry); 
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(entry->get_status(), lc_complete);
+
+  // list entries
+  ret = db->list_entries(index1, "", 5, lc_entries);
+  ASSERT_EQ(ret, 0);
+  for (const auto& ent: lc_entries) {
+    cout << "###################### \n";
+    cout << "lc entry.bucket : " << ent->get_bucket() << "\n";
+    cout << "lc entry.status : " << ent->get_status() << "\n";
+  }
+
+  // remove index1, entry3
+  ret = db->rm_entry(index1, entry3); 
+  ASSERT_EQ(ret, 0);
+
+  // get next entry index1, entry2.. should be null
+  entry.release();
+  ret = db->get_next_entry(index1, ents[1], &entry); 
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(entry.get(), nullptr);
+}
+
+TEST_F(DBStoreTest, RemoveBucket) {
+  // Removes the default test bucket via the low-level RemoveBucket op.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  ret = db->ProcessOp(dpp, "RemoveBucket", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, RemoveUser) {
+  // Removes the default test user via the low-level RemoveUser op.
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  ret = db->ProcessOp(dpp, "RemoveUser", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, InsertTestIDUser) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  params.op.user.uinfo.user_id.id = "testid";
+  params.op.user.uinfo.display_name = "M. Tester";
+  params.op.user.uinfo.user_id.tenant = "tenant";
+  params.op.user.uinfo.user_email = "tester@ceph.com";
+  RGWAccessKey k1("0555b35654ad1656d804", "h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q==");
+  params.op.user.uinfo.access_keys["0555b35654ad1656d804"] = k1;
+  params.op.user.user_version.ver = 1;    
+  params.op.user.user_version.tag = "UserTAG";    
+
+  ret = db->ProcessOp(dpp, "InsertUser", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+int main(int argc, char **argv)
+{
+  int ret = -1;
+  string c_logfile = "rgw_dbstore_tests.log";
+  int c_loglevel = 20;
+
+  // format: ./dbstore-tests logfile loglevel
+  if (argc == 3) {
+    c_logfile = argv[1];
+    c_loglevel = (atoi)(argv[2]);
+    cout << "logfile:" << c_logfile << ", loglevel set to " << c_loglevel << "\n";
+  }
+
+  ::testing::InitGoogleTest(&argc, argv);
+
+  gtest::env = new gtest::Environment();
+  gtest::env->logfile = c_logfile;
+  gtest::env->loglevel = c_loglevel;
+  ::testing::AddGlobalTestEnvironment(gtest::env);
+
+  ret = RUN_ALL_TESTS();
+
+  return ret;
+}
diff --git a/src/rgw/driver/immutable_config/store.cc b/src/rgw/driver/immutable_config/store.cc
new file mode 100644 (file)
index 0000000..8d3e076
--- /dev/null
@@ -0,0 +1,422 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_zone.h"
+#include "store.h"
+
+namespace rgw::sal {
+
+// Copy the fixed zonegroup/zone/period configuration; it never changes
+// for the lifetime of the store.
+ImmutableConfigStore::ImmutableConfigStore(const RGWZoneGroup& zonegroup,
+                                           const RGWZoneParams& zone,
+                                           const RGWPeriodConfig& period_config)
+    : zonegroup(zonegroup), zone(zone), period_config(period_config)
+{
+}
+
+// Realm
+// No realm is stored: writes are refused (-EROFS) and reads miss (-ENOENT).
+int ImmutableConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
+                                                 optional_yield y, bool exclusive,
+                                                 std::string_view realm_id)
+{
+  return -EROFS;
+}
+
+int ImmutableConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
+                                                optional_yield y,
+                                                std::string& realm_id)
+{
+  return -ENOENT;
+}
+
+int ImmutableConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
+                                                  optional_yield y)
+{
+  return -EROFS;
+}
+
+
+// No realm exists in this store: creation is refused (-EROFS) and all
+// lookups miss (-ENOENT).
+int ImmutableConfigStore::create_realm(const DoutPrefixProvider* dpp,
+                                       optional_yield y, bool exclusive,
+                                       const RGWRealm& info,
+                                       std::unique_ptr<RealmWriter>* writer)
+{
+  return -EROFS;
+}
+
+int ImmutableConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
+                                           optional_yield y,
+                                           std::string_view realm_id,
+                                           RGWRealm& info,
+                                           std::unique_ptr<RealmWriter>* writer)
+{
+  return -ENOENT;
+}
+
+int ImmutableConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string_view realm_name,
+                                             RGWRealm& info,
+                                             std::unique_ptr<RealmWriter>* writer)
+{
+  return -ENOENT;
+}
+
+int ImmutableConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             RGWRealm& info,
+                                             std::unique_ptr<RealmWriter>* writer)
+{
+  return -ENOENT;
+}
+
+int ImmutableConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
+                                        optional_yield y, std::string_view realm_name,
+                                        std::string& realm_id)
+{
+  return -ENOENT;
+}
+
+// The configuration is fixed, so period notifications are reported as
+// unsupported rather than read-only.
+int ImmutableConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
+                                                  optional_yield y,
+                                                  const RGWPeriod& period)
+{
+  return -ENOTSUP;
+}
+
+// No realms exist, so listing always yields an empty result.
+int ImmutableConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
+                                           optional_yield y, const std::string& marker,
+                                           std::span<std::string> entries,
+                                           ListResult<std::string>& result)
+{
+  result.next.clear();
+  result.entries = entries.first(0);
+  return 0;
+}
+
+
+// Period
+// Periods are not stored: creation/deletion fail (-EROFS), reads miss
+// (-ENOENT), and listings are always empty.
+int ImmutableConfigStore::create_period(const DoutPrefixProvider* dpp,
+                                        optional_yield y, bool exclusive,
+                                        const RGWPeriod& info)
+{
+  return -EROFS;
+}
+
+int ImmutableConfigStore::read_period(const DoutPrefixProvider* dpp,
+                                      optional_yield y, std::string_view period_id,
+                                      std::optional<uint32_t> epoch, RGWPeriod& info)
+{
+  return -ENOENT;
+}
+
+int ImmutableConfigStore::delete_period(const DoutPrefixProvider* dpp,
+                                        optional_yield y,
+                                        std::string_view period_id)
+{
+  return -EROFS;
+}
+
+int ImmutableConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
+                                          optional_yield y, const std::string& marker,
+                                          std::span<std::string> entries,
+                                          ListResult<std::string>& result)
+{
+  result.next.clear();
+  result.entries = entries.first(0);
+  return 0;
+}
+
+
+// ZoneGroup
+
+// Writer handed out by the zonegroup read calls below; every mutation is
+// refused with -EROFS since the backing configuration is immutable.
+class ImmutableZoneGroupWriter : public ZoneGroupWriter {
+ public:
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWZoneGroup& info) override
+  {
+    return -EROFS;
+  }
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWZoneGroup& info, std::string_view new_name) override
+  {
+    return -EROFS;
+  }
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    return -EROFS;
+  }
+};
+
+int ImmutableConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                     optional_yield y, bool exclusive,
+                                                     std::string_view realm_id,
+                                                     std::string_view zonegroup_id)
+{
+  return -EROFS;
+}
+
+// The single zonegroup is only the default for the realm-less (empty
+// realm_id) configuration, so a non-empty realm_id never matches.
+int ImmutableConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                    optional_yield y,
+                                                    std::string_view realm_id,
+                                                    std::string& zonegroup_id)
+{
+  if (!realm_id.empty()) {
+    return -ENOENT;
+  }
+  zonegroup_id = zonegroup.id;
+  return 0;
+}
+
+int ImmutableConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                      optional_yield y,
+                                                      std::string_view realm_id)
+{
+  return -EROFS;
+}
+
+
+int ImmutableConfigStore::create_zonegroup(const DoutPrefixProvider* dpp,
+                                           optional_yield y, bool exclusive,
+                                           const RGWZoneGroup& info,
+                                           std::unique_ptr<ZoneGroupWriter>* writer)
+{
+  return -EROFS;
+}
+
+// Serve a copy of the fixed zonegroup when the id matches; the returned
+// writer (if requested) rejects any modification.
+int ImmutableConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+                                               optional_yield y,
+                                               std::string_view zonegroup_id,
+                                               RGWZoneGroup& info,
+                                               std::unique_ptr<ZoneGroupWriter>* writer)
+{
+  if (zonegroup_id != zonegroup.id) {
+    return -ENOENT;
+  }
+
+  info = zonegroup;
+
+  if (writer) {
+    *writer = std::make_unique<ImmutableZoneGroupWriter>();
+  }
+  return 0;
+}
+// Same as read_zonegroup_by_id(), but matched by name.
+int ImmutableConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+                                                 optional_yield y,
+                                                 std::string_view zonegroup_name,
+                                                 RGWZoneGroup& info,
+                                                 std::unique_ptr<ZoneGroupWriter>* writer)
+{
+  if (zonegroup_name != zonegroup.name) {
+    return -ENOENT;
+  }
+
+  info = zonegroup;
+
+  if (writer) {
+    *writer = std::make_unique<ImmutableZoneGroupWriter>();
+  }
+  return 0;
+}
+
+int ImmutableConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp,
+                                                 optional_yield y,
+                                                 std::string_view realm_id,
+                                                 RGWZoneGroup& info,
+                                                 std::unique_ptr<ZoneGroupWriter>* writer)
+{
+  // NOTE(review): realm_id is ignored here, while read_default_zonegroup_id()
+  // only succeeds for an empty realm_id — confirm whether a non-empty
+  // realm_id should also return -ENOENT here.
+  info = zonegroup;
+
+  if (writer) {
+    *writer = std::make_unique<ImmutableZoneGroupWriter>();
+  }
+  return 0;
+}
+
+// List the single zonegroup name, returning it only when it sorts after
+// 'marker'. Assumes 'entries' can hold at least one element in that case
+// — TODO confirm callers never pass an empty span.
+int ImmutableConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp,
+                                               optional_yield y, const std::string& marker,
+                                               std::span<std::string> entries,
+                                               ListResult<std::string>& result)
+{
+  if (marker < zonegroup.name) {
+    entries[0] = zonegroup.name;
+    result.next = zonegroup.name;
+    result.entries = entries.first(1);
+  } else {
+    result.next.clear();
+    result.entries = entries.first(0);
+  }
+  return 0;
+}
+
+// Zone
+
+// Writer handed out by the zone read calls below; every mutation is
+// refused with -EROFS since the backing configuration is immutable.
+class ImmutableZoneWriter : public ZoneWriter {
+ public:
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWZoneParams& info) override
+  {
+    return -EROFS;
+  }
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWZoneParams& info, std::string_view new_name) override
+  {
+    return -EROFS;
+  }
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    return -EROFS;
+  }
+};
+
+int ImmutableConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp,
+                                                optional_yield y, bool exclusive,
+                                                std::string_view realm_id,
+                                                std::string_view zone_id)
+{
+  return -EROFS;
+}
+
+int ImmutableConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp,
+                                               optional_yield y,
+                                               std::string_view realm_id,
+                                               std::string& zone_id)
+{
+  if (realm_id.empty()) {
+    return -ENOENT;
+  }
+  zone_id = zone.id;
+  return 0;
+}
+
+int ImmutableConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp,
+                                                 optional_yield y,
+                                                 std::string_view realm_id)
+{
+  return -EROFS;
+}
+
+
+int ImmutableConfigStore::create_zone(const DoutPrefixProvider* dpp,
+                                      optional_yield y, bool exclusive,
+                                      const RGWZoneParams& info,
+                                      std::unique_ptr<ZoneWriter>* writer)
+{
+  return -EROFS;
+}
+
+// Serve a copy of the fixed zone when the id matches; the returned writer
+// (if requested) rejects any modification.
+int ImmutableConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp,
+                                          optional_yield y,
+                                          std::string_view zone_id,
+                                          RGWZoneParams& info,
+                                          std::unique_ptr<ZoneWriter>* writer)
+{
+  if (zone_id != zone.id) {
+    return -ENOENT;
+  }
+
+  info = zone;
+
+  if (writer) {
+    *writer = std::make_unique<ImmutableZoneWriter>();
+  }
+  return 0;
+}
+
+// Same as read_zone_by_id(), but matched by name.
+int ImmutableConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            std::string_view zone_name,
+                                            RGWZoneParams& info,
+                                            std::unique_ptr<ZoneWriter>* writer)
+{
+  if (zone_name != zone.name) {
+    return -ENOENT;
+  }
+
+  info = zone;
+
+  if (writer) {
+    *writer = std::make_unique<ImmutableZoneWriter>();
+  }
+  return 0;
+}
+
+// The fixed zone is only the default for the realm-less (empty realm_id)
+// configuration, so a non-empty realm_id never matches.
+int ImmutableConfigStore::read_default_zone(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            std::string_view realm_id,
+                                            RGWZoneParams& info,
+                                            std::unique_ptr<ZoneWriter>* writer)
+{
+  if (!realm_id.empty()) {
+    return -ENOENT;
+  }
+
+  info = zone;
+
+  if (writer) {
+    *writer = std::make_unique<ImmutableZoneWriter>();
+  }
+  return 0;
+}
+
+// List the single zone name, returning it only when it sorts after
+// 'marker'. Assumes 'entries' can hold at least one element in that case
+// — TODO confirm callers never pass an empty span.
+int ImmutableConfigStore::list_zone_names(const DoutPrefixProvider* dpp,
+                                          optional_yield y, const std::string& marker,
+                                          std::span<std::string> entries,
+                                          ListResult<std::string>& result)
+{
+  if (marker < zone.name) {
+    entries[0] = zone.name;
+    result.next = zone.name;
+    result.entries = entries.first(1);
+  } else {
+    result.next.clear();
+    result.entries = entries.first(0);
+  }
+  return 0;
+}
+
+
+// PeriodConfig
+// Serve the fixed period config only for the realm-less (empty realm_id)
+// configuration.
+int ImmutableConfigStore::read_period_config(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string_view realm_id,
+                                             RGWPeriodConfig& info)
+{
+  if (!realm_id.empty()) {
+    return -ENOENT;
+  }
+
+  info = period_config;
+  return 0;
+}
+
+int ImmutableConfigStore::write_period_config(const DoutPrefixProvider* dpp,
+                                              optional_yield y, bool exclusive,
+                                              std::string_view realm_id,
+                                              const RGWPeriodConfig& info)
+{
+  return -EROFS;
+}
+
+
+/// ImmutableConfigStore factory function: wraps the given fixed zonegroup,
+/// zone and period config in a read-only ConfigStore.
+auto create_immutable_config_store(const DoutPrefixProvider* dpp,
+                                   const RGWZoneGroup& zonegroup,
+                                   const RGWZoneParams& zone,
+                                   const RGWPeriodConfig& period_config)
+  -> std::unique_ptr<ConfigStore>
+{
+  return std::make_unique<ImmutableConfigStore>(zonegroup, zone, period_config);
+}
+
+} // namespace rgw::sal
diff --git a/src/rgw/driver/immutable_config/store.h b/src/rgw/driver/immutable_config/store.h
new file mode 100644 (file)
index 0000000..9a1ac5f
--- /dev/null
@@ -0,0 +1,180 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal_config.h"
+
+namespace rgw::sal {
+
+/// A read-only ConfigStore that serves the given default zonegroup and zone.
+class ImmutableConfigStore : public ConfigStore {
+ public:
+  // Copies the given zonegroup/zone/period config; they are served
+  // unchanged for the lifetime of the store.
+  explicit ImmutableConfigStore(const RGWZoneGroup& zonegroup,
+                                const RGWZoneParams& zone,
+                                const RGWPeriodConfig& period_config);
+
+  // Realm (none stored: writes return -EROFS, reads -ENOENT)
+  virtual int write_default_realm_id(const DoutPrefixProvider* dpp,
+                                     optional_yield y, bool exclusive,
+                                     std::string_view realm_id) override;
+  virtual int read_default_realm_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y,
+                                    std::string& realm_id) override;
+  virtual int delete_default_realm_id(const DoutPrefixProvider* dpp,
+                                      optional_yield y) override;
+
+  virtual int create_realm(const DoutPrefixProvider* dpp,
+                           optional_yield y, bool exclusive,
+                           const RGWRealm& info,
+                           std::unique_ptr<RealmWriter>* writer) override;
+  virtual int read_realm_by_id(const DoutPrefixProvider* dpp,
+                               optional_yield y,
+                               std::string_view realm_id,
+                               RGWRealm& info,
+                               std::unique_ptr<RealmWriter>* writer) override;
+  virtual int read_realm_by_name(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 std::string_view realm_name,
+                                 RGWRealm& info,
+                                 std::unique_ptr<RealmWriter>* writer) override;
+  virtual int read_default_realm(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 RGWRealm& info,
+                                 std::unique_ptr<RealmWriter>* writer) override;
+  virtual int read_realm_id(const DoutPrefixProvider* dpp,
+                            optional_yield y, std::string_view realm_name,
+                            std::string& realm_id) override;
+  virtual int realm_notify_new_period(const DoutPrefixProvider* dpp,
+                                      optional_yield y,
+                                      const RGWPeriod& period) override;
+  virtual int list_realm_names(const DoutPrefixProvider* dpp,
+                               optional_yield y, const std::string& marker,
+                               std::span<std::string> entries,
+                               ListResult<std::string>& result) override;
+
+  // Period (none stored: writes return -EROFS, reads -ENOENT)
+  virtual int create_period(const DoutPrefixProvider* dpp,
+                            optional_yield y, bool exclusive,
+                            const RGWPeriod& info) override;
+  virtual int read_period(const DoutPrefixProvider* dpp,
+                          optional_yield y, std::string_view period_id,
+                          std::optional<uint32_t> epoch, RGWPeriod& info) override;
+  virtual int delete_period(const DoutPrefixProvider* dpp,
+                            optional_yield y,
+                            std::string_view period_id) override;
+  virtual int list_period_ids(const DoutPrefixProvider* dpp,
+                              optional_yield y, const std::string& marker,
+                              std::span<std::string> entries,
+                              ListResult<std::string>& result) override;
+
+  // ZoneGroup (reads serve the single fixed zonegroup; writes return -EROFS)
+  virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                         optional_yield y, bool exclusive,
+                                         std::string_view realm_id,
+                                         std::string_view zonegroup_id) override;
+  virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                        optional_yield y,
+                                        std::string_view realm_id,
+                                        std::string& zonegroup_id) override;
+  virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                          optional_yield y,
+                                          std::string_view realm_id) override;
+
+  virtual int create_zonegroup(const DoutPrefixProvider* dpp,
+                               optional_yield y, bool exclusive,
+                               const RGWZoneGroup& info,
+                               std::unique_ptr<ZoneGroupWriter>* writer) override;
+  virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+                                   optional_yield y,
+                                   std::string_view zonegroup_id,
+                                   RGWZoneGroup& info,
+                                   std::unique_ptr<ZoneGroupWriter>* writer) override;
+  virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view zonegroup_name,
+                                     RGWZoneGroup& info,
+                                     std::unique_ptr<ZoneGroupWriter>* writer) override;
+  virtual int read_default_zonegroup(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view realm_id,
+                                     RGWZoneGroup& info,
+                                     std::unique_ptr<ZoneGroupWriter>* writer) override;
+  virtual int list_zonegroup_names(const DoutPrefixProvider* dpp,
+                                   optional_yield y, const std::string& marker,
+                                   std::span<std::string> entries,
+                                   ListResult<std::string>& result) override;
+
+  // Zone (reads serve the single fixed zone; writes return -EROFS)
+  virtual int write_default_zone_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y, bool exclusive,
+                                    std::string_view realm_id,
+                                    std::string_view zone_id) override;
+  virtual int read_default_zone_id(const DoutPrefixProvider* dpp,
+                                   optional_yield y,
+                                   std::string_view realm_id,
+                                   std::string& zone_id) override;
+  virtual int delete_default_zone_id(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view realm_id) override;
+
+  virtual int create_zone(const DoutPrefixProvider* dpp,
+                          optional_yield y, bool exclusive,
+                          const RGWZoneParams& info,
+                          std::unique_ptr<ZoneWriter>* writer) override;
+  virtual int read_zone_by_id(const DoutPrefixProvider* dpp,
+                              optional_yield y,
+                              std::string_view zone_id,
+                              RGWZoneParams& info,
+                              std::unique_ptr<ZoneWriter>* writer) override;
+  virtual int read_zone_by_name(const DoutPrefixProvider* dpp,
+                                optional_yield y,
+                                std::string_view zone_name,
+                                RGWZoneParams& info,
+                                std::unique_ptr<ZoneWriter>* writer) override;
+  virtual int read_default_zone(const DoutPrefixProvider* dpp,
+                                optional_yield y,
+                                std::string_view realm_id,
+                                RGWZoneParams& info,
+                                std::unique_ptr<ZoneWriter>* writer) override;
+  virtual int list_zone_names(const DoutPrefixProvider* dpp,
+                              optional_yield y, const std::string& marker,
+                              std::span<std::string> entries,
+                              ListResult<std::string>& result) override;
+
+  // PeriodConfig (read serves the fixed config; write returns -EROFS)
+  virtual int read_period_config(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 std::string_view realm_id,
+                                 RGWPeriodConfig& info) override;
+  virtual int write_period_config(const DoutPrefixProvider* dpp,
+                                  optional_yield y, bool exclusive,
+                                  std::string_view realm_id,
+                                  const RGWPeriodConfig& info) override;
+
+ private:
+  // Fixed configuration copied at construction; never modified.
+  const RGWZoneGroup zonegroup;
+  const RGWZoneParams zone;
+  const RGWPeriodConfig period_config;
+}; // ImmutableConfigStore
+
+
+/// ImmutableConfigStore factory function: wraps the given fixed zonegroup,
+/// zone and period config in a read-only ConfigStore.
+auto create_immutable_config_store(const DoutPrefixProvider* dpp,
+                                   const RGWZoneGroup& zonegroup,
+                                   const RGWZoneParams& zone,
+                                   const RGWPeriodConfig& period_config)
+  -> std::unique_ptr<ConfigStore>;
+
+} // namespace rgw::sal
diff --git a/src/rgw/driver/json_config/store.cc b/src/rgw/driver/json_config/store.cc
new file mode 100644 (file)
index 0000000..330aa34
--- /dev/null
@@ -0,0 +1,176 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <system_error>
+#include "include/buffer.h"
+#include "common/errno.h"
+#include "common/ceph_json.h"
+#include "rgw_zone.h"
+#include "driver/immutable_config/store.h"
+#include "store.h"
+
+namespace rgw::sal {
+
+namespace {
+
+struct DecodedConfig {
+  RGWZoneGroup zonegroup;
+  RGWZoneParams zone;
+  RGWPeriodConfig period_config;
+
+  void decode_json(JSONObj *obj)
+  {
+    JSONDecoder::decode_json("zonegroup", zonegroup, obj);
+    JSONDecoder::decode_json("zone", zone, obj);
+    JSONDecoder::decode_json("period_config", period_config, obj);
+  }
+};
+
+static void parse_config(const DoutPrefixProvider* dpp, const char* filename)
+{
+  bufferlist bl;
+  std::string errmsg;
+  int r = bl.read_file(filename, &errmsg);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to read json config file '" << filename
+        << "': " << errmsg << dendl;
+    throw std::system_error(-r, std::system_category());
+  }
+
+  JSONParser p;
+  if (!p.parse(bl.c_str(), bl.length())) {
+    ldpp_dout(dpp, 0) << "failed to parse json config file" << dendl;
+    throw std::system_error(make_error_code(std::errc::invalid_argument));
+  }
+
+  DecodedConfig config;
+  try {
+    decode_json_obj(config, &p);
+  } catch (const JSONDecoder::err& e) {
+    ldpp_dout(dpp, 0) << "failed to decode JSON input: " << e.what() << dendl;
+    throw std::system_error(make_error_code(std::errc::invalid_argument));
+  }
+}
+
+void sanity_check_config(const DoutPrefixProvider* dpp, DecodedConfig& config)
+{
+  if (config.zonegroup.id.empty()) {
+    config.zonegroup.id = "default";
+  }
+  if (config.zonegroup.name.empty()) {
+    config.zonegroup.name = "default";
+  }
+  if (config.zonegroup.api_name.empty()) {
+    config.zonegroup.api_name = config.zonegroup.name;
+  }
+
+  if (config.zone.id.empty()) {
+    config.zone.id = "default";
+  }
+  if (config.zone.name.empty()) {
+    config.zone.name = "default";
+  }
+
+  // add default placement if it doesn't exist
+  rgw_pool pool;
+  RGWZonePlacementInfo placement;
+  placement.storage_classes.set_storage_class(
+      RGW_STORAGE_CLASS_STANDARD, &pool, nullptr);
+  config.zone.placement_pools.emplace("default-placement",
+                                      std::move(placement));
+
+  std::set<rgw_pool> pools;
+  int r = rgw::init_zone_pool_names(dpp, null_yield, pools, config.zone);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to set default zone pool names" << dendl;
+    throw std::system_error(-r, std::system_category());
+  }
+
+  // verify that config.zonegroup only contains config.zone
+  if (config.zonegroup.zones.size() > 1) {
+    ldpp_dout(dpp, 0) << "zonegroup cannot contain multiple zones" << dendl;
+    throw std::system_error(make_error_code(std::errc::invalid_argument));
+  }
+
+  if (config.zonegroup.zones.size() == 1) {
+    auto z = config.zonegroup.zones.begin();
+    if (z->first != config.zone.id) {
+      ldpp_dout(dpp, 0) << "zonegroup contains unknown zone id="
+          << z->first << dendl;
+      throw std::system_error(make_error_code(std::errc::invalid_argument));
+    }
+    if (z->second.id != config.zone.id) {
+      ldpp_dout(dpp, 0) << "zonegroup contains unknown zone id="
+          << z->second.id << dendl;
+      throw std::system_error(make_error_code(std::errc::invalid_argument));
+    }
+    if (z->second.name != config.zone.name) {
+      ldpp_dout(dpp, 0) << "zonegroup contains unknown zone name="
+          << z->second.name << dendl;
+      throw std::system_error(make_error_code(std::errc::invalid_argument));
+    }
+    if (config.zonegroup.master_zone != config.zone.id) {
+      ldpp_dout(dpp, 0) << "zonegroup contains unknown master_zone="
+          << config.zonegroup.master_zone << dendl;
+      throw std::system_error(make_error_code(std::errc::invalid_argument));
+    }
+  } else {
+    // add the zone to the group
+    const bool is_master = true;
+    const bool read_only = false;
+    std::list<std::string> endpoints;
+    std::list<std::string> sync_from;
+    std::list<std::string> sync_from_rm;
+    rgw::zone_features::set enable_features;
+    rgw::zone_features::set disable_features;
+
+    enable_features.insert(rgw::zone_features::supported.begin(),
+                           rgw::zone_features::supported.end());
+
+    int r = rgw::add_zone_to_group(dpp, config.zonegroup, config.zone,
+                                   &is_master, &read_only, endpoints,
+                                   nullptr, nullptr, sync_from, sync_from_rm,
+                                   nullptr, std::nullopt,
+                                   enable_features, disable_features);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "failed to add zone to zonegroup: "
+          << cpp_strerror(r) << dendl;
+      throw std::system_error(-r, std::system_category());
+    }
+
+    config.zonegroup.enabled_features = std::move(enable_features);
+  }
+
+  // insert the default placement target if it doesn't exist
+  auto target = RGWZoneGroupPlacementTarget{.name = "default-placement"};
+  config.zonegroup.placement_targets.emplace(target.name, target);
+  if (config.zonegroup.default_placement.name.empty()) {
+    config.zonegroup.default_placement.name = target.name;
+  }
+}
+
+} // anonymous namespace
+
+auto create_json_config_store(const DoutPrefixProvider* dpp,
+                              const std::string& filename)
+    -> std::unique_ptr<ConfigStore>
+{
+  DecodedConfig config;
+  parse_config(dpp, filename.c_str());
+  sanity_check_config(dpp, config);
+  return create_immutable_config_store(dpp, config.zonegroup, config.zone,
+                                       config.period_config);
+}
+
+} // namespace rgw::sal
diff --git a/src/rgw/driver/json_config/store.h b/src/rgw/driver/json_config/store.h
new file mode 100644 (file)
index 0000000..4482f67
--- /dev/null
@@ -0,0 +1,27 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "driver/immutable_config/store.h"
+
+namespace rgw::sal {
+
+/// Create an immutable ConfigStore by parsing the zonegroup and zone from the
+/// given json filename.
+auto create_json_config_store(const DoutPrefixProvider* dpp,
+                              const std::string& filename)
+    -> std::unique_ptr<ConfigStore>;
+
+} // namespace rgw::sal
diff --git a/src/rgw/driver/rados/cls_fifo_legacy.cc b/src/rgw/driver/rados/cls_fifo_legacy.cc
new file mode 100644 (file)
index 0000000..23b39b9
--- /dev/null
@@ -0,0 +1,2484 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <cstdint>
+#include <numeric>
+#include <optional>
+#include <string_view>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include <fmt/format.h>
+
+#include "include/rados/librados.hpp"
+
+#include "include/buffer.h"
+
+#include "common/async/yield_context.h"
+#include "common/random_string.h"
+
+#include "cls/fifo/cls_fifo_types.h"
+#include "cls/fifo/cls_fifo_ops.h"
+
+#include "cls_fifo_legacy.h"
+
+namespace rgw::cls::fifo {
+// Short aliases used throughout this file.
+namespace cb = ceph::buffer;
+namespace fifo = rados::cls::fifo;
+
+using ceph::from_error_code;
+
+// Maximum number of times a metadata update that raced with another writer
+// is retried before giving up with -ECANCELED.
+inline constexpr auto MAX_RACE_RETRIES = 10;
+
+void create_meta(lr::ObjectWriteOperation* op,
+                std::string_view id,
+                std::optional<fifo::objv> objv,
+                std::optional<std::string_view> oid_prefix,
+                bool exclusive,
+                std::uint64_t max_part_size,
+                std::uint64_t max_entry_size)
+{
+  fifo::op::create_meta cm;
+
+  cm.id = id;
+  cm.version = objv;
+  cm.oid_prefix = oid_prefix;
+  cm.max_part_size = max_part_size;
+  cm.max_entry_size = max_entry_size;
+  cm.exclusive = exclusive;
+
+  cb::list in;
+  encode(cm, in);
+  op->exec(fifo::op::CLASS, fifo::op::CREATE_META, in);
+}
+
+// Fetch the FIFO metadata object, optionally requiring a specific version.
+// On success fills *info, *part_header_size and *part_entry_overhead (each
+// may be null). When `probe` is set, -ENOENT/-ENODATA are expected outcomes
+// and are not logged as errors. Returns >= 0 on success, negative errno
+// otherwise.
+int get_meta(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+            std::optional<fifo::objv> objv, fifo::info* info,
+            std::uint32_t* part_header_size,
+            std::uint32_t* part_entry_overhead,
+            uint64_t tid, optional_yield y,
+            bool probe)
+{
+  lr::ObjectReadOperation op;
+  fifo::op::get_meta gm;
+  gm.version = objv;
+  cb::list in;
+  encode(gm, in);
+  cb::list bl;
+
+  op.exec(fifo::op::CLASS, fifo::op::GET_META, in,
+         &bl, nullptr);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+  // Note the idiom below: the try/catch hangs off the if-body, so the
+  // `else` pairs with `if (r >= 0)`, not with the try.
+  if (r >= 0) try {
+      fifo::op::get_meta_reply reply;
+      auto iter = bl.cbegin();
+      decode(reply, iter);
+      if (info) *info = std::move(reply.info);
+      if (part_header_size) *part_header_size = reply.part_header_size;
+      if (part_entry_overhead)
+       *part_entry_overhead = reply.part_entry_overhead;
+    } catch (const cb::error& err) {
+      ldpp_dout(dpp, -1)
+       << __PRETTY_FUNCTION__ << ":" << __LINE__
+       << " decode failed: " << err.what()
+       << " tid=" << tid << dendl;
+      r = from_error_code(err.code());
+    } else if (!(probe && (r == -ENOENT || r == -ENODATA))) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " fifo::op::GET_META failed r=" << r << " tid=" << tid
+      << dendl;
+  }
+  return r;
+};
+
+namespace {
+// Queue an UPDATE_META cls call (guarded by the expected object version)
+// carrying the fields of `update` that are set.
+void update_meta(lr::ObjectWriteOperation* op, const fifo::objv& objv,
+                const fifo::update& update)
+{
+  fifo::op::update_meta um;
+
+  um.version = objv;
+  um.tail_part_num = update.tail_part_num();
+  um.head_part_num = update.head_part_num();
+  um.min_push_part_num = update.min_push_part_num();
+  um.max_push_part_num = update.max_push_part_num();
+  // `update` is a const reference, so the previous std::move() here was a
+  // no-op that still copied; read the journal entry lists directly.
+  um.journal_entries_add = update.journal_entries_add();
+  um.journal_entries_rm = update.journal_entries_rm();
+
+  cb::list in;
+  encode(um, in);
+  op->exec(fifo::op::CLASS, fifo::op::UPDATE_META, in);
+}
+
+// Queue an INIT_PART cls call carrying the part tag and data params.
+void part_init(lr::ObjectWriteOperation* op, std::string_view tag,
+              fifo::data_params params)
+{
+  fifo::op::init_part init;
+  init.tag = tag;
+  init.params = params;
+
+  cb::list payload;
+  encode(init, payload);
+  op->exec(fifo::op::CLASS, fifo::op::INIT_PART, payload);
+}
+
+int push_part(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid, std::string_view tag,
+             std::deque<cb::list> data_bufs, std::uint64_t tid,
+             optional_yield y)
+{
+  lr::ObjectWriteOperation op;
+  fifo::op::push_part pp;
+
+  pp.tag = tag;
+  pp.data_bufs = data_bufs;
+  pp.total_len = 0;
+
+  for (const auto& bl : data_bufs)
+    pp.total_len += bl.length();
+
+  cb::list in;
+  encode(pp, in);
+  auto retval = 0;
+  op.exec(fifo::op::CLASS, fifo::op::PUSH_PART, in, nullptr, &retval);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y, lr::OPERATION_RETURNVEC);
+  if (r < 0) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " fifo::op::PUSH_PART failed r=" << r
+      << " tid=" << tid << dendl;
+    return r;
+  }
+  if (retval < 0) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " error handling response retval=" << retval
+      << " tid=" << tid << dendl;
+  }
+  return retval;
+}
+
+void push_part(lr::IoCtx& ioctx, const std::string& oid, std::string_view tag,
+              std::deque<cb::list> data_bufs, std::uint64_t tid,
+              lr::AioCompletion* c)
+{
+  lr::ObjectWriteOperation op;
+  fifo::op::push_part pp;
+
+  pp.tag = tag;
+  pp.data_bufs = data_bufs;
+  pp.total_len = 0;
+
+  for (const auto& bl : data_bufs)
+    pp.total_len += bl.length();
+
+  cb::list in;
+  encode(pp, in);
+  op.exec(fifo::op::CLASS, fifo::op::PUSH_PART, in);
+  auto r = ioctx.aio_operate(oid, c, &op, lr::OPERATION_RETURNVEC);
+  ceph_assert(r >= 0);
+}
+
+// Queue a TRIM_PART cls call on the write op for entries up to `ofs`.
+void trim_part(lr::ObjectWriteOperation* op,
+              std::optional<std::string_view> tag,
+              std::uint64_t ofs, bool exclusive)
+{
+  fifo::op::trim_part trim;
+  trim.tag = tag;
+  trim.ofs = ofs;
+  trim.exclusive = exclusive;
+
+  cb::list payload;
+  encode(trim, payload);
+  op->exec(fifo::op::CLASS, fifo::op::TRIM_PART, payload);
+}
+
+// Synchronously list up to `max_entries` entries of a part starting at
+// `ofs`. On success fills *entries, *more, *full_part and *ptag (each may
+// be null). -ENOENT is returned without an error log (the part may simply
+// not exist yet). Returns >= 0 on success, negative errno otherwise.
+int list_part(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+             std::optional<std::string_view> tag, std::uint64_t ofs,
+             std::uint64_t max_entries,
+             std::vector<fifo::part_list_entry>* entries,
+             bool* more, bool* full_part, std::string* ptag,
+             std::uint64_t tid, optional_yield y)
+{
+  lr::ObjectReadOperation op;
+  fifo::op::list_part lp;
+
+  lp.tag = tag;
+  lp.ofs = ofs;
+  lp.max_entries = max_entries;
+
+  cb::list in;
+  encode(lp, in);
+  cb::list bl;
+  op.exec(fifo::op::CLASS, fifo::op::LIST_PART, in, &bl, nullptr);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+  // The `else` below pairs with `if (r >= 0)`, not the try.
+  if (r >= 0) try {
+      fifo::op::list_part_reply reply;
+      auto iter = bl.cbegin();
+      decode(reply, iter);
+      if (entries) *entries = std::move(reply.entries);
+      if (more) *more = reply.more;
+      if (full_part) *full_part = reply.full_part;
+      if (ptag) *ptag = reply.tag;
+    } catch (const cb::error& err) {
+      ldpp_dout(dpp, -1)
+       << __PRETTY_FUNCTION__ << ":" << __LINE__
+       << " decode failed: " << err.what()
+       << " tid=" << tid << dendl;
+      r = from_error_code(err.code());
+    } else if (r != -ENOENT) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " fifo::op::LIST_PART failed r=" << r << " tid=" << tid
+      << dendl;
+  }
+  return r;
+}
+
+// Completion adapter for the async LIST_PART read: decodes the reply into
+// the caller-provided out-pointers and stores the final return code in
+// *r_out. All pointers may be null.
+struct list_entry_completion : public lr::ObjectOperationCompletion {
+  CephContext* cct;
+  int* r_out;
+  std::vector<fifo::part_list_entry>* entries;
+  bool* more;
+  bool* full_part;
+  std::string* ptag;
+  std::uint64_t tid;
+
+  list_entry_completion(CephContext* cct, int* r_out, std::vector<fifo::part_list_entry>* entries,
+                       bool* more, bool* full_part, std::string* ptag,
+                       std::uint64_t tid)
+    : cct(cct), r_out(r_out), entries(entries), more(more),
+      full_part(full_part), ptag(ptag), tid(tid) {}
+  virtual ~list_entry_completion() = default;
+  // Invoked by librados when the exec completes; `bl` holds the raw reply.
+  void handle_completion(int r, bufferlist& bl) override {
+    // The `else` below pairs with `if (r >= 0)`, not the try.
+    if (r >= 0) try {
+       fifo::op::list_part_reply reply;
+       auto iter = bl.cbegin();
+       decode(reply, iter);
+       if (entries) *entries = std::move(reply.entries);
+       if (more) *more = reply.more;
+       if (full_part) *full_part = reply.full_part;
+       if (ptag) *ptag = reply.tag;
+      } catch (const cb::error& err) {
+       lderr(cct)
+         << __PRETTY_FUNCTION__ << ":" << __LINE__
+         << " decode failed: " << err.what()
+         << " tid=" << tid << dendl;
+       r = from_error_code(err.code());
+      } else if (r < 0) {
+      lderr(cct)
+       << __PRETTY_FUNCTION__ << ":" << __LINE__
+       << " fifo::op::LIST_PART failed r=" << r << " tid=" << tid
+       << dendl;
+    }
+    if (r_out) *r_out = r;
+  }
+};
+
+lr::ObjectReadOperation list_part(CephContext* cct,
+                                 std::optional<std::string_view> tag,
+                                 std::uint64_t ofs,
+                                 std::uint64_t max_entries,
+                                 int* r_out,
+                                 std::vector<fifo::part_list_entry>* entries,
+                                 bool* more, bool* full_part,
+                                 std::string* ptag, std::uint64_t tid)
+{
+  lr::ObjectReadOperation op;
+  fifo::op::list_part lp;
+
+  lp.tag = tag;
+  lp.ofs = ofs;
+  lp.max_entries = max_entries;
+
+  cb::list in;
+  encode(lp, in);
+  op.exec(fifo::op::CLASS, fifo::op::LIST_PART, in,
+         new list_entry_completion(cct, r_out, entries, more, full_part,
+                                   ptag, tid));
+  return op;
+}
+
+// Synchronously fetch a part object's header into *header (may be null).
+// Returns >= 0 on success, negative errno otherwise; any failure is logged.
+int get_part_info(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+                 fifo::part_header* header,
+                 std::uint64_t tid, optional_yield y)
+{
+  lr::ObjectReadOperation op;
+  fifo::op::get_part_info gpi;
+
+  cb::list in;
+  cb::list bl;
+  encode(gpi, in);
+  op.exec(fifo::op::CLASS, fifo::op::GET_PART_INFO, in, &bl, nullptr);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+  // The `else` below pairs with `if (r >= 0)`, not the try.
+  if (r >= 0) try {
+      fifo::op::get_part_info_reply reply;
+      auto iter = bl.cbegin();
+      decode(reply, iter);
+      if (header) *header = std::move(reply.header);
+    } catch (const cb::error& err) {
+      ldpp_dout(dpp, -1)
+       << __PRETTY_FUNCTION__ << ":" << __LINE__
+       << " decode failed: " << err.what()
+       << " tid=" << tid << dendl;
+      r = from_error_code(err.code());
+    } else {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " fifo::op::GET_PART_INFO failed r=" << r << " tid=" << tid
+      << dendl;
+  }
+  return r;
+}
+
+// Completion adapter for the async GET_PART_INFO read: decodes the header
+// into *h and stores the final return code in *rp (both may be null).
+struct partinfo_completion : public lr::ObjectOperationCompletion {
+  CephContext* cct;
+  int* rp;
+  fifo::part_header* h;
+  std::uint64_t tid;
+  partinfo_completion(CephContext* cct, int* rp, fifo::part_header* h,
+                     std::uint64_t tid) :
+    cct(cct), rp(rp), h(h), tid(tid) {
+  }
+  virtual ~partinfo_completion() = default;
+  // Invoked by librados when the exec completes; `bl` holds the raw reply.
+  void handle_completion(int r, bufferlist& bl) override {
+    // The `else` below pairs with `if (r >= 0)`, not the try.
+    if (r >= 0) try {
+       fifo::op::get_part_info_reply reply;
+       auto iter = bl.cbegin();
+       decode(reply, iter);
+       if (h) *h = std::move(reply.header);
+      } catch (const cb::error& err) {
+       r = from_error_code(err.code());
+       lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " decode failed: " << err.what()
+                  << " tid=" << tid << dendl;
+      } else {
+      lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " fifo::op::GET_PART_INFO failed r=" << r << " tid=" << tid
+                << dendl;
+    }
+    if (rp) {
+      *rp = r;
+    }
+  }
+};
+
+lr::ObjectReadOperation get_part_info(CephContext* cct,
+                                     fifo::part_header* header,
+                                     std::uint64_t tid, int* r = 0)
+{
+  lr::ObjectReadOperation op;
+  fifo::op::get_part_info gpi;
+
+  cb::list in;
+  cb::list bl;
+  encode(gpi, in);
+  op.exec(fifo::op::CLASS, fifo::op::GET_PART_INFO, in,
+         new partinfo_completion(cct, r, header, tid));
+  return op;
+}
+}
+
+std::optional<marker> FIFO::to_marker(std::string_view s)
+{
+  marker m;
+  if (s.empty()) {
+    m.num = info.tail_part_num;
+    m.ofs = 0;
+    return m;
+  }
+
+  auto pos = s.find(':');
+  if (pos == s.npos) {
+    return std::nullopt;
+  }
+
+  auto num = s.substr(0, pos);
+  auto ofs = s.substr(pos + 1);
+
+  auto n = ceph::parse<decltype(m.num)>(num);
+  if (!n) {
+    return std::nullopt;
+  }
+  m.num = *n;
+  auto o = ceph::parse<decltype(m.ofs)>(ofs);
+  if (!o) {
+    return std::nullopt;
+  }
+  m.ofs = *o;
+  return m;
+}
+
+// Produce a random alphanumeric tag for a new part header.
+std::string FIFO::generate_tag() const
+{
+  static constexpr auto HEADER_TAG_SIZE = 16;
+  auto* cct = static_cast<CephContext*>(ioctx.cct());
+  return gen_rand_alphanumeric_plain(cct, HEADER_TAG_SIZE);
+}
+
+
+// Apply a metadata update to the in-memory copy, guarded by the FIFO mutex.
+// Fails with -ECANCELED if `objv` no longer matches the in-memory version
+// or the update cannot be applied; on success bumps the local version.
+int FIFO::apply_update(const DoutPrefixProvider *dpp,
+                       fifo::info* info,
+                      const fifo::objv& objv,
+                      const fifo::update& update,
+                      std::uint64_t tid)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  std::unique_lock l(m);
+  if (objv != info->version) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " version mismatch, canceling: tid=" << tid << dendl;
+    return -ECANCELED;
+  }
+  auto err = info->apply_update(update);
+  if (err) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " error applying update: " << *err << " tid=" << tid << dendl;
+    return -ECANCELED;
+  }
+
+  ++info->version.ver;
+
+  // Success (0); the lock is released by unique_lock's destructor.
+  return {};
+}
+
+// Synchronously apply `update` to the metadata object and, on success, to
+// the in-memory copy. If the update raced with another writer the metadata
+// is re-read and *pcanceled is set so the caller can retry.
+int FIFO::_update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+                      fifo::objv version, bool* pcanceled,
+                      std::uint64_t tid, optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  lr::ObjectWriteOperation op;
+  bool canceled = false;
+  update_meta(&op, info.version, update);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (r >= 0 || r == -ECANCELED) {
+    canceled = (r == -ECANCELED);
+    if (!canceled) {
+      r = apply_update(dpp, &info, version, update, tid);
+      if (r < 0) canceled = true;
+    }
+    if (canceled) {
+      // Lost the race: refresh the local metadata copy. If the re-read
+      // itself fails we report that error rather than a cancellation.
+      r = read_meta(dpp, tid, y);
+      canceled = r < 0 ? false : true;
+    }
+  }
+  if (pcanceled) *pcanceled = canceled;
+  if (canceled) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " canceled: tid=" << tid << dendl;
+  }
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " returning error: r=" << r << " tid=" << tid << dendl;
+  }
+  return r;
+}
+
+struct Updater : public Completion<Updater> {
+  FIFO* fifo;
+  fifo::update update;
+  fifo::objv version;
+  bool reread = false;
+  bool* pcanceled = nullptr;
+  std::uint64_t tid;
+  Updater(const DoutPrefixProvider *dpp, FIFO* fifo, lr::AioCompletion* super,
+         const fifo::update& update, fifo::objv version,
+         bool* pcanceled, std::uint64_t tid)
+    : Completion(dpp, super), fifo(fifo), update(update), version(version),
+      pcanceled(pcanceled) {}
+
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " entering: tid=" << tid << dendl;
+    if (reread)
+      handle_reread(dpp, std::move(p), r);
+    else
+      handle_update(dpp, std::move(p), r);
+  }
+
+  void handle_update(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " handling async update_meta: tid="
+                        << tid << dendl;
+    if (r < 0 && r != -ECANCELED) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " update failed: r=" << r << " tid=" << tid << dendl;
+      complete(std::move(p), r);
+      return;
+    }
+    bool canceled = (r == -ECANCELED);
+    if (!canceled) {
+      int r = fifo->apply_update(dpp, &fifo->info, version, update, tid);
+      if (r < 0) {
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                            << " update failed, marking canceled: r=" << r
+                            << " tid=" << tid << dendl;
+       canceled = true;
+      }
+    }
+    if (canceled) {
+      reread = true;
+      fifo->read_meta(dpp, tid, call(std::move(p)));
+      return;
+    }
+    if (pcanceled)
+      *pcanceled = false;
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " completing: tid=" << tid << dendl;
+    complete(std::move(p), 0);
+  }
+
+  void handle_reread(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " handling async read_meta: tid="
+                        << tid << dendl;
+    if (r < 0 && pcanceled) {
+      *pcanceled = false;
+    } else if (r >= 0 && pcanceled) {
+      *pcanceled = true;
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                      << " failed dispatching read_meta: r=" << r << " tid="
+                      << tid << dendl;
+    } else {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                          << " completing: tid=" << tid << dendl;
+    }
+    complete(std::move(p), r);
+  }
+};
+
+void FIFO::_update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+                       fifo::objv version, bool* pcanceled,
+                       std::uint64_t tid, lr::AioCompletion* c)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  lr::ObjectWriteOperation op;
+  update_meta(&op, info.version, update);
+  auto updater = std::make_unique<Updater>(dpp, this, c, update, version, pcanceled,
+                                          tid);
+  auto r = ioctx.aio_operate(oid, Updater::call(std::move(updater)), &op);
+  assert(r >= 0);
+}
+
+// Create and initialize the rados object backing part `part_num`.
+// Returns the rgw_rados_operate result (negative errno on failure).
+int FIFO::create_part(const DoutPrefixProvider *dpp, int64_t part_num, std::string_view tag, std::uint64_t tid,
+                     optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  lr::ObjectWriteOperation op;
+  op.create(false); /* We don't need exclusivity, part_init ensures
+                      we're creating from the same journal entry. */
+  // Only the oid computation needs the lock; the rados call runs unlocked.
+  std::unique_lock l(m);
+  part_init(&op, tag, info.params);
+  auto oid = info.part_oid(part_num);
+  l.unlock();
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " part_init failed: r=" << r << " tid="
+              << tid << dendl;
+  }
+  return r;
+}
+
+int FIFO::remove_part(const DoutPrefixProvider *dpp, int64_t part_num, std::string_view tag, std::uint64_t tid,
+                     optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  lr::ObjectWriteOperation op;
+  op.remove();
+  std::unique_lock l(m);
+  auto oid = info.part_oid(part_num);
+  l.unlock();
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " remove failed: r=" << r << " tid="
+              << tid << dendl;
+  }
+  return r;
+}
+
+int FIFO::process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  std::vector<fifo::journal_entry> processed;
+
+  std::unique_lock l(m);
+  auto tmpjournal = info.journal;
+  auto new_tail = info.tail_part_num;
+  auto new_head = info.head_part_num;
+  auto new_max = info.max_push_part_num;
+  l.unlock();
+
+  int r = 0;
+  for (auto& [n, entry] : tmpjournal) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " processing entry: entry=" << entry << " tid=" << tid
+                  << dendl;
+    switch (entry.op) {
+    case fifo::journal_entry::Op::create:
+      r = create_part(dpp, entry.part_num, entry.part_tag, tid, y);
+      if (entry.part_num > new_max) {
+       new_max = entry.part_num;
+      }
+      break;
+    case fifo::journal_entry::Op::set_head:
+      r = 0;
+      if (entry.part_num > new_head) {
+       new_head = entry.part_num;
+      }
+      break;
+    case fifo::journal_entry::Op::remove:
+      r = remove_part(dpp, entry.part_num, entry.part_tag, tid, y);
+      if (r == -ENOENT) r = 0;
+      if (entry.part_num >= new_tail) {
+       new_tail = entry.part_num + 1;
+      }
+      break;
+    default:
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " unknown journaled op: entry=" << entry << " tid="
+                << tid << dendl;
+      return -EIO;
+    }
+
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " processing entry failed: entry=" << entry
+                << " r=" << r << " tid=" << tid << dendl;
+      return -r;
+    }
+
+    processed.push_back(std::move(entry));
+  }
+
+  // Postprocess
+  bool canceled = true;
+
+  for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " postprocessing: i=" << i << " tid=" << tid << dendl;
+
+    std::optional<int64_t> tail_part_num;
+    std::optional<int64_t> head_part_num;
+    std::optional<int64_t> max_part_num;
+
+    std::unique_lock l(m);
+    auto objv = info.version;
+    if (new_tail > tail_part_num) tail_part_num = new_tail;
+    if (new_head > info.head_part_num) head_part_num = new_head;
+    if (new_max > info.max_push_part_num) max_part_num = new_max;
+    l.unlock();
+
+    if (processed.empty() &&
+       !tail_part_num &&
+       !max_part_num) {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << " nothing to update any more: i=" << i << " tid="
+                    << tid << dendl;
+      canceled = false;
+      break;
+    }
+    auto u = fifo::update().tail_part_num(tail_part_num)
+      .head_part_num(head_part_num).max_push_part_num(max_part_num)
+      .journal_entries_rm(processed);
+    r = _update_meta(dpp, u, objv, &canceled, tid, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " _update_meta failed: update=" << u
+                << " r=" << r << " tid=" << tid << dendl;
+      break;
+    }
+
+    if (canceled) {
+      std::vector<fifo::journal_entry> new_processed;
+      std::unique_lock l(m);
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << " update canceled, retrying: i=" << i << " tid="
+                    << tid << dendl;
+      for (auto& e : processed) {
+       auto jiter = info.journal.find(e.part_num);
+       /* journal entry was already processed */
+       if (jiter == info.journal.end() ||
+           !(jiter->second == e)) {
+         continue;
+       }
+       new_processed.push_back(e);
+      }
+      processed = std::move(new_processed);
+    }
+  }
+  if (r == 0 && canceled) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " canceled too many times, giving up: tid=" << tid << dendl;
+    r = -ECANCELED;
+  }
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " failed, r=: " << r << " tid=" << tid << dendl;
+  }
+  return r;
+}
+
+// Journal creation of the next part (and, if `is_head`, a head advance to
+// it), push the journal entries into the metadata with race retries, then
+// process the journal to materialize the part. Returns 0 on success or a
+// negative error code (-ECANCELED after too many races).
+int FIFO::_prepare_new_part(const DoutPrefixProvider *dpp, bool is_head, std::uint64_t tid, optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  std::unique_lock l(m);
+  std::vector jentries = { info.next_journal_entry(generate_tag()) };
+  // If the next part is already journaled, someone got here first; just
+  // drive the journal to completion.
+  if (info.journal.find(jentries.front().part_num) != info.journal.end()) {
+    l.unlock();
+    ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                 << " new part journaled, but not processed: tid="
+                 << tid << dendl;
+    auto r = process_journal(dpp, tid, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " process_journal failed: r=" << r << " tid=" << tid << dendl;
+    }
+    return r;
+  }
+  std::int64_t new_head_part_num = info.head_part_num;
+  auto version = info.version;
+
+  if (is_head) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " needs new head: tid=" << tid << dendl;
+    auto new_head_jentry = jentries.front();
+    new_head_jentry.op = fifo::journal_entry::Op::set_head;
+    new_head_part_num = jentries.front().part_num;
+    jentries.push_back(std::move(new_head_jentry));
+  }
+  l.unlock();
+
+  int r = 0;
+  bool canceled = true;
+  for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
+    canceled = false;
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " updating metadata: i=" << i << " tid=" << tid << dendl;
+    auto u = fifo::update{}.journal_entries_add(jentries);
+    r = _update_meta(dpp, u, version, &canceled, tid, y);
+    if (r >= 0 && canceled) {
+      // Raced: check whether the other writer already journaled and/or
+      // processed the same part before retrying.
+      std::unique_lock l(m);
+      auto found = (info.journal.find(jentries.front().part_num) !=
+                   info.journal.end());
+      if ((info.max_push_part_num >= jentries.front().part_num &&
+          info.head_part_num >= new_head_part_num)) {
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                      << " raced, but journaled and processed: i=" << i
+                      << " tid=" << tid << dendl;
+       return 0;
+      }
+      if (found) {
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                      << " raced, journaled but not processed: i=" << i
+                      << " tid=" << tid << dendl;
+       canceled = false;
+      }
+      l.unlock();
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " _update_meta failed: update=" << u << " r=" << r
+                << " tid=" << tid << dendl;
+      return r;
+    }
+  }
+  if (canceled) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " canceled too many times, giving up: tid=" << tid << dendl;
+    return -ECANCELED;
+  }
+  r = process_journal(dpp, tid, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " process_journal failed: r=" << r << " tid=" << tid << dendl;
+  }
+  return r;
+}
+
+// Advance the head to the next part, creating that part first if it does
+// not exist yet. Returns 0 on success or a negative error code
+// (-ECANCELED after too many races, -EIO on inconsistent metadata).
+int FIFO::_prepare_new_head(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  std::unique_lock l(m);
+  std::int64_t new_head_num = info.head_part_num + 1;
+  auto max_push_part_num = info.max_push_part_num;
+  auto version = info.version;
+  l.unlock();
+
+  int r = 0;
+  if (max_push_part_num < new_head_num) {
+    // The target part does not exist yet; _prepare_new_part(is_head=true)
+    // creates it and advances the head in one journal transaction.
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " need new part: tid=" << tid << dendl;
+    r = _prepare_new_part(dpp, true, tid, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " _prepare_new_part failed: r=" << r
+                << " tid=" << tid << dendl;
+      return r;
+    }
+    std::unique_lock l(m);
+    if (info.max_push_part_num < new_head_num) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " inconsistency, push part less than head part: "
+                << " tid=" << tid << dendl;
+      return -EIO;
+    }
+    l.unlock();
+    return 0;
+  }
+
+  // The part already exists; just bump the head pointer, with race retries.
+  bool canceled = true;
+  for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " updating head: i=" << i << " tid=" << tid << dendl;
+    auto u = fifo::update{}.head_part_num(new_head_num);
+    r = _update_meta(dpp, u, version, &canceled, tid, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " _update_meta failed: update=" << u << " r=" << r
+                << " tid=" << tid << dendl;
+      return r;
+    }
+    std::unique_lock l(m);
+    auto head_part_num = info.head_part_num;
+    version = info.version;
+    l.unlock();
+    if (canceled && (head_part_num >= new_head_num)) {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << " raced, but completed by the other caller: i=" << i
+                    << " tid=" << tid << dendl;
+      canceled = false;
+    }
+  }
+  if (canceled) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " canceled too many times, giving up: tid=" << tid << dendl;
+    return -ECANCELED;
+  }
+  return 0;
+}
+
+// Completion driving the async _prepare_new_part: retries the journal-add
+// metadata update on races (up to MAX_RACE_RETRIES), then hands off to
+// process_journal on the caller's completion.
+struct NewPartPreparer : public Completion<NewPartPreparer> {
+  FIFO* f;
+  std::vector<fifo::journal_entry> jentries;
+  int i = 0;            // retry counter
+  std::int64_t new_head_part_num;
+  bool canceled = false;
+  uint64_t tid;
+
+  NewPartPreparer(const DoutPrefixProvider *dpp, FIFO* f, lr::AioCompletion* super,
+                 std::vector<fifo::journal_entry> jentries,
+                 std::int64_t new_head_part_num,
+                 std::uint64_t tid)
+    : Completion(dpp, super), f(f), jentries(std::move(jentries)),
+      new_head_part_num(new_head_part_num), tid(tid) {}
+
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                   << " _update_meta failed:  r=" << r
+                   << " tid=" << tid << dendl;
+      complete(std::move(p), r);
+      return;
+    }
+
+    if (canceled) {
+      // Raced: check whether the other writer already journaled and/or
+      // processed the same part before re-issuing the update.
+      std::unique_lock l(f->m);
+      auto iter = f->info.journal.find(jentries.front().part_num);
+      auto max_push_part_num = f->info.max_push_part_num;
+      auto head_part_num = f->info.head_part_num;
+      auto version = f->info.version;
+      auto found = (iter != f->info.journal.end());
+      l.unlock();
+      if ((max_push_part_num >= jentries.front().part_num &&
+          head_part_num >= new_head_part_num)) {
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " raced, but journaled and processed: i=" << i
+                         << " tid=" << tid << dendl;
+       complete(std::move(p), 0);
+       return;
+      }
+      if (i >= MAX_RACE_RETRIES) {
+       complete(std::move(p), -ECANCELED);
+       return;
+      }
+      if (!found) {
+       ++i;
+       f->_update_meta(dpp, fifo::update{}
+                       .journal_entries_add(jentries),
+                        version, &canceled, tid, call(std::move(p)));
+       return;
+      } else {
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " raced, journaled but not processed: i=" << i
+                         << " tid=" << tid << dendl;
+       canceled = false;
+      }
+      // Fall through. We still need to process the journal.
+    }
+    f->process_journal(dpp, tid, super());
+    return;
+  }
+};
+
+// Journal (and kick off creation of) the next part.  If is_head is set,
+// the new part also becomes the head.  Completion is delivered through
+// 'c' via the NewPartPreparer completion above.
+void FIFO::_prepare_new_part(const DoutPrefixProvider *dpp, bool is_head, std::uint64_t tid,
+                            lr::AioCompletion* c)
+{
+  std::unique_lock l(m);
+  std::vector jentries = { info.next_journal_entry(generate_tag()) };
+  if (info.journal.find(jentries.front().part_num) != info.journal.end()) {
+    // Someone raced us to the journal entry; just process the journal.
+    l.unlock();
+    ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                 << " new part journaled, but not processed: tid="
+                 << tid << dendl;
+    process_journal(dpp, tid, c);
+    return;
+  }
+  std::int64_t new_head_part_num = info.head_part_num;
+  auto version = info.version;
+
+  if (is_head) {
+    // Additionally journal a set_head op pointing at the new part.
+    auto new_head_jentry = jentries.front();
+    new_head_jentry.op = fifo::journal_entry::Op::set_head;
+    new_head_part_num = jentries.front().part_num;
+    jentries.push_back(std::move(new_head_jentry));
+  }
+  l.unlock();
+
+  auto n = std::make_unique<NewPartPreparer>(dpp, this, c, jentries,
+                                            new_head_part_num, tid);
+  auto np = n.get();
+  // np stays valid: call() keeps ownership alive until the callback fires;
+  // &np->canceled is written by _update_meta on version mismatch.
+  _update_meta(dpp, fifo::update{}.journal_entries_add(jentries), version,
+              &np->canceled, tid, NewPartPreparer::call(std::move(n)));
+}
+
+// Completion state machine used by FIFO::_prepare_new_head().  Depending on
+// 'newpart' it either verifies the outcome of _prepare_new_part() or
+// retries the head_part_num metadata update on races, bounded by
+// MAX_RACE_RETRIES.
+struct NewHeadPreparer : public Completion<NewHeadPreparer> {
+  FIFO* f;
+  int i = 0;             // race-retry counter
+  bool newpart;          // true: came via _prepare_new_part()
+  std::int64_t new_head_num;
+  bool canceled = false; // set by _update_meta() on version mismatch
+  std::uint64_t tid;
+
+  NewHeadPreparer(const DoutPrefixProvider *dpp, FIFO* f, lr::AioCompletion* super,
+                 bool newpart, std::int64_t new_head_num, std::uint64_t tid)
+    : Completion(dpp, super), f(f), newpart(newpart), new_head_num(new_head_num),
+      tid(tid) {}
+
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    if (newpart)
+      handle_newpart(std::move(p), r);
+    else
+      handle_update(dpp, std::move(p), r);
+  }
+
+  // Called when _prepare_new_part() completes.
+  void handle_newpart(Ptr&& p, int r) {
+    if (r < 0) {
+      lderr(f->cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                   << " _prepare_new_part failed: r=" << r
+                   << " tid=" << tid << dendl;
+      complete(std::move(p), r);
+      return;
+    }
+    std::unique_lock l(f->m);
+    if (f->info.max_push_part_num < new_head_num) {
+      l.unlock();
+      // NOTE(review): r >= 0 here, so this message (and the printed r) is
+      // misleading — the actual failure is that max_push_part_num did not
+      // advance to cover the new head part.
+      lderr(f->cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                   << " _prepare_new_part failed: r=" << r
+                   << " tid=" << tid << dendl;
+      complete(std::move(p), -EIO);
+    } else {
+      l.unlock();
+      complete(std::move(p), 0);
+    }
+  }
+
+  // Called when the head_part_num _update_meta() completes.
+  void handle_update(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    std::unique_lock l(f->m);
+    auto head_part_num = f->info.head_part_num;
+    auto version = f->info.version;
+    l.unlock();
+
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                   << " _update_meta failed: r=" << r
+                   << " tid=" << tid << dendl;
+      complete(std::move(p), r);
+      return;
+    }
+    if (canceled) {
+      if (i >= MAX_RACE_RETRIES) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " canceled too many times, giving up: tid=" << tid << dendl;
+       complete(std::move(p), -ECANCELED);
+       return;
+      }
+
+      // Raced, but there's still work to do!
+      if (head_part_num < new_head_num) {
+       canceled = false;
+       ++i;
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " updating head: i=" << i << " tid=" << tid << dendl;
+       f->_update_meta(dpp, fifo::update{}.head_part_num(new_head_num),
+                       version, &this->canceled, tid, call(std::move(p)));
+       return;
+      }
+    }
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " succeeded : i=" << i << " tid=" << tid << dendl;
+    complete(std::move(p), 0);
+    return;
+  }
+};
+
+// Advance the head to the next part, first creating that part if it has
+// not been pushed yet.  Completion is delivered through 'c' via
+// NewHeadPreparer.
+void FIFO::_prepare_new_head(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  std::unique_lock l(m);
+  int64_t new_head_num = info.head_part_num + 1;
+  auto max_push_part_num = info.max_push_part_num;
+  auto version = info.version;
+  l.unlock();
+
+  if (max_push_part_num < new_head_num) {
+    // The target part does not exist yet: create it (as head).
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " need new part: tid=" << tid << dendl;
+    auto n = std::make_unique<NewHeadPreparer>(dpp, this, c, true, new_head_num,
+                                              tid);
+    _prepare_new_part(dpp, true, tid, NewHeadPreparer::call(std::move(n)));
+  } else {
+    // Part already exists: just bump head_part_num in the metadata.
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " updating head: tid=" << tid << dendl;
+    auto n = std::make_unique<NewHeadPreparer>(dpp, this, c, false, new_head_num,
+                                              tid);
+    auto np = n.get();
+    _update_meta(dpp, fifo::update{}.head_part_num(new_head_num), version,
+                &np->canceled, tid, NewHeadPreparer::call(std::move(n)));
+  }
+}
+
+int FIFO::push_entries(const DoutPrefixProvider *dpp, const std::deque<cb::list>& data_bufs,
+                      std::uint64_t tid, optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  std::unique_lock l(m);
+  auto head_part_num = info.head_part_num;
+  auto tag = info.head_tag;
+  const auto part_oid = info.part_oid(head_part_num);
+  l.unlock();
+
+  auto r = push_part(dpp, ioctx, part_oid, tag, data_bufs, tid, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " push_part failed: r=" << r << " tid=" << tid << dendl;
+  }
+  return r;
+}
+
+// Asynchronously push a batch of entries into the current head part;
+// the result (entry count or error) is delivered through 'c'.
+void FIFO::push_entries(const std::deque<cb::list>& data_bufs,
+                       std::uint64_t tid, lr::AioCompletion* c)
+{
+  // Snapshot the head state under the lock, then submit without it.
+  std::unique_lock lock(m);
+  const auto tag = info.head_tag;
+  const auto head_oid = info.part_oid(info.head_part_num);
+  lock.unlock();
+
+  push_part(ioctx, head_oid, tag, data_bufs, tid, c);
+}
+
+// Synchronously trim one part's entries up to offset 'ofs' (inclusive or
+// exclusive per 'exclusive').
+// NOTE(review): a failing rgw_rados_operate() is logged but the function
+// still returns 0 unconditionally, so callers' r < 0 checks on trim_part()
+// can never fire — confirm whether the error should be propagated
+// (cf. the error handling in FIFO::trim()).
+int FIFO::trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+                   std::optional<std::string_view> tag,
+                   bool exclusive, std::uint64_t tid,
+                   optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  lr::ObjectWriteOperation op;
+  std::unique_lock l(m);
+  const auto part_oid = info.part_oid(part_num);
+  l.unlock();
+  rgw::cls::fifo::trim_part(&op, tag, ofs, exclusive);
+  auto r = rgw_rados_operate(dpp, ioctx, part_oid, &op, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " trim_part failed: r=" << r << " tid=" << tid << dendl;
+  }
+  return 0;
+}
+
+void FIFO::trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+                    std::optional<std::string_view> tag,
+                    bool exclusive, std::uint64_t tid,
+                    lr::AioCompletion* c)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  lr::ObjectWriteOperation op;
+  std::unique_lock l(m);
+  const auto part_oid = info.part_oid(part_num);
+  l.unlock();
+  rgw::cls::fifo::trim_part(&op, tag, ofs, exclusive);
+  auto r = ioctx.aio_operate(part_oid, c, &op);
+  ceph_assert(r >= 0);
+}
+
+// Open an existing FIFO at 'oid': read its metadata and part layout, build
+// the handle, and replay any journal left behind by a crashed writer.
+// When 'probe' is set, ENOENT/ENODATA are expected outcomes and not logged.
+int FIFO::open(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, std::string oid, std::unique_ptr<FIFO>* fifo,
+              optional_yield y, std::optional<fifo::objv> objv,
+              bool probe)
+{
+  ldpp_dout(dpp, 20)
+    << __PRETTY_FUNCTION__ << ":" << __LINE__
+    << " entering" << dendl;
+  fifo::info info;
+  std::uint32_t size;
+  std::uint32_t over;
+  int r = get_meta(dpp, ioctx, std::move(oid), objv, &info, &size, &over, 0, y,
+                  probe);
+  if (r < 0) {
+    if (!(probe && (r == -ENOENT || r == -ENODATA))) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " get_meta failed: r=" << r << dendl;
+    }
+    return r;
+  }
+  // NOTE(review): 'oid' is passed through std::move() to get_meta() above
+  // and reused here; this is only safe if get_meta() takes the oid by
+  // (const) reference — confirm its signature.
+  std::unique_ptr<FIFO> f(new FIFO(std::move(ioctx), oid));
+  f->info = info;
+  f->part_header_size = size;
+  f->part_entry_overhead = over;
+  // If there are journal entries, process them, in case
+  // someone crashed mid-transaction.
+  if (!info.journal.empty()) {
+    ldpp_dout(dpp, 20)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " processing leftover journal" << dendl;
+    r = f->process_journal(dpp, 0, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " process_journal failed: r=" << r << dendl;
+      return r;
+    }
+  }
+  *fifo = std::move(f);
+  return 0;
+}
+
+// Create the FIFO metadata object (honoring 'exclusive') and then open a
+// handle to it.  On success *fifo holds the opened handle; create_meta()
+// failures are returned as-is.
+int FIFO::create(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, std::string oid, std::unique_ptr<FIFO>* fifo,
+                optional_yield y, std::optional<fifo::objv> objv,
+                std::optional<std::string_view> oid_prefix,
+                bool exclusive, std::uint64_t max_part_size,
+                std::uint64_t max_entry_size)
+{
+  ldpp_dout(dpp, 20)
+    << __PRETTY_FUNCTION__ << ":" << __LINE__
+    << " entering" << dendl;
+  lr::ObjectWriteOperation op;
+  create_meta(&op, oid, objv, oid_prefix, exclusive, max_part_size,
+             max_entry_size);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " create_meta failed: r=" << r << dendl;
+    return r;
+  }
+  r = open(dpp, std::move(ioctx), std::move(oid), fifo, y, objv);
+  return r;
+}
+
+// Synchronously re-fetch the FIFO metadata from the cluster and install it
+// into the cache when it is not older than what we already hold.
+int FIFO::read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y) {
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  fifo::info _info;
+  std::uint32_t _phs;
+  std::uint32_t _peo;
+
+  auto r = get_meta(dpp, ioctx, oid, std::nullopt, &_info, &_phs, &_peo, tid, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " get_meta failed: r=" << r << " tid=" << tid << dendl;
+    return r;
+  }
+  std::unique_lock l(m);
+  // Adopt the fetched state only if it is at least as new as the cached
+  // version; otherwise we already have something newer and keep it.
+  if (_info.version.same_or_later(this->info.version)) {
+    info = std::move(_info);
+    part_header_size = _phs;
+    part_entry_overhead = _peo;
+  }
+  return 0;
+}
+
+// Synchronous metadata refresh: allocate a fresh transaction id under the
+// lock, then delegate to the tid-taking overload.
+int FIFO::read_meta(const DoutPrefixProvider *dpp, optional_yield y) {
+  std::uint64_t tid = 0;
+  {
+    std::lock_guard locker(m);
+    tid = ++next_tid;
+  }
+  return read_meta(dpp, tid, y);
+}
+
+// Completion for the asynchronous read_meta(): decodes the GET_META reply
+// buffered in 'bl' and installs it into the owning FIFO if it is not older
+// than the cached metadata.
+struct Reader : public Completion<Reader> {
+  FIFO* fifo;
+  cb::list bl;  // filled by aio_exec with the encoded reply
+  std::uint64_t tid;
+  Reader(const DoutPrefixProvider *dpp, FIFO* fifo, lr::AioCompletion* super, std::uint64_t tid)
+    : Completion(dpp, super), fifo(fifo), tid(tid) {}
+
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " entering: tid=" << tid << dendl;
+    // Unusual but valid shape: the try-block is the 'then' statement of
+    // the if, and the else below belongs to 'if (r >= 0)'.
+    if (r >= 0) try {
+       fifo::op::get_meta_reply reply;
+       auto iter = bl.cbegin();
+       decode(reply, iter);
+       std::unique_lock l(fifo->m);
+       if (reply.info.version.same_or_later(fifo->info.version)) {
+         fifo->info = std::move(reply.info);
+         fifo->part_header_size = reply.part_header_size;
+         fifo->part_entry_overhead = reply.part_entry_overhead;
+       }
+      } catch (const cb::error& err) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " failed to decode response err=" << err.what()
+                  << " tid=" << tid << dendl;
+       r = from_error_code(err.code());
+      } else {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " read_meta failed r=" << r
+                << " tid=" << tid << dendl;
+    }
+    complete(std::move(p), r);
+  }
+};
+
+void FIFO::read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  lr::ObjectReadOperation op;
+  fifo::op::get_meta gm;
+  cb::list in;
+  encode(gm, in);
+  auto reader = std::make_unique<Reader>(dpp, this, c, tid);
+  auto rp = reader.get();
+  auto r = ioctx.aio_exec(oid, Reader::call(std::move(reader)), fifo::op::CLASS,
+                         fifo::op::GET_META, in, &rp->bl);
+  assert(r >= 0);
+}
+
+// Accessor for the cached FIFO metadata.
+// NOTE(review): returns a reference to state that is updated under 'm'
+// elsewhere; callers should not assume a stable snapshot.
+const fifo::info& FIFO::meta() const {
+  return info;
+}
+
+// Return the cached {part_header_size, part_entry_overhead} pair.
+std::pair<std::uint32_t, std::uint32_t> FIFO::get_part_layout_info() const {
+  return {part_header_size, part_entry_overhead};
+}
+
+// Convenience overload: synchronously push a single entry.
+int FIFO::push(const DoutPrefixProvider *dpp, const cb::list& bl, optional_yield y) {
+  std::vector<cb::list> entries{ bl };
+  return push(dpp, entries, y);
+}
+
+// Convenience overload: asynchronously push a single entry.
+void FIFO::push(const DoutPrefixProvider *dpp, const cb::list& bl, lr::AioCompletion* c) {
+  std::vector<cb::list> entries{ bl };
+  push(dpp, entries, c);
+}
+
+// Synchronously push a batch of entries.  Entries are packed into batches
+// sized to the head part (accounting per-entry overhead); when the head
+// fills up (-ERANGE from push_entries) a new head is prepared and the
+// unpushed suffix is retried, bounded by MAX_RACE_RETRIES without forward
+// progress.  Returns 0, -E2BIG for an oversized entry, or a negative error.
+int FIFO::push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs, optional_yield y)
+{
+  std::unique_lock l(m);
+  auto tid = ++next_tid;
+  auto max_entry_size = info.params.max_entry_size;
+  auto need_new_head = info.need_new_head();
+  l.unlock();
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  if (data_bufs.empty()) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " empty push, returning success tid=" << tid << dendl;
+    return 0;
+  }
+
+  // Validate sizes
+  for (const auto& bl : data_bufs) {
+    if (bl.length() > max_entry_size) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entry bigger than max_entry_size tid=" << tid << dendl;
+      return -E2BIG;
+    }
+  }
+
+  int r = 0;
+  if (need_new_head) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " need new head tid=" << tid << dendl;
+    r = _prepare_new_head(dpp, tid, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " _prepare_new_head failed: r=" << r
+                << " tid=" << tid << dendl;
+      return r;
+    }
+  }
+
+  std::deque<cb::list> remaining(data_bufs.begin(), data_bufs.end());
+  std::deque<cb::list> batch;
+
+  uint64_t batch_len = 0;
+  auto retries = 0;
+  bool canceled = true;
+  while ((!remaining.empty() || !batch.empty()) &&
+        (retries <= MAX_RACE_RETRIES)) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " preparing push: remaining=" << remaining.size()
+                  << " batch=" << batch.size() << " retries=" << retries
+                  << " tid=" << tid << dendl;
+    std::unique_lock l(m);
+    auto max_part_size = info.params.max_part_size;
+    auto overhead = part_entry_overhead;
+    l.unlock();
+
+    // Fill the batch with as many remaining entries as fit in one part.
+    while (!remaining.empty() &&
+          (remaining.front().length() + batch_len <= max_part_size)) {
+      /* We can send entries with data_len up to max_entry_size,
+        however, we want to also account the overhead when
+        dealing with multiple entries. Previous check doesn't
+        account for overhead on purpose. */
+      batch_len += remaining.front().length() + overhead;
+      batch.push_back(std::move(remaining.front()));
+      remaining.pop_front();
+    }
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " prepared push: remaining=" << remaining.size()
+                  << " batch=" << batch.size() << " retries=" << retries
+                  << " batch_len=" << batch_len
+                  << " tid=" << tid << dendl;
+
+    auto r = push_entries(dpp, batch, tid, y);
+    if (r == -ERANGE) {
+      // Head part filled up mid-batch: prepare a new head, retry the batch.
+      canceled = true;
+      ++retries;
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << " need new head tid=" << tid << dendl;
+      r = _prepare_new_head(dpp, tid, y);
+      if (r < 0) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " prepare_new_head failed: r=" << r
+                  << " tid=" << tid << dendl;
+       return r;
+      }
+      r = 0;
+      continue;
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " push_entries failed: r=" << r
+                << " tid=" << tid << dendl;
+      return r;
+    }
+    // Made forward progress!
+    canceled = false;
+    retries = 0;
+    batch_len = 0;
+    // r is the count of entries actually pushed; drop that prefix from the
+    // batch and recompute the length of whatever is left.
+    if (r == ssize(batch)) {
+      batch.clear();
+    } else  {
+      batch.erase(batch.begin(), batch.begin() + r);
+      for (const auto& b : batch) {
+       batch_len +=  b.length() + part_entry_overhead;
+      }
+    }
+  }
+  if (canceled) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " canceled too many times, giving up: tid=" << tid << dendl;
+    return -ECANCELED;
+  }
+  return 0;
+}
+
+// Completion state machine behind the asynchronous FIFO::push().  Entries
+// wait in 'remaining', are packed into 'batch', pushed via push_entries(),
+// and a new head part is prepared whenever the current head fills up
+// (-ERANGE), with bounded retries on metadata races.
+struct Pusher : public Completion<Pusher> {
+  FIFO* f;
+  std::deque<cb::list> remaining;  // entries not yet batched
+  std::deque<cb::list> batch;      // entries currently being pushed
+  int i = 0;                       // new-head race-retry counter
+  std::uint64_t tid;
+  bool new_heading = false;        // a _prepare_new_head() is in flight
+
+  // Drop 'successes' already-pushed entries from the batch, refill it from
+  // 'remaining' up to one part's worth, then issue the next push (or
+  // complete when nothing is left).
+  void prep_then_push(const DoutPrefixProvider *dpp, Ptr&& p, const unsigned successes) {
+    std::unique_lock l(f->m);
+    auto max_part_size = f->info.params.max_part_size;
+    auto part_entry_overhead = f->part_entry_overhead;
+    l.unlock();
+
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " preparing push: remaining=" << remaining.size()
+                     << " batch=" << batch.size() << " i=" << i
+                     << " tid=" << tid << dendl;
+
+    uint64_t batch_len = 0;
+    if (successes > 0) {
+      if (successes == batch.size()) {
+       batch.clear();
+      } else  {
+       batch.erase(batch.begin(), batch.begin() + successes);
+       for (const auto& b : batch) {
+         batch_len +=  b.length() + part_entry_overhead;
+       }
+      }
+    }
+
+    if (batch.empty() && remaining.empty()) {
+      complete(std::move(p), 0);
+      return;
+    }
+
+    while (!remaining.empty() &&
+          (remaining.front().length() + batch_len <= max_part_size)) {
+
+      /* We can send entries with data_len up to max_entry_size,
+        however, we want to also account the overhead when
+        dealing with multiple entries. Previous check doesn't
+        account for overhead on purpose. */
+      batch_len += remaining.front().length() + part_entry_overhead;
+      batch.push_back(std::move(remaining.front()));
+      remaining.pop_front();
+    }
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " prepared push: remaining=" << remaining.size()
+                     << " batch=" << batch.size() << " i=" << i
+                     << " batch_len=" << batch_len
+                     << " tid=" << tid << dendl;
+    push(std::move(p));
+  }
+
+  // Submit the asynchronous push of the current batch.
+  void push(Ptr&& p) {
+    f->push_entries(batch, tid, call(std::move(p)));
+  }
+
+  // Kick off preparation of a new head part.
+  void new_head(const DoutPrefixProvider *dpp, Ptr&& p) {
+    new_heading = true;
+    f->_prepare_new_head(dpp, tid, call(std::move(p)));
+  }
+
+  // Dispatch: r is a push_entries() result unless a new-head preparation
+  // is in flight, in which case it is the _prepare_new_head() result.
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    if (!new_heading) {
+      if (r == -ERANGE) {
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                      << " need new head tid=" << tid << dendl;
+       new_head(dpp, std::move(p));
+       return;
+      }
+      if (r < 0) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " push_entries failed: r=" << r
+                     << " tid=" << tid << dendl;
+       complete(std::move(p), r);
+       return;
+      }
+      i = 0; // We've made forward progress, so reset the race counter!
+      prep_then_push(dpp, std::move(p), r);
+    } else {
+      if (r < 0) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " prepare_new_head failed: r=" << r
+                     << " tid=" << tid << dendl;
+       complete(std::move(p), r);
+       return;
+      }
+      new_heading = false;
+      handle_new_head(dpp, std::move(p), r);
+    }
+  }
+
+  // Continue after a new head is ready.
+  // NOTE(review): the r < 0 branch in handle() already completes on
+  // -ECANCELED, so the retry path below appears unreachable — confirm.
+  void handle_new_head(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    if (r == -ECANCELED) {
+      if (p->i == MAX_RACE_RETRIES) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " canceled too many times, giving up: tid=" << tid << dendl;
+       complete(std::move(p), -ECANCELED);
+       return;
+      }
+      ++p->i;
+    } else if (r) {
+      complete(std::move(p), r);
+      return;
+    }
+
+    if (p->batch.empty()) {
+      prep_then_push(dpp, std::move(p), 0);
+      return;
+    } else {
+      push(std::move(p));
+      return;
+    }
+  }
+
+  Pusher(const DoutPrefixProvider *dpp, FIFO* f, std::deque<cb::list>&& remaining,
+        std::uint64_t tid, lr::AioCompletion* super)
+    : Completion(dpp, super), f(f), remaining(std::move(remaining)),
+      tid(tid) {}
+};
+
+// Asynchronously push a batch of entries.  Size validation happens inline;
+// the batching/retry work is driven by the Pusher completion above.
+void FIFO::push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs,
+               lr::AioCompletion* c)
+{
+  std::unique_lock l(m);
+  auto tid = ++next_tid;
+  auto max_entry_size = info.params.max_entry_size;
+  auto need_new_head = info.need_new_head();
+  l.unlock();
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  auto p = std::make_unique<Pusher>(dpp, this, std::deque<cb::list>(data_bufs.begin(), data_bufs.end()),
+                                   tid, c);
+  // Validate sizes
+  for (const auto& bl : data_bufs) {
+    if (bl.length() > max_entry_size) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entry bigger than max_entry_size tid=" << tid << dendl;
+      Pusher::complete(std::move(p), -E2BIG);
+      return;
+    }
+  }
+
+  if (data_bufs.empty() ) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " empty push, returning success tid=" << tid << dendl;
+    Pusher::complete(std::move(p), 0);
+    return;
+  }
+
+  if (need_new_head) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " need new head tid=" << tid << dendl;
+    // Well-defined despite the move: the callee expression p->new_head is
+    // evaluated before the argument moves p (C++17 sequencing rules).
+    p->new_head(dpp, std::move(p));
+  } else {
+    p->prep_then_push(dpp, std::move(p), 0);
+  }
+}
+
+int FIFO::list(const DoutPrefixProvider *dpp, int max_entries,
+              std::optional<std::string_view> markstr,
+              std::vector<list_entry>* presult, bool* pmore,
+              optional_yield y)
+{
+  std::unique_lock l(m);
+  auto tid = ++next_tid;
+  std::int64_t part_num = info.tail_part_num;
+  l.unlock();
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  std::uint64_t ofs = 0;
+  if (markstr) {
+    auto marker = to_marker(*markstr);
+    if (!marker) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " invalid marker string: " << markstr
+                << " tid= "<< tid << dendl;
+      return -EINVAL;
+    }
+    part_num = marker->num;
+    ofs = marker->ofs;
+  }
+
+  std::vector<list_entry> result;
+  result.reserve(max_entries);
+  bool more = false;
+
+  std::vector<fifo::part_list_entry> entries;
+  int r = 0;
+  while (max_entries > 0) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " max_entries=" << max_entries << " tid=" << tid << dendl;
+    bool part_more = false;
+    bool part_full = false;
+
+    std::unique_lock l(m);
+    auto part_oid = info.part_oid(part_num);
+    l.unlock();
+
+    r = list_part(dpp, ioctx, part_oid, {}, ofs, max_entries, &entries,
+                 &part_more, &part_full, nullptr, tid, y);
+    if (r == -ENOENT) {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << " missing part, rereading metadata"
+                    << " tid= "<< tid << dendl;
+      r = read_meta(dpp, tid, y);
+      if (r < 0) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " read_meta failed: r=" << r
+                  << " tid= "<< tid << dendl;
+       return r;
+      }
+      if (part_num < info.tail_part_num) {
+       /* raced with trim? restart */
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                      << " raced with trim, restarting: tid=" << tid << dendl;
+       max_entries += result.size();
+       result.clear();
+       std::unique_lock l(m);
+       part_num = info.tail_part_num;
+       l.unlock();
+       ofs = 0;
+       continue;
+      }
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << " assuming part was not written yet, so end of data: "
+                    << "tid=" << tid << dendl;
+      more = false;
+      r = 0;
+      break;
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " list_entries failed: r=" << r
+                << " tid= "<< tid << dendl;
+      return r;
+    }
+    more = part_full || part_more;
+    for (auto& entry : entries) {
+      list_entry e;
+      e.data = std::move(entry.data);
+      e.marker = marker{part_num, entry.ofs}.to_string();
+      e.mtime = entry.mtime;
+      result.push_back(std::move(e));
+      --max_entries;
+      if (max_entries == 0)
+       break;
+    }
+    entries.clear();
+    if (max_entries > 0 &&
+       part_more) {
+    }
+
+    if (!part_full) {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << " head part is not full, so we can assume we're done: "
+                    << "tid=" << tid << dendl;
+      break;
+    }
+    if (!part_more) {
+      ++part_num;
+      ofs = 0;
+    }
+  }
+  if (presult)
+    *presult = std::move(result);
+  if (pmore)
+    *pmore =  more;
+  return 0;
+}
+
+int FIFO::trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, optional_yield y)
+{
+  bool overshoot = false;
+  auto marker = to_marker(markstr);
+  if (!marker) {
+    return -EINVAL;
+  }
+  auto part_num = marker->num;
+  auto ofs = marker->ofs;
+  std::unique_lock l(m);
+  auto tid = ++next_tid;
+  auto hn = info.head_part_num;
+  const auto max_part_size = info.params.max_part_size;
+  if (part_num > hn) {
+    l.unlock();
+    auto r = read_meta(dpp, tid, y);
+    if (r < 0) {
+      return r;
+    }
+    l.lock();
+    auto hn = info.head_part_num;
+    if (part_num > hn) {
+      overshoot = true;
+      part_num = hn;
+      ofs = max_part_size;
+    }
+  }
+  if (part_num < info.tail_part_num) {
+    return -ENODATA;
+  }
+  auto pn = info.tail_part_num;
+  l.unlock();
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+
+  int r = 0;
+  while (pn < part_num) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " pn=" << pn << " tid=" << tid << dendl;
+    std::unique_lock l(m);
+    l.unlock();
+    r = trim_part(dpp, pn, max_part_size, std::nullopt, false, tid, y);
+    if (r < 0 && r == -ENOENT) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " trim_part failed: r=" << r
+                << " tid= "<< tid << dendl;
+      return r;
+    }
+    ++pn;
+  }
+  r = trim_part(dpp, part_num, ofs, std::nullopt, exclusive, tid, y);
+  if (r < 0 && r != -ENOENT) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " trim_part failed: r=" << r
+              << " tid= "<< tid << dendl;
+    return r;
+  }
+
+  l.lock();
+  auto tail_part_num = info.tail_part_num;
+  auto objv = info.version;
+  l.unlock();
+  bool canceled = tail_part_num < part_num;
+  int retries = 0;
+  while ((tail_part_num < part_num) &&
+        canceled &&
+        (retries <= MAX_RACE_RETRIES)) {
+    r = _update_meta(dpp, fifo::update{}.tail_part_num(part_num), objv, &canceled,
+                    tid, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " _update_meta failed: r=" << r
+                << " tid= "<< tid << dendl;
+      return r;
+    }
+    if (canceled) {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << " canceled: retries=" << retries
+                    << " tid=" << tid << dendl;
+      l.lock();
+      tail_part_num = info.tail_part_num;
+      objv = info.version;
+      l.unlock();
+      ++retries;
+    }
+  }
+  if (canceled) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " canceled too many times, giving up: tid=" << tid << dendl;
+    return -EIO;
+  }
+  return overshoot ? -ENODATA : 0;
+}
+
+// Completion state machine behind the asynchronous FIFO::trim().  Phases:
+// optional metadata re-read ('reread'), trimming whole parts below the
+// target ('pn' marches toward 'part_num'), trimming the target part, then
+// advancing tail_part_num in the metadata ('update') with bounded retries.
+struct Trimmer : public Completion<Trimmer> {
+  FIFO* fifo;
+  std::int64_t part_num;   // part containing the trim point
+  std::uint64_t ofs;       // offset within that part
+  std::int64_t pn;         // next whole part to trim
+  bool exclusive;
+  std::uint64_t tid;
+  bool update = false;     // last op was the tail_part_num metadata update
+  bool reread = false;     // last op was a read_meta() refresh
+  bool canceled = false;
+  bool overshoot = false;  // marker was past head; report -ENODATA at the end
+  int retries = 0;
+
+  Trimmer(const DoutPrefixProvider *dpp, FIFO* fifo, std::int64_t part_num, std::uint64_t ofs, std::int64_t pn,
+         bool exclusive, lr::AioCompletion* super, std::uint64_t tid)
+    : Completion(dpp, super), fifo(fifo), part_num(part_num), ofs(ofs), pn(pn),
+      exclusive(exclusive), tid(tid) {}
+
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " entering: tid=" << tid << dendl;
+
+    if (reread) {
+      // A read_meta() just completed: re-derive the trim bounds from the
+      // refreshed metadata and resume trimming.
+      reread = false;
+      if (r < 0) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " read_meta failed: r="
+                  << r << " tid=" << tid << dendl;
+       complete(std::move(p), r);
+       return;
+      }
+      std::unique_lock l(fifo->m);
+      auto hn = fifo->info.head_part_num;
+      const auto max_part_size = fifo->info.params.max_part_size;
+      const auto tail_part_num = fifo->info.tail_part_num;
+      l.unlock();
+      if (part_num > hn) {
+       // Still beyond the head: clamp and trim the head part completely.
+       part_num = hn;
+       ofs = max_part_size;
+       overshoot = true;
+      }
+      if (part_num < tail_part_num) {
+       complete(std::move(p), -ENODATA);
+       return;
+      }
+      pn = tail_part_num;
+      if (pn < part_num) {
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                      << " pn=" << pn << " tid=" << tid << dendl;
+       fifo->trim_part(dpp, pn++, max_part_size, std::nullopt,
+                       false, tid, call(std::move(p)));
+      } else {
+       update = true;
+       canceled = tail_part_num < part_num;
+       fifo->trim_part(dpp, part_num, ofs, std::nullopt, exclusive, tid,
+                       call(std::move(p)));
+      }
+      return;
+    }
+
+    // A part that is already gone is fine for trim purposes.
+    if (r == -ENOENT) {
+      r = 0;
+    }
+
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << (update ? " update_meta " : " trim ") << "failed: r="
+                << r << " tid=" << tid << dendl;
+      complete(std::move(p), r);
+      return;
+    }
+
+    if (!update) {
+      // A part trim just completed: either trim the next whole part or
+      // move on to the marker part and then the metadata update.
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << " handling preceding trim callback: tid=" << tid << dendl;
+      retries = 0;
+      if (pn < part_num) {
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                      << " pn=" << pn << " tid=" << tid << dendl;
+       std::unique_lock l(fifo->m);
+       const auto max_part_size = fifo->info.params.max_part_size;
+       l.unlock();
+       fifo->trim_part(dpp, pn++, max_part_size, std::nullopt,
+                       false, tid, call(std::move(p)));
+       return;
+      }
+
+      std::unique_lock l(fifo->m);
+      const auto tail_part_num = fifo->info.tail_part_num;
+      l.unlock();
+      update = true;
+      canceled = tail_part_num < part_num;
+      fifo->trim_part(dpp, part_num, ofs, std::nullopt, exclusive, tid,
+                     call(std::move(p)));
+      return;
+    }
+
+    // The tail_part_num metadata update completed: retry on races,
+    // bounded by MAX_RACE_RETRIES.
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " handling update-needed callback: tid=" << tid << dendl;
+    std::unique_lock l(fifo->m);
+    auto tail_part_num = fifo->info.tail_part_num;
+    auto objv = fifo->info.version;
+    l.unlock();
+    if ((tail_part_num < part_num) &&
+       canceled) {
+      if (retries > MAX_RACE_RETRIES) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " canceled too many times, giving up: tid=" << tid << dendl;
+       complete(std::move(p), -EIO);
+       return;
+      }
+      ++retries;
+      fifo->_update_meta(dpp, fifo::update{}
+                        .tail_part_num(part_num), objv, &canceled,
+                         tid, call(std::move(p)));
+    } else {
+      complete(std::move(p), overshoot ? -ENODATA : 0);
+    }
+  }
+};
+
+void FIFO::trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive,
+               lr::AioCompletion* c) {
+  auto marker = to_marker(markstr);
+  auto realmark = marker.value_or(::rgw::cls::fifo::marker{});
+  std::unique_lock l(m);
+  const auto hn = info.head_part_num;
+  const auto max_part_size = info.params.max_part_size;
+  const auto pn = info.tail_part_num;
+  const auto part_oid = info.part_oid(pn);
+  auto tid = ++next_tid;
+  l.unlock();
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " entering: tid=" << tid << dendl;
+  auto trimmer = std::make_unique<Trimmer>(dpp, this, realmark.num, realmark.ofs,
+                                          pn, exclusive, c, tid);
+  if (!marker) {
+    Trimmer::complete(std::move(trimmer), -EINVAL);
+    return;
+  }
+  ++trimmer->pn;
+  auto ofs = marker->ofs;
+  if (marker->num > hn) {
+    trimmer->reread = true;
+    read_meta(dpp, tid, Trimmer::call(std::move(trimmer)));
+    return;
+  }
+  if (pn < marker->num) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << " pn=" << pn << " tid=" << tid << dendl;
+    ofs = max_part_size;
+  } else {
+    trimmer->update = true;
+  }
+  trim_part(dpp, pn, ofs, std::nullopt, exclusive,
+           tid, Trimmer::call(std::move(trimmer)));
+}
+
+int FIFO::get_part_info(const DoutPrefixProvider *dpp, int64_t part_num,
+                       fifo::part_header* header,
+                       optional_yield y)
+{ // blocking/yielding fetch of one part's header
+  std::unique_lock l(m);
+  const auto part_oid = info.part_oid(part_num); // snapshot oid under lock
+  auto tid = ++next_tid;
+  l.unlock();
+  auto r = rgw::cls::fifo::get_part_info(dpp, ioctx, part_oid, header, tid, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " get_part_info failed: r="
+              << r << " tid=" << tid << dendl;
+  }
+  return r;
+}
+
+void FIFO::get_part_info(int64_t part_num,
+                        fifo::part_header* header,
+                        lr::AioCompletion* c)
+{ // async variant: completion 'c' fires when *header is filled
+  std::unique_lock l(m);
+  const auto part_oid = info.part_oid(part_num); // snapshot oid under lock
+  auto tid = ++next_tid;
+  l.unlock();
+  auto op = rgw::cls::fifo::get_part_info(cct, header, tid);
+  auto r = ioctx.aio_operate(part_oid, c, &op, nullptr);
+  ceph_assert(r >= 0); // aio_operate only fails on null completion/snapshots
+}
+
+struct InfoGetter : Completion<InfoGetter> { // two-stage: read_meta, then fetch head part header
+  FIFO* fifo;
+  fifo::part_header header; // filled by the second stage
+  fu2::function<void(int r, fifo::part_header&&)> f; // user callback; may be empty
+  std::uint64_t tid;
+  bool headerread = false; // false: waiting on read_meta; true: waiting on get_part_info
+
+  InfoGetter(const DoutPrefixProvider *dpp, FIFO* fifo, fu2::function<void(int r, fifo::part_header&&)> f,
+            std::uint64_t tid, lr::AioCompletion* super)
+    : Completion(dpp, super), fifo(fifo), f(std::move(f)), tid(tid) {}
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    if (!headerread) { // stage 1 finished: metadata refresh
+      if (r < 0) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " read_meta failed: r="
+                        << r << " tid=" << tid << dendl;
+       if (f)
+         f(r, {});
+       complete(std::move(p), r);
+       return;
+      }
+
+      auto info = fifo->meta();
+      auto hpn = info.head_part_num;
+      if (hpn < 0) { // no head part exists yet: report success with empty header
+       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                            << " no head, returning empty partinfo r="
+                            << r << " tid=" << tid << dendl;
+       if (f)
+         f(0, {});
+       complete(std::move(p), r);
+       return;
+      }
+      headerread = true; // next callback is the part-info read
+      auto op = rgw::cls::fifo::get_part_info(fifo->cct, &header, tid);
+      std::unique_lock l(fifo->m);
+      auto oid = fifo->info.part_oid(hpn);
+      l.unlock();
+      r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op,
+                                 nullptr);
+      ceph_assert(r >= 0);
+      return;
+    }
+
+    // stage 2 finished: deliver header (or error) to the user callback
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                      << " get_part_info failed: r="
+                      << r << " tid=" << tid << dendl;
+    }
+
+    if (f)
+      f(r, std::move(header));
+    complete(std::move(p), r);
+    return;
+  }
+};
+
+void FIFO::get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function<void(int r,
+                                                  fifo::part_header&&)> f,
+                        lr::AioCompletion* c)
+{ // convenience: refresh metadata, then hand the head part's header to 'f'
+  std::unique_lock l(m);
+  auto tid = ++next_tid;
+  l.unlock();
+  auto ig = std::make_unique<InfoGetter>(dpp, this, std::move(f), tid, c);
+  read_meta(dpp, tid, InfoGetter::call(std::move(ig)));
+}
+
+struct JournalProcessor : public Completion<JournalProcessor> {
+private:
+  FIFO* const fifo;
+
+  std::vector<fifo::journal_entry> processed;
+  std::multimap<std::int64_t, fifo::journal_entry> journal;
+  std::multimap<std::int64_t, fifo::journal_entry>::iterator iter;
+  std::int64_t new_tail;
+  std::int64_t new_head;
+  std::int64_t new_max;
+  int race_retries = 0;
+  bool first_pp = true;
+  bool canceled = false;
+  std::uint64_t tid;
+
+  enum {
+    entry_callback,
+    pp_callback,
+  } state;
+
+  void create_part(const DoutPrefixProvider *dpp, Ptr&& p, int64_t part_num,
+                  std::string_view tag) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " entering: tid=" << tid << dendl;
+    state = entry_callback;
+    lr::ObjectWriteOperation op;
+    op.create(false); /* We don't need exclusivity, part_init ensures
+                        we're creating from the  same journal entry. */
+    std::unique_lock l(fifo->m);
+    part_init(&op, tag, fifo->info.params);
+    auto oid = fifo->info.part_oid(part_num);
+    l.unlock();
+    auto r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op);
+    ceph_assert(r >= 0);
+    return;
+  }
+
+  void remove_part(const DoutPrefixProvider *dpp, Ptr&& p, int64_t part_num,
+                  std::string_view tag) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " entering: tid=" << tid << dendl;
+    state = entry_callback;
+    lr::ObjectWriteOperation op;
+    op.remove();
+    std::unique_lock l(fifo->m);
+    auto oid = fifo->info.part_oid(part_num);
+    l.unlock();
+    auto r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op);
+    ceph_assert(r >= 0);
+    return;
+  }
+
+  void finish_je(const DoutPrefixProvider *dpp, Ptr&& p, int r,
+                const fifo::journal_entry& entry) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " entering: tid=" << tid << dendl;
+
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " finishing entry: entry=" << entry
+                        << " tid=" << tid << dendl;
+
+    if (entry.op == fifo::journal_entry::Op::remove && r == -ENOENT)
+      r = 0;
+
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                      << " processing entry failed: entry=" << entry
+                      << " r=" << r << " tid=" << tid << dendl;
+      complete(std::move(p), r);
+      return;
+    } else {
+      switch (entry.op) {
+      case fifo::journal_entry::Op::unknown:
+      case fifo::journal_entry::Op::set_head:
+       // Can't happen. Filtered out in process.
+       complete(std::move(p), -EIO);
+       return;
+
+      case fifo::journal_entry::Op::create:
+       if (entry.part_num > new_max) {
+         new_max = entry.part_num;
+       }
+       break;
+      case fifo::journal_entry::Op::remove:
+       if (entry.part_num >= new_tail) {
+         new_tail = entry.part_num + 1;
+       }
+       break;
+      }
+      processed.push_back(entry);
+    }
+    ++iter;
+    process(dpp, std::move(p));
+  }
+
+  void postprocess(const DoutPrefixProvider *dpp, Ptr&& p) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " entering: tid=" << tid << dendl;
+    if (processed.empty()) {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                          << " nothing to update any more: race_retries="
+                          << race_retries << " tid=" << tid << dendl;
+      complete(std::move(p), 0);
+      return;
+    }
+    pp_run(dpp, std::move(p), 0, false);
+  }
+
+public:
+
+  JournalProcessor(const DoutPrefixProvider *dpp, FIFO* fifo, std::uint64_t tid, lr::AioCompletion* super)
+    : Completion(dpp, super), fifo(fifo), tid(tid) {
+    std::unique_lock l(fifo->m);
+    journal = fifo->info.journal;
+    iter = journal.begin();
+    new_tail = fifo->info.tail_part_num;
+    new_head = fifo->info.head_part_num;
+    new_max = fifo->info.max_push_part_num;
+  }
+
+  void pp_run(const DoutPrefixProvider *dpp, Ptr&& p, int r, bool canceled) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " entering: tid=" << tid << dendl;
+    std::optional<int64_t> tail_part_num;
+    std::optional<int64_t> head_part_num;
+    std::optional<int64_t> max_part_num;
+
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                      << " failed, r=: " << r << " tid=" << tid << dendl;
+      complete(std::move(p), r);
+    }
+
+
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " postprocessing: race_retries="
+                        << race_retries << " tid=" << tid << dendl;
+
+    if (!first_pp && r == 0 && !canceled) {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                          << " nothing to update any more: race_retries="
+                          << race_retries << " tid=" << tid << dendl;
+      complete(std::move(p), 0);
+      return;
+    }
+
+    first_pp = false;
+
+    if (canceled) {
+      if (race_retries >= MAX_RACE_RETRIES) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " canceled too many times, giving up: tid="
+                        << tid << dendl;
+       complete(std::move(p), -ECANCELED);
+       return;
+      }
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                          << " update canceled, retrying: race_retries="
+                          << race_retries << " tid=" << tid << dendl;
+
+      ++race_retries;
+
+      std::vector<fifo::journal_entry> new_processed;
+      std::unique_lock l(fifo->m);
+      for (auto& e : processed) {
+       auto jiter = fifo->info.journal.find(e.part_num);
+       /* journal entry was already processed */
+       if (jiter == fifo->info.journal.end() ||
+           !(jiter->second == e)) {
+         continue;
+       }
+       new_processed.push_back(e);
+      }
+      processed = std::move(new_processed);
+    }
+
+    std::unique_lock l(fifo->m);
+    auto objv = fifo->info.version;
+    if (new_tail > fifo->info.tail_part_num) {
+      tail_part_num = new_tail;
+    }
+
+    if (new_head > fifo->info.head_part_num) {
+      head_part_num = new_head;
+    }
+
+    if (new_max > fifo->info.max_push_part_num) {
+      max_part_num = new_max;
+    }
+    l.unlock();
+
+    if (processed.empty() &&
+       !tail_part_num &&
+       !max_part_num) {
+      /* nothing to update anymore */
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                          << " nothing to update any more: race_retries="
+                          << race_retries << " tid=" << tid << dendl;
+      complete(std::move(p), 0);
+      return;
+    }
+    state = pp_callback;
+    fifo->_update_meta(dpp, fifo::update{}
+                      .tail_part_num(tail_part_num)
+                      .head_part_num(head_part_num)
+                      .max_push_part_num(max_part_num)
+                      .journal_entries_rm(processed),
+                       objv, &this->canceled, tid, call(std::move(p)));
+    return;
+  }
+
+  JournalProcessor(const JournalProcessor&) = delete;
+  JournalProcessor& operator =(const JournalProcessor&) = delete;
+  JournalProcessor(JournalProcessor&&) = delete;
+  JournalProcessor& operator =(JournalProcessor&&) = delete;
+
+  void process(const DoutPrefixProvider *dpp, Ptr&& p) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " entering: tid=" << tid << dendl;
+    while (iter != journal.end()) {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                          << " processing entry: entry=" << *iter
+                          << " tid=" << tid << dendl;
+      const auto entry = iter->second;
+      switch (entry.op) {
+      case fifo::journal_entry::Op::create:
+       create_part(dpp, std::move(p), entry.part_num, entry.part_tag);
+       return;
+      case fifo::journal_entry::Op::set_head:
+       if (entry.part_num > new_head) {
+         new_head = entry.part_num;
+       }
+       processed.push_back(entry);
+       ++iter;
+       continue;
+      case fifo::journal_entry::Op::remove:
+       remove_part(dpp, std::move(p), entry.part_num, entry.part_tag);
+       return;
+      default:
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " unknown journaled op: entry=" << entry << " tid="
+                        << tid << dendl;
+       complete(std::move(p), -EIO);
+       return;
+      }
+    }
+    postprocess(dpp, std::move(p));
+    return;
+  }
+
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << " entering: tid=" << tid << dendl;
+    switch (state) {
+    case entry_callback:
+      finish_je(dpp, std::move(p), r, iter->second);
+      return;
+    case pp_callback:
+      auto c = canceled;
+      canceled = false;
+      pp_run(dpp, std::move(p), r, c);
+      return;
+    }
+
+    abort();
+  }
+
+};
+
+void FIFO::process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c) { // async journal replay
+  auto p = std::make_unique<JournalProcessor>(dpp, this, tid, c);
+  p->process(dpp, std::move(p)); // C++17: 'p->' is sequenced before the argument move, so this is safe
+}
+
+struct Lister : Completion<Lister> { // async listing across parts, starting at (part_num, ofs)
+  FIFO* f;
+  std::vector<list_entry> result; // accumulated output entries
+  bool more = false;
+  std::int64_t part_num;
+  std::uint64_t ofs;
+  int max_entries; // remaining entry budget; decremented per part listed
+  int r_out = 0;
+  std::vector<fifo::part_list_entry> entries; // per-part scratch buffer
+  bool part_more = false;
+  bool part_full = false;
+  std::vector<list_entry>* entries_out;
+  bool* more_out;
+  std::uint64_t tid;
+
+  bool read = false; // true while waiting on a read_meta issued after -ENOENT
+
+  void complete(Ptr&& p, int r) { // deliver out-params only on success, then forward to base
+    if (r >= 0) {
+      if (more_out) *more_out = more;
+      if (entries_out) *entries_out = std::move(result);
+    }
+    Completion::complete(std::move(p), r);
+  }
+
+public:
+  Lister(const DoutPrefixProvider *dpp, FIFO* f, std::int64_t part_num, std::uint64_t ofs, int max_entries,
+        std::vector<list_entry>* entries_out, bool* more_out,
+        std::uint64_t tid, lr::AioCompletion* super)
+    : Completion(dpp, super), f(f), part_num(part_num), ofs(ofs), max_entries(max_entries),
+      entries_out(entries_out), more_out(more_out), tid(tid) {
+    result.reserve(max_entries);
+  }
+
+  Lister(const Lister&) = delete;
+  Lister& operator =(const Lister&) = delete;
+  Lister(Lister&&) = delete;
+  Lister& operator =(Lister&&) = delete;
+
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) { // dispatch: read_meta result vs. list_part result
+    if (read)
+      handle_read(std::move(p), r);
+    else
+      handle_list(dpp, std::move(p), r);
+  }
+
+  void list(Ptr&& p) { // issue a list op on the current part (or finish if budget exhausted)
+    if (max_entries > 0) {
+      part_more = false;
+      part_full = false;
+      entries.clear();
+
+      std::unique_lock l(f->m);
+      auto part_oid = f->info.part_oid(part_num);
+      l.unlock();
+
+      read = false;
+      auto op = list_part(f->cct, {}, ofs, max_entries, &r_out,
+                         &entries, &part_more, &part_full,
+                         nullptr, tid);
+      f->ioctx.aio_operate(part_oid, call(std::move(p)), &op, nullptr);
+    } else {
+      complete(std::move(p), 0);
+    }
+  }
+
+  void handle_read(Ptr&& p, int r) { // after re-reading metadata because a part was missing
+    read = false;
+    if (r >= 0) r = r_out;
+    r_out = 0;
+
+    if (r < 0) {
+      complete(std::move(p), r);
+      return;
+    }
+
+    // NOTE(review): reads f->info without holding f->m, unlike other accessors — confirm
+    if (part_num < f->info.tail_part_num) {
+      /* raced with trim? restart */
+      max_entries += result.size();
+      result.clear();
+      part_num = f->info.tail_part_num;
+      ofs = 0;
+      list(std::move(p));
+      return;
+    }
+    /* assuming part was not written yet, so end of data */
+    more = false;
+    complete(std::move(p), 0);
+    return;
+  }
+
+  void handle_list(const DoutPrefixProvider *dpp, Ptr&& p, int r) { // after a list_part op completes
+    if (r >= 0) r = r_out;
+    r_out = 0;
+    std::unique_lock l(f->m);
+    auto part_oid = f->info.part_oid(part_num);
+    l.unlock();
+    if (r == -ENOENT) { // part object missing: refresh metadata to distinguish trim vs. end
+      read = true;
+      f->read_meta(dpp, tid, call(std::move(p)));
+      return;
+    }
+    if (r < 0) {
+      complete(std::move(p), r);
+      return;
+    }
+
+    more = part_full || part_more;
+    for (auto& entry : entries) {
+      list_entry e;
+      e.data = std::move(entry.data);
+      e.marker = marker{part_num, entry.ofs}.to_string();
+      e.mtime = entry.mtime;
+      result.push_back(std::move(e));
+    }
+    max_entries -= entries.size();
+    entries.clear();
+    if (max_entries > 0 && part_more) { // same part still has entries
+      list(std::move(p));
+      return;
+    }
+
+    if (!part_full) { /* head part is not full */
+      complete(std::move(p), 0);
+      return;
+    }
+    ++part_num; // part exhausted and sealed: advance to the next one
+    ofs = 0;
+    list(std::move(p));
+  }
+};
+
+void FIFO::list(const DoutPrefixProvider *dpp, int max_entries,
+               std::optional<std::string_view> markstr,
+               std::vector<list_entry>* out,
+               bool* more,
+               lr::AioCompletion* c) { // async list; starts at tail unless markstr parses
+  std::unique_lock l(m);
+  auto tid = ++next_tid;
+  std::int64_t part_num = info.tail_part_num;
+  l.unlock();
+  std::uint64_t ofs = 0;
+  std::optional<::rgw::cls::fifo::marker> marker;
+
+  if (markstr) {
+    marker = to_marker(*markstr);
+    if (marker) {
+      part_num = marker->num;
+      ofs = marker->ofs;
+    }
+  }
+
+  auto ls = std::make_unique<Lister>(dpp, this, part_num, ofs, max_entries, out,
+                                    more, tid, c);
+  if (markstr && !marker) { // marker string supplied but unparsable
+    auto l = ls.get();
+    l->complete(std::move(ls), -EINVAL);
+  } else {
+    ls->list(std::move(ls));
+  }
+}
+}
diff --git a/src/rgw/driver/rados/cls_fifo_legacy.h b/src/rgw/driver/rados/cls_fifo_legacy.h
new file mode 100644 (file)
index 0000000..9a35e4d
--- /dev/null
@@ -0,0 +1,342 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RGW_CLS_FIFO_LEGACY_H
+#define CEPH_RGW_CLS_FIFO_LEGACY_H
+
+#include <cstdint>
+#include <deque>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string_view>
+#include <vector>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include <fmt/format.h>
+
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+#include "include/function2.hpp"
+
+#include "common/async/yield_context.h"
+
+#include "cls/fifo/cls_fifo_types.h"
+#include "cls/fifo/cls_fifo_ops.h"
+
+#include "librados/AioCompletionImpl.h"
+
+#include "rgw_tools.h"
+
+namespace rgw::cls::fifo {
+namespace cb = ceph::buffer;
+namespace fifo = rados::cls::fifo;
+namespace lr = librados;
+
+inline constexpr std::uint64_t default_max_part_size = 4 * 1024 * 1024;
+inline constexpr std::uint64_t default_max_entry_size = 32 * 1024;
+
+void create_meta(lr::ObjectWriteOperation* op, std::string_view id,
+                std::optional<fifo::objv> objv,
+                std::optional<std::string_view> oid_prefix,
+                bool exclusive = false,
+                std::uint64_t max_part_size = default_max_part_size,
+                std::uint64_t max_entry_size = default_max_entry_size);
+int get_meta(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+            std::optional<fifo::objv> objv, fifo::info* info,
+            std::uint32_t* part_header_size,
+            std::uint32_t* part_entry_overhead,
+            std::uint64_t tid, optional_yield y,
+            bool probe = false);
+struct marker { // position within the FIFO: (part number, offset in part)
+  std::int64_t num = 0; // part number
+  std::uint64_t ofs = 0; // byte offset within that part
+
+  marker() = default;
+  marker(std::int64_t num, std::uint64_t ofs) : num(num), ofs(ofs) {}
+  static marker max() { // marker beyond any real position; useful as an upper bound
+    return { std::numeric_limits<decltype(num)>::max(),
+            std::numeric_limits<decltype(ofs)>::max() };
+  }
+
+  std::string to_string() const { // fixed-width "num:ofs"; const added — pure formatter, callers unaffected
+    return fmt::format("{:0>20}:{:0>20}", num, ofs);
+  }
+};
+
+struct list_entry { // one entry as returned by FIFO::list
+  cb::list data; // entry payload
+  std::string marker; // "num:ofs" position marker, usable for resume/trim
+  ceph::real_time mtime; // entry timestamp
+};
+
+using part_info = fifo::part_header;
+
+/// This is an implementation of FIFO using librados to facilitate
+/// backports. Please see /src/neorados/cls/fifo.h for full
+/// information.
+///
+/// This library uses optional_yield. Please see
+/// /src/common/async/yield_context.h. In summary, optional_yield
+/// contains either a spawn::yield_context (in which case the current
+/// coroutine is suspended until completion) or null_yield (in which
+/// case the current thread is blocked until completion.)
+///
+/// Please see the librados documentation for information on
+/// AioCompletion and IoCtx.
+
+class FIFO {
+  friend struct Reader;
+  friend struct Updater;
+  friend struct Trimmer;
+  friend struct InfoGetter;
+  friend struct Pusher;
+  friend struct NewPartPreparer;
+  friend struct NewHeadPreparer;
+  friend struct JournalProcessor;
+  friend struct Lister;
+
+  mutable lr::IoCtx ioctx;
+  CephContext* cct = static_cast<CephContext*>(ioctx.cct());
+  const std::string oid;
+  std::mutex m; // serializes access to info and next_tid
+  std::uint64_t next_tid = 0; // per-FIFO transaction id used for log correlation
+
+  fifo::info info;
+
+  std::uint32_t part_header_size = 0xdeadbeef; // sentinel; presumably set by get_meta — TODO confirm
+  std::uint32_t part_entry_overhead = 0xdeadbeef; // sentinel; presumably set by get_meta — TODO confirm
+
+  std::optional<marker> to_marker(std::string_view s);
+
+  FIFO(lr::IoCtx&& ioc,
+       std::string oid)
+    : ioctx(std::move(ioc)), oid(oid) {}
+
+  std::string generate_tag() const;
+
+  int apply_update(const DoutPrefixProvider *dpp,
+                   fifo::info* info,
+                  const fifo::objv& objv,
+                  const fifo::update& update,
+                  std::uint64_t tid);
+  int _update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+                  fifo::objv version, bool* pcanceled,
+                  std::uint64_t tid, optional_yield y);
+  void _update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+                   fifo::objv version, bool* pcanceled,
+                   std::uint64_t tid, lr::AioCompletion* c);
+  int create_part(const DoutPrefixProvider *dpp, int64_t part_num, std::string_view tag, std::uint64_t tid,
+                 optional_yield y);
+  int remove_part(const DoutPrefixProvider *dpp, int64_t part_num, std::string_view tag, std::uint64_t tid,
+                 optional_yield y);
+  int process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y);
+  void process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c);
+  int _prepare_new_part(const DoutPrefixProvider *dpp, bool is_head, std::uint64_t tid, optional_yield y);
+  void _prepare_new_part(const DoutPrefixProvider *dpp, bool is_head, std::uint64_t tid, lr::AioCompletion* c);
+  int _prepare_new_head(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y);
+  void _prepare_new_head(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c);
+  int push_entries(const DoutPrefixProvider *dpp, const std::deque<cb::list>& data_bufs,
+                  std::uint64_t tid, optional_yield y);
+  void push_entries(const std::deque<cb::list>& data_bufs,
+                   std::uint64_t tid, lr::AioCompletion* c);
+  int trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+               std::optional<std::string_view> tag, bool exclusive,
+               std::uint64_t tid, optional_yield y);
+  void trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+                std::optional<std::string_view> tag, bool exclusive,
+                std::uint64_t tid, lr::AioCompletion* c);
+
+  /// Force refresh of metadata, yielding/blocking style
+  int read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y);
+  /// Force refresh of metadata, with a librados Completion
+  void read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c);
+
+public:
+
+  FIFO(const FIFO&) = delete;
+  FIFO& operator =(const FIFO&) = delete;
+  FIFO(FIFO&&) = delete;
+  FIFO& operator =(FIFO&&) = delete;
+
+  /// Open an existing FIFO.
+  static int open(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, //< IO Context
+                 std::string oid, //< OID for metadata object
+                 std::unique_ptr<FIFO>* fifo, //< OUT: Pointer to FIFO object
+                 optional_yield y, //< Optional yield context
+                 /// Operation will fail if FIFO is not at this version
+                 std::optional<fifo::objv> objv = std::nullopt,
+                 /// Probing for existence, don't print errors if we
+                 /// can't find it.
+                 bool probe = false);
+  /// Create a new or open an existing FIFO.
+  static int create(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, //< IO Context
+                   std::string oid, //< OID for metadata object
+                   std::unique_ptr<FIFO>* fifo, //< OUT: Pointer to FIFO object
+                   optional_yield y, //< Optional yield context
+                   /// Operation will fail if the FIFO exists and is
+                   /// not of this version.
+                   std::optional<fifo::objv> objv = std::nullopt,
+                   /// Prefix for all objects
+                   std::optional<std::string_view> oid_prefix = std::nullopt,
+                   /// Fail if the FIFO already exists
+                   bool exclusive = false,
+                   /// Maximum allowed size of parts
+                   std::uint64_t max_part_size = default_max_part_size,
+                   /// Maximum allowed size of entries
+                   std::uint64_t max_entry_size = default_max_entry_size);
+
+  /// Force refresh of metadata, yielding/blocking style
+  int read_meta(const DoutPrefixProvider *dpp, optional_yield y);
+  /// Get currently known metadata
+  const fifo::info& meta() const;
+  /// Get partition header and entry overhead size
+  std::pair<std::uint32_t, std::uint32_t> get_part_layout_info() const;
+  /// Push an entry to the FIFO
+  int push(const DoutPrefixProvider *dpp, 
+           const cb::list& bl, //< Entry to push
+          optional_yield y //< Optional yield
+    );
+  /// Push an entry to the FIFO
+  void push(const DoutPrefixProvider *dpp, const cb::list& bl, //< Entry to push
+           lr::AioCompletion* c //< Async Completion
+    );
+  /// Push entries to the FIFO
+  int push(const DoutPrefixProvider *dpp, 
+           const std::vector<cb::list>& data_bufs, //< Entries to push
+          optional_yield y //< Optional yield
+    );
+  /// Push entries to the FIFO
+  void push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs, //< Entries to push
+           lr::AioCompletion* c //< Async Completion
+    );
+  /// List entries
+  int list(const DoutPrefixProvider *dpp, 
+           int max_entries, //< Maximum entries to list
+          /// Point after which to begin listing. Start at tail if null
+          std::optional<std::string_view> markstr,
+          std::vector<list_entry>* out, //< OUT: entries
+          /// OUT: True if more entries in FIFO beyond the last returned
+          bool* more,
+          optional_yield y //< Optional yield
+    );
+  void list(const DoutPrefixProvider *dpp, 
+            int max_entries, //< Maximum entries to list
+           /// Point after which to begin listing. Start at tail if null
+           std::optional<std::string_view> markstr,
+           std::vector<list_entry>* out, //< OUT: entries
+           /// OUT: True if more entries in FIFO beyond the last returned
+           bool* more,
+           lr::AioCompletion* c //< Async Completion
+    );
+  /// Trim entries, coroutine/block style
+  int trim(const DoutPrefixProvider *dpp, 
+           std::string_view markstr, //< Position to which to trim, inclusive
+          bool exclusive, //< If true, do not trim the target entry
+                          //< itself, just all those before it.
+          optional_yield y //< Optional yield
+    );
+  /// Trim entries, librados AioCompletion style
+  void trim(const DoutPrefixProvider *dpp, 
+            std::string_view markstr, //< Position to which to trim, inclusive
+           bool exclusive, //< If true, do not trim the target entry
+                           //< itself, just all those before it.
+           lr::AioCompletion* c //< librados AIO Completion
+    );
+  /// Get part info
+  int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, /// Part number
+                   fifo::part_header* header, //< OUT: Information
+                   optional_yield y //< Optional yield
+    );
+  /// Get part info
+  void get_part_info(int64_t part_num, //< Part number
+                   fifo::part_header* header, //< OUT: Information
+                   lr::AioCompletion* c //< AIO Completion
+    );
+  /// A convenience method to fetch the part information for the FIFO
+  /// head, using librados::AioCompletion, since
+  /// librados::AioCompletions compose lousily.
+  void get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function< //< Function to receive info
+                      void(int r, fifo::part_header&&)>,
+                    lr::AioCompletion* c //< AIO Completion
+    );
+};
+
+template<typename T>
+struct Completion { // CRTP base chaining librados AioCompletions into T::handle callbacks
+private:
+  const DoutPrefixProvider *_dpp;
+  lr::AioCompletion* _cur = nullptr; // completion for the op currently in flight
+  lr::AioCompletion* _super; // caller's completion, signalled once at the end
+public:
+
+  using Ptr = std::unique_ptr<T>;
+
+  lr::AioCompletion* cur() const {
+    return _cur;
+  }
+  lr::AioCompletion* super() const {
+    return _super;
+  }
+
+  Completion(const DoutPrefixProvider *dpp, lr::AioCompletion* super) : _dpp(dpp), _super(super) {
+    super->pc->get(); // hold a ref on the caller's completion for our lifetime
+  }
+
+  ~Completion() {
+    if (_super) {
+      _super->pc->put();
+    }
+    if (_cur)
+      _cur->release();
+    _super = nullptr;
+    _cur = nullptr;
+  }
+
+  // The only times that aio_operate can return an error are:
+  // 1. The completion contains a null pointer. This should just
+  //    crash, and in our case it does.
+  // 2. An attempt is made to write to a snapshot. RGW doesn't use
+  //    snapshots, so we don't care.
+  //
+  // So we will just assert that initiating an Aio operation succeeds
+  // and not worry about recovering.
+  static lr::AioCompletion* call(Ptr&& p) { // hand ownership of *p to librados for one op
+    p->_cur = lr::Rados::aio_create_completion(static_cast<void*>(p.get()),
+                                              &cb);
+    auto c = p->_cur;
+    p.release(); // ownership recovered in cb()
+    return c;
+  }
+  static void complete(Ptr&& p, int r) { // finish: signal caller's completion, destroy *p
+    auto c = p->_super;
+    p->_super = nullptr; // avoid double put in the destructor
+    rgw_complete_aio_completion(c, r);
+  }
+
+  static void cb(lr::completion_t, void* arg) { // librados callback: reclaim ownership, dispatch to T::handle
+    auto t = static_cast<T*>(arg);
+    auto r = t->_cur->get_return_value();
+    t->_cur->release();
+    t->_cur = nullptr;
+    t->handle(t->_dpp, Ptr(t), r);
+  }
+};
+
+}
+
+#endif // CEPH_RGW_CLS_FIFO_LEGACY_H
diff --git a/src/rgw/driver/rados/config/impl.cc b/src/rgw/driver/rados/config/impl.cc
new file mode 100644 (file)
index 0000000..f1b2bef
--- /dev/null
@@ -0,0 +1,129 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "impl.h"
+
+#include "common/async/yield_context.h"
+#include "common/errno.h"
+#include "rgw_string.h"
+#include "rgw_zone.h"
+
+namespace rgw::rados {
+
// default pool names
// All four config object classes share the "rgw.root" pool by default;
// the per-class conf options consulted in ConfigImpl's constructor can
// override each one independently.
constexpr std::string_view default_zone_root_pool = "rgw.root";
constexpr std::string_view default_zonegroup_root_pool = "rgw.root";
constexpr std::string_view default_realm_root_pool = "rgw.root";
constexpr std::string_view default_period_root_pool = "rgw.root";
+
+static rgw_pool default_pool(std::string_view name,
+                             std::string_view default_name)
+{
+  return std::string{name_or_default(name, default_name)};
+}
+
// Resolve each metadata pool from its conf option, falling back to the
// defaults above when the option is unset.
ConfigImpl::ConfigImpl(const ceph::common::ConfigProxy& conf)
  : realm_pool(default_pool(conf->rgw_realm_root_pool,
                            default_realm_root_pool)),
    period_pool(default_pool(conf->rgw_period_root_pool,
                             default_period_root_pool)),
    zonegroup_pool(default_pool(conf->rgw_zonegroup_root_pool,
                                default_zonegroup_root_pool)),
    zone_pool(default_pool(conf->rgw_zone_root_pool,
                           default_zone_root_pool))
{
}
+
+int ConfigImpl::read(const DoutPrefixProvider* dpp, optional_yield y,
+                     const rgw_pool& pool, const std::string& oid,
+                     bufferlist& bl, RGWObjVersionTracker* objv)
+{
+  librados::IoCtx ioctx;
+  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+  if (r < 0) {
+    return r;
+  }
+  librados::ObjectReadOperation op;
+  if (objv) {
+    objv->prepare_op_for_read(&op);
+  }
+  op.read(0, 0, &bl, nullptr);
+  return rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+}
+
+int ConfigImpl::write(const DoutPrefixProvider* dpp, optional_yield y,
+                      const rgw_pool& pool, const std::string& oid,
+                      Create create, const bufferlist& bl,
+                      RGWObjVersionTracker* objv)
+{
+  librados::IoCtx ioctx;
+  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+  switch (create) {
+    case Create::MustNotExist: op.create(true); break;
+    case Create::MayExist: op.create(false); break;
+    case Create::MustExist: op.assert_exists(); break;
+  }
+  if (objv) {
+    objv->prepare_op_for_write(&op);
+  }
+  op.write_full(bl);
+
+  r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (r >= 0 && objv) {
+    objv->apply_write();
+  }
+  return r;
+}
+
+int ConfigImpl::remove(const DoutPrefixProvider* dpp, optional_yield y,
+                       const rgw_pool& pool, const std::string& oid,
+                       RGWObjVersionTracker* objv)
+{
+  librados::IoCtx ioctx;
+  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+  if (objv) {
+    objv->prepare_op_for_write(&op);
+  }
+  op.remove();
+
+  r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (r >= 0 && objv) {
+    objv->apply_write();
+  }
+  return r;
+}
+
+int ConfigImpl::notify(const DoutPrefixProvider* dpp, optional_yield y,
+                       const rgw_pool& pool, const std::string& oid,
+                       bufferlist& bl, uint64_t timeout_ms)
+{
+  librados::IoCtx ioctx;
+  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+  if (r < 0) {
+    return r;
+  }
+  return rgw_rados_notify(dpp, ioctx, oid, bl, timeout_ms, nullptr, y);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/impl.h b/src/rgw/driver/rados/config/impl.h
new file mode 100644 (file)
index 0000000..3aed451
--- /dev/null
@@ -0,0 +1,139 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "rgw_basic_types.h"
+#include "rgw_tools.h"
+#include "rgw_sal_config.h"
+
+namespace rgw::rados {
+
// write options that control object creation
// (passed to ConfigImpl::write() to pick the rados write precondition)
enum class Create {
  MustNotExist, // fail with EEXIST if the object already exists
  MayExist, // create if the object didn't exist, overwrite if it did
  MustExist, // fail with ENOENT if the object doesn't exist
};
+
// Shared state for the rados config store: a rados client handle, the
// pools holding realm/period/zonegroup/zone objects, and the raw
// read/write/remove/list/notify primitives the store methods build on.
struct ConfigImpl {
  librados::Rados rados;

  // metadata pools, resolved from configuration at construction
  const rgw_pool realm_pool;
  const rgw_pool period_pool;
  const rgw_pool zonegroup_pool;
  const rgw_pool zone_pool;

  ConfigImpl(const ceph::common::ConfigProxy& conf);

  // read the raw object contents into 'bl', optionally guarded by *objv
  int read(const DoutPrefixProvider* dpp, optional_yield y,
           const rgw_pool& pool, const std::string& oid,
           bufferlist& bl, RGWObjVersionTracker* objv);

  // read and decode an object into 'data'; returns -EIO when the
  // stored bytes fail to decode as T
  template <typename T>
  int read(const DoutPrefixProvider* dpp, optional_yield y,
           const rgw_pool& pool, const std::string& oid,
           T& data, RGWObjVersionTracker* objv)
  {
    bufferlist bl;
    int r = read(dpp, y, pool, oid, bl, objv);
    if (r < 0) {
      return r;
    }
    try {
      auto p = bl.cbegin();
      decode(data, p);
    } catch (const buffer::error& err) {
      ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from "
          << pool << ":" << oid << dendl;
      return -EIO;
    }
    return 0;
  }

  // overwrite the object with the given bytes under the Create policy,
  // optionally guarded by *objv
  int write(const DoutPrefixProvider* dpp, optional_yield y,
            const rgw_pool& pool, const std::string& oid, Create create,
            const bufferlist& bl, RGWObjVersionTracker* objv);

  // encode 'data' and write it via the bufferlist overload above
  template <typename T>
  int write(const DoutPrefixProvider* dpp, optional_yield y,
            const rgw_pool& pool, const std::string& oid, Create create,
            const T& data, RGWObjVersionTracker* objv)
  {
    bufferlist bl;
    encode(data, bl);

    return write(dpp, y, pool, oid, create, bl, objv);
  }

  // delete the object, optionally guarded by *objv
  int remove(const DoutPrefixProvider* dpp, optional_yield y,
             const rgw_pool& pool, const std::string& oid,
             RGWObjVersionTracker* objv);

  // list object names in 'pool' starting at the cursor in 'marker'.
  // Each oid is passed through 'filter', which returns the entry to
  // report or an empty string to skip it. Fills 'entries' up to its
  // size; result.next holds the cursor to resume from (empty at end)
  // and result.entries views the filled prefix of 'entries'.
  int list(const DoutPrefixProvider* dpp, optional_yield y,
           const rgw_pool& pool, const std::string& marker,
           std::regular_invocable<std::string> auto filter,
           std::span<std::string> entries,
           sal::ListResult<std::string>& result)
  {
    librados::IoCtx ioctx;
    int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
    if (r < 0) {
      return r;
    }
    librados::ObjectCursor oc;
    if (!oc.from_str(marker)) {
      ldpp_dout(dpp, 10) << "failed to parse cursor: " << marker << dendl;
      return -EINVAL;
    }
    std::size_t count = 0;
    try {
      auto iter = ioctx.nobjects_begin(oc);
      const auto end = ioctx.nobjects_end();
      // stop at the end of the pool listing or when 'entries' is full
      for (; count < entries.size() && iter != end; ++iter) {
        std::string entry = filter(iter->get_oid());
        if (!entry.empty()) {
          entries[count++] = std::move(entry);
        }
      }
      if (iter == end) {
        result.next.clear();
      } else {
        result.next = iter.get_cursor().to_str();
      }
    } catch (const std::exception& e) {
      // librados object iteration can throw; surface it as an I/O error
      ldpp_dout(dpp, 10) << "NObjectIterator exception " << e.what() << dendl;
      return -EIO;
    }
    result.entries = entries.first(count);
    return 0;
  }

  // send a watch/notify message on the object
  int notify(const DoutPrefixProvider* dpp, optional_yield y,
             const rgw_pool& pool, const std::string& oid,
             bufferlist& bl, uint64_t timeout_ms);
};
+
// Return 'name' when it is non-empty, otherwise 'default_name'.
inline std::string_view name_or_default(std::string_view name,
                                        std::string_view default_name)
{
  return name.empty() ? default_name : name;
}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/period.cc b/src/rgw/driver/rados/config/period.cc
new file mode 100644 (file)
index 0000000..bc3fa27
--- /dev/null
@@ -0,0 +1,230 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
// period oids
// period objects are named "periods.<id>.<epoch>"; each period also
// has a "periods.<id>.latest_epoch" marker, and staging periods carry
// a ":staging" suffix on the id
constexpr std::string_view period_info_oid_prefix = "periods.";
constexpr std::string_view period_latest_epoch_info_oid = ".latest_epoch";
constexpr std::string_view period_staging_suffix = ":staging";
+
+static std::string period_oid(std::string_view period_id, uint32_t epoch)
+{
+  // omit the epoch for the staging period
+  if (period_id.ends_with(period_staging_suffix)) {
+    return string_cat_reserve(period_info_oid_prefix, period_id);
+  }
+  return fmt::format("{}{}.{}", period_info_oid_prefix, period_id, epoch);
+}
+
+static std::string latest_epoch_oid(const ceph::common::ConfigProxy& conf,
+                                    std::string_view period_id)
+{
+  return string_cat_reserve(
+      period_info_oid_prefix, period_id,
+      name_or_default(conf->rgw_period_latest_epoch_info_oid,
+                      period_latest_epoch_info_oid));
+}
+
+static int read_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+                             ConfigImpl* impl, std::string_view period_id,
+                             uint32_t& epoch, RGWObjVersionTracker* objv)
+{
+  const auto& pool = impl->period_pool;
+  const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+  RGWPeriodLatestEpochInfo latest;
+  int r = impl->read(dpp, y, pool, latest_oid, latest, objv);
+  if (r >= 0) {
+    epoch = latest.epoch;
+  }
+  return r;
+}
+
+static int write_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+                              ConfigImpl* impl, bool exclusive,
+                              std::string_view period_id, uint32_t epoch,
+                              RGWObjVersionTracker* objv)
+{
+  const auto& pool = impl->period_pool;
+  const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+  RGWPeriodLatestEpochInfo latest{epoch};
+  return impl->write(dpp, y, pool, latest_oid, create, latest, objv);
+}
+
+static int delete_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+                               ConfigImpl* impl, std::string_view period_id,
+                               RGWObjVersionTracker* objv)
+{
+  const auto& pool = impl->period_pool;
+  const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+  return impl->remove(dpp, y, pool, latest_oid, objv);
+}
+
// Raise the latest_epoch marker of 'period_id' to 'epoch' with a
// read-modify-write loop: each attempt re-reads the stored value and
// writes under the version it observed, retrying on create/version
// races. Returns -EEXIST when 'epoch' is not newer than the stored
// value, and -ECANCELED when races persist past MAX_RETRIES.
static int update_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
                               ConfigImpl* impl, std::string_view period_id,
                               uint32_t epoch)
{
  static constexpr int MAX_RETRIES = 20;

  for (int i = 0; i < MAX_RETRIES; i++) {
    uint32_t existing_epoch = 0;
    RGWObjVersionTracker objv;
    bool exclusive = false;

    // read existing epoch
    int r = read_latest_epoch(dpp, y, impl, period_id, existing_epoch, &objv);
    if (r == -ENOENT) {
      // use an exclusive create to set the epoch atomically
      exclusive = true;
      objv.generate_new_write_ver(dpp->get_cct());
      ldpp_dout(dpp, 20) << "creating initial latest_epoch=" << epoch
          << " for period=" << period_id << dendl;
    } else if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to read latest_epoch" << dendl;
      return r;
    } else if (epoch <= existing_epoch) {
      r = -EEXIST; // fail with EEXIST if epoch is not newer
      ldpp_dout(dpp, 10) << "found existing latest_epoch " << existing_epoch
          << " >= given epoch " << epoch << ", returning r=" << r << dendl;
      return r;
    } else {
      ldpp_dout(dpp, 20) << "updating latest_epoch from " << existing_epoch
          << " -> " << epoch << " on period=" << period_id << dendl;
    }

    r = write_latest_epoch(dpp, y, impl, exclusive, period_id, epoch, &objv);
    if (r == -EEXIST) {
      continue; // exclusive create raced with another update, retry
    } else if (r == -ECANCELED) {
      continue; // write raced with a conflicting version, retry
    }
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to write latest_epoch" << dendl;
      return r;
    }
    return 0; // return success
  }

  return -ECANCELED; // fail after max retries
}
+
+int RadosConfigStore::create_period(const DoutPrefixProvider* dpp,
+                                    optional_yield y, bool exclusive,
+                                    const RGWPeriod& info)
+{
+  if (info.get_id().empty()) {
+    ldpp_dout(dpp, 0) << "period cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.get_epoch() == 0) {
+    ldpp_dout(dpp, 0) << "period cannot have an empty epoch" << dendl;
+    return -EINVAL;
+  }
+  const auto& pool = impl->period_pool;
+  const auto info_oid = period_oid(info.get_id(), info.get_epoch());
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+  RGWObjVersionTracker objv;
+  objv.generate_new_write_ver(dpp->get_cct());
+  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  (void) update_latest_epoch(dpp, y, impl.get(), info.get_id(), info.get_epoch());
+  return 0;
+}
+
+int RadosConfigStore::read_period(const DoutPrefixProvider* dpp,
+                                  optional_yield y,
+                                  std::string_view period_id,
+                                  std::optional<uint32_t> epoch,
+                                  RGWPeriod& info)
+{
+  int r = 0;
+  if (!epoch) {
+    epoch = 0;
+    r = read_latest_epoch(dpp, y, impl.get(), period_id, *epoch, nullptr);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  const auto& pool = impl->period_pool;
+  const auto info_oid = period_oid(period_id, *epoch);
+  return impl->read(dpp, y, pool, info_oid, info, nullptr);
+}
+
// Delete every epoch object of the period (0 through latest_epoch,
// tolerating gaps) and finally the latest_epoch marker itself, removed
// under the version observed up front so a concurrent epoch bump makes
// the final delete fail rather than lose the new epoch.
int RadosConfigStore::delete_period(const DoutPrefixProvider* dpp,
                                    optional_yield y,
                                    std::string_view period_id)
{
  const auto& pool = impl->period_pool;

  // read the latest_epoch
  uint32_t latest_epoch = 0;
  RGWObjVersionTracker latest_objv;
  int r = read_latest_epoch(dpp, y, impl.get(), period_id,
                            latest_epoch, &latest_objv);
  if (r < 0 && r != -ENOENT) { // just delete epoch=0 on ENOENT
    ldpp_dout(dpp, 0) << "failed to read latest epoch for period "
        << period_id << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  for (uint32_t epoch = 0; epoch <= latest_epoch; epoch++) {
    const auto info_oid = period_oid(period_id, epoch);
    r = impl->remove(dpp, y, pool, info_oid, nullptr);
    if (r < 0 && r != -ENOENT) { // ignore ENOENT
      ldpp_dout(dpp, 0) << "failed to delete period " << info_oid
          << ": " << cpp_strerror(r) << dendl;
      return r;
    }
  }

  return delete_latest_epoch(dpp, y, impl.get(), period_id, &latest_objv);
}
+
// List the ids of all periods. Only the per-period
// "periods.<id>.latest_epoch" marker objects are matched, so each
// period is reported exactly once regardless of how many epoch objects
// it has; the prefix and suffix are trimmed to recover the bare id.
int RadosConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
                                      optional_yield y,
                                      const std::string& marker,
                                      std::span<std::string> entries,
                                      sal::ListResult<std::string>& result)
{
  const auto& pool = impl->period_pool;
  // returning an empty string tells ConfigImpl::list() to skip the oid
  constexpr auto prefix = [] (std::string oid) -> std::string {
      if (!oid.starts_with(period_info_oid_prefix)) {
        return {};
      }
      if (!oid.ends_with(period_latest_epoch_info_oid)) {
        return {};
      }
      // trim the prefix and suffix
      const std::size_t count = oid.size() -
          period_info_oid_prefix.size() -
          period_latest_epoch_info_oid.size();
      return oid.substr(period_info_oid_prefix.size(), count);
    };

  return impl->list(dpp, y, pool, marker, prefix, entries, result);
}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/period_config.cc b/src/rgw/driver/rados/config/period_config.cc
new file mode 100644 (file)
index 0000000..ec984eb
--- /dev/null
@@ -0,0 +1,55 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
// period config oids
// per-realm config objects are named "period_config.<realm_id>", with
// "period_config.default" standing in for an empty realm id
constexpr std::string_view period_config_prefix = "period_config.";
constexpr std::string_view period_config_realm_default = "default";
+
+std::string period_config_oid(std::string_view realm_id)
+{
+  if (realm_id.empty()) {
+    realm_id = period_config_realm_default;
+  }
+  return string_cat_reserve(period_config_prefix, realm_id);
+}
+
+int RadosConfigStore::read_period_config(const DoutPrefixProvider* dpp,
+                                         optional_yield y,
+                                         std::string_view realm_id,
+                                         RGWPeriodConfig& info)
+{
+  const auto& pool = impl->period_pool;
+  const auto oid = period_config_oid(realm_id);
+  return impl->read(dpp, y, pool, oid, info, nullptr);
+}
+
+int RadosConfigStore::write_period_config(const DoutPrefixProvider* dpp,
+                                          optional_yield y, bool exclusive,
+                                          std::string_view realm_id,
+                                          const RGWPeriodConfig& info)
+{
+  const auto& pool = impl->period_pool;
+  const auto oid = period_config_oid(realm_id);
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+  return impl->write(dpp, y, pool, oid, create, info, nullptr);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/realm.cc b/src/rgw/driver/rados/config/realm.cc
new file mode 100644 (file)
index 0000000..331e0ff
--- /dev/null
@@ -0,0 +1,364 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
// realm oids
// realm info lives at "realms.<id>", the name->id index at
// "realms_names.<name>", the watch/notify control object at
// "realms.<id>.control", and the default-realm pointer at
// "default.realm" (conf-overridable)
constexpr std::string_view realm_names_oid_prefix = "realms_names.";
constexpr std::string_view realm_info_oid_prefix = "realms.";
constexpr std::string_view realm_control_oid_suffix = ".control";
constexpr std::string_view default_realm_info_oid = "default.realm";
+
// object holding the realm's info, keyed by realm id
static std::string realm_info_oid(std::string_view realm_id)
{
  return string_cat_reserve(realm_info_oid_prefix, realm_id);
}
// name->id index entry; despite the parameter name, callers pass the
// realm's *name* here
static std::string realm_name_oid(std::string_view realm_id)
{
  return string_cat_reserve(realm_names_oid_prefix, realm_id);
}
// control object used for realm watch/notify
static std::string realm_control_oid(std::string_view realm_id)
{
  return string_cat_reserve(realm_info_oid_prefix, realm_id,
                            realm_control_oid_suffix);
}
// default-realm pointer object, overridable via rgw_default_realm_info_oid
static std::string default_realm_oid(const ceph::common::ConfigProxy& conf)
{
  return std::string{name_or_default(conf->rgw_default_realm_info_oid,
                                     default_realm_info_oid)};
}
+
+
+int RadosConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
+                                             optional_yield y, bool exclusive,
+                                             std::string_view realm_id)
+{
+  const auto& pool = impl->realm_pool;
+  const auto oid = default_realm_oid(dpp->get_cct()->_conf);
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+  RGWDefaultSystemMetaObjInfo default_info;
+  default_info.default_id = realm_id;
+
+  return impl->write(dpp, y, pool, oid, create, default_info, nullptr);
+}
+
+int RadosConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            std::string& realm_id)
+{
+  const auto& pool = impl->realm_pool;
+  const auto oid = default_realm_oid(dpp->get_cct()->_conf);
+
+  RGWDefaultSystemMetaObjInfo default_info;
+  int r = impl->read(dpp, y, pool, oid, default_info, nullptr);
+  if (r >= 0) {
+    realm_id = default_info.default_id;
+  }
+  return r;
+}
+
+int RadosConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
+                                              optional_yield y)
+{
+  const auto& pool = impl->realm_pool;
+  const auto oid = default_realm_oid(dpp->get_cct()->_conf);
+
+  return impl->remove(dpp, y, pool, oid, nullptr);
+}
+
+
// sal::RealmWriter bound to one realm object that was read or created
// under a version tracker, so conflicting concurrent writes are
// rejected by rados instead of silently overwriting.
class RadosRealmWriter : public sal::RealmWriter {
  ConfigImpl* impl;
  RGWObjVersionTracker objv; // version observed at read/create time
  std::string realm_id;      // identity of the bound realm; may not change
  std::string realm_name;    // current name; updated only via rename()
 public:
  RadosRealmWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
                   std::string_view realm_id, std::string_view realm_name)
    : impl(impl), objv(std::move(objv)),
      realm_id(realm_id), realm_name(realm_name)
  {
  }

  // Overwrite the bound realm's info; the id and name must match the
  // ones this writer was created with.
  int write(const DoutPrefixProvider* dpp, optional_yield y,
            const RGWRealm& info) override
  {
    if (realm_id != info.get_id() || realm_name != info.get_name()) {
      return -EINVAL; // can't modify realm id or name directly
    }

    const auto& pool = impl->realm_pool;
    const auto info_oid = realm_info_oid(info.get_id());
    return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
  }

  // Rename the realm: link the new name index entry first, then update
  // the info object, then unlink the old name. The new link is rolled
  // back if the info update fails; the old-name unlink is best-effort.
  int rename(const DoutPrefixProvider* dpp, optional_yield y,
             RGWRealm& info, std::string_view new_name) override
  {
    if (realm_id != info.get_id() || realm_name != info.get_name()) {
      return -EINVAL; // can't modify realm id or name directly
    }
    if (new_name.empty()) {
      ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
      return -EINVAL;
    }

    const auto& pool = impl->realm_pool;
    const auto name = RGWNameToId{info.get_id()};
    const auto info_oid = realm_info_oid(info.get_id());
    const auto old_oid = realm_name_oid(info.get_name());
    const auto new_oid = realm_name_oid(new_name);

    // link the new name
    RGWObjVersionTracker new_objv;
    new_objv.generate_new_write_ver(dpp->get_cct());
    int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
                        name, &new_objv);
    if (r < 0) {
      return r;
    }

    // write the info with updated name
    info.set_name(std::string{new_name});
    r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
    if (r < 0) {
      // on failure, unlink the new name
      (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
      return r;
    }

    // unlink the old name
    (void) impl->remove(dpp, y, pool, old_oid, nullptr);

    realm_name = new_name;
    return 0;
  }

  // Delete the realm info (version-guarded), then best-effort delete
  // its name index entry and watch/notify control object.
  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
  {
    const auto& pool = impl->realm_pool;
    const auto info_oid = realm_info_oid(realm_id);
    int r = impl->remove(dpp, y, pool, info_oid, &objv);
    if (r < 0) {
      return r;
    }
    const auto name_oid = realm_name_oid(realm_name);
    (void) impl->remove(dpp, y, pool, name_oid, nullptr);
    const auto control_oid = realm_control_oid(realm_id);
    (void) impl->remove(dpp, y, pool, control_oid, nullptr);
    return 0;
  }
}; // RadosRealmWriter
+
+
// Create a realm: write the info object, its name->id index entry, and
// the watch/notify control object, in that order. Each later failure
// rolls back the earlier writes so no partial realm is left behind. On
// success, optionally returns a version-guarded writer for the realm.
int RadosConfigStore::create_realm(const DoutPrefixProvider* dpp,
                                   optional_yield y, bool exclusive,
                                   const RGWRealm& info,
                                   std::unique_ptr<sal::RealmWriter>* writer)
{
  if (info.get_id().empty()) {
    ldpp_dout(dpp, 0) << "realm cannot have an empty id" << dendl;
    return -EINVAL;
  }
  if (info.get_name().empty()) {
    ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
    return -EINVAL;
  }

  const auto& pool = impl->realm_pool;
  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;

  // write the realm info
  const auto info_oid = realm_info_oid(info.get_id());
  RGWObjVersionTracker objv;
  objv.generate_new_write_ver(dpp->get_cct());

  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
  if (r < 0) {
    return r;
  }

  // write the realm name
  const auto name_oid = realm_name_oid(info.get_name());
  const auto name = RGWNameToId{info.get_id()};
  RGWObjVersionTracker name_objv;
  name_objv.generate_new_write_ver(dpp->get_cct());

  r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
  if (r < 0) {
    // roll back the info object
    (void) impl->remove(dpp, y, pool, info_oid, &objv);
    return r;
  }

  // create control object for watch/notify
  const auto control_oid = realm_control_oid(info.get_id());
  bufferlist empty_bl;
  r = impl->write(dpp, y, pool, control_oid, Create::MayExist,
                  empty_bl, nullptr);
  if (r < 0) {
    // roll back the name and info objects
    (void) impl->remove(dpp, y, pool, name_oid, &name_objv);
    (void) impl->remove(dpp, y, pool, info_oid, &objv);
    return r;
  }

  if (writer) {
    *writer = std::make_unique<RadosRealmWriter>(
        impl.get(), std::move(objv), info.get_id(), info.get_name());
  }
  return 0;
}
+
+int RadosConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
+                                       optional_yield y,
+                                       std::string_view realm_id,
+                                       RGWRealm& info,
+                                       std::unique_ptr<sal::RealmWriter>* writer)
+{
+  const auto& pool = impl->realm_pool;
+  const auto info_oid = realm_info_oid(realm_id);
+  RGWObjVersionTracker objv;
+  int r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
+                                         optional_yield y,
+                                         std::string_view realm_name,
+                                         RGWRealm& info,
+                                         std::unique_ptr<sal::RealmWriter>* writer)
+{
+  const auto& pool = impl->realm_pool;
+
+  // look up realm id by name
+  RGWNameToId name;
+  const auto name_oid = realm_name_oid(realm_name);
+  int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  const auto info_oid = realm_info_oid(name.obj_id);
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
+                                         optional_yield y,
+                                         RGWRealm& info,
+                                         std::unique_ptr<sal::RealmWriter>* writer)
+{
+  const auto& pool = impl->realm_pool;
+
+  // read default realm id
+  RGWDefaultSystemMetaObjInfo default_info;
+  const auto default_oid = default_realm_oid(dpp->get_cct()->_conf);
+  int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  const auto info_oid = realm_info_oid(default_info.default_id);
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y,
+                                    std::string_view realm_name,
+                                    std::string& realm_id)
+{
+  const auto& pool = impl->realm_pool;
+  RGWNameToId name;
+
+  // look up realm id by name
+  const auto name_oid = realm_name_oid(realm_name);
+  int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
+  if (r < 0) {
+    return r;
+  }
+  realm_id = std::move(name.obj_id);
+  return 0;
+}
+
+int RadosConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
+                                              optional_yield y,
+                                              const RGWPeriod& period)
+{
+  const auto& pool = impl->realm_pool;
+  const auto control_oid = realm_control_oid(period.get_realm());
+
+  bufferlist bl;
+  using ceph::encode;
+  // push the period to dependent zonegroups/zones
+  encode(RGWRealmNotify::ZonesNeedPeriod, bl);
+  encode(period, bl);
+  // reload the gateway with the new period
+  encode(RGWRealmNotify::Reload, bl);
+
+  constexpr uint64_t timeout_ms = 0;
+  return impl->notify(dpp, y, pool, control_oid, bl, timeout_ms);
+}
+
+int RadosConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
+                                       optional_yield y,
+                                       const std::string& marker,
+                                       std::span<std::string> entries,
+                                       sal::ListResult<std::string>& result)
+{
+  const auto& pool = impl->realm_pool;
+  constexpr auto prefix = [] (std::string oid) -> std::string {
+      if (!oid.starts_with(realm_names_oid_prefix)) {
+        return {};
+      }
+      return oid.substr(realm_names_oid_prefix.size());
+    };
+  return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/store.cc b/src/rgw/driver/rados/config/store.cc
new file mode 100644 (file)
index 0000000..ec2b034
--- /dev/null
@@ -0,0 +1,52 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "impl.h"
+#include "store.h"
+
+namespace rgw::rados {
+
// Takes ownership of a ConfigImpl prepared by create_config_store().
RadosConfigStore::RadosConfigStore(std::unique_ptr<ConfigImpl> impl)
  : impl(std::move(impl))
{
}

// defined out-of-line so the unique_ptr<ConfigImpl> destructor is
// instantiated here, where ConfigImpl is a complete type
RadosConfigStore::~RadosConfigStore() = default;
+
+
+auto create_config_store(const DoutPrefixProvider* dpp)
+    -> std::unique_ptr<RadosConfigStore>
+{
+  auto impl = std::make_unique<ConfigImpl>(dpp->get_cct()->_conf);
+
+  // initialize a Rados client
+  int r = impl->rados.init_with_context(dpp->get_cct());
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "Rados client initialization failed with "
+        << cpp_strerror(-r) << dendl;
+    return nullptr;
+  }
+  r = impl->rados.connect();
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "Rados client connection failed with "
+        << cpp_strerror(-r) << dendl;
+    return nullptr;
+  }
+
+  return std::make_unique<RadosConfigStore>(std::move(impl));
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/store.h b/src/rgw/driver/rados/config/store.h
new file mode 100644 (file)
index 0000000..1b93a80
--- /dev/null
@@ -0,0 +1,182 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <string>
+#include "rgw_common.h"
+#include "rgw_sal_config.h"
+
+class DoutPrefixProvider;
+class optional_yield;
+
+namespace rgw::rados {
+
+struct ConfigImpl;
+
+class RadosConfigStore : public sal::ConfigStore {
+ public:
+  explicit RadosConfigStore(std::unique_ptr<ConfigImpl> impl);
+  virtual ~RadosConfigStore() override;
+
+  // Realm
+  virtual int write_default_realm_id(const DoutPrefixProvider* dpp,
+                                     optional_yield y, bool exclusive,
+                                     std::string_view realm_id) override;
+  virtual int read_default_realm_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y,
+                                    std::string& realm_id) override;
+  virtual int delete_default_realm_id(const DoutPrefixProvider* dpp,
+                                      optional_yield y) override;
+
+  virtual int create_realm(const DoutPrefixProvider* dpp,
+                           optional_yield y, bool exclusive,
+                           const RGWRealm& info,
+                           std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_realm_by_id(const DoutPrefixProvider* dpp,
+                               optional_yield y,
+                               std::string_view realm_id,
+                               RGWRealm& info,
+                               std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_realm_by_name(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 std::string_view realm_name,
+                                 RGWRealm& info,
+                                 std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_default_realm(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 RGWRealm& info,
+                                 std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_realm_id(const DoutPrefixProvider* dpp,
+                            optional_yield y, std::string_view realm_name,
+                            std::string& realm_id) override;
+  virtual int realm_notify_new_period(const DoutPrefixProvider* dpp,
+                                      optional_yield y,
+                                      const RGWPeriod& period) override;
+  virtual int list_realm_names(const DoutPrefixProvider* dpp,
+                               optional_yield y, const std::string& marker,
+                               std::span<std::string> entries,
+                               sal::ListResult<std::string>& result) override;
+
+  // Period
+  virtual int create_period(const DoutPrefixProvider* dpp,
+                            optional_yield y, bool exclusive,
+                            const RGWPeriod& info) override;
+  virtual int read_period(const DoutPrefixProvider* dpp,
+                          optional_yield y, std::string_view period_id,
+                          std::optional<uint32_t> epoch, RGWPeriod& info) override;
+  virtual int delete_period(const DoutPrefixProvider* dpp,
+                            optional_yield y,
+                            std::string_view period_id) override;
+  virtual int list_period_ids(const DoutPrefixProvider* dpp,
+                              optional_yield y, const std::string& marker,
+                              std::span<std::string> entries,
+                              sal::ListResult<std::string>& result) override;
+
+  // ZoneGroup
+  virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                         optional_yield y, bool exclusive,
+                                         std::string_view realm_id,
+                                         std::string_view zonegroup_id) override;
+  virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                        optional_yield y,
+                                        std::string_view realm_id,
+                                        std::string& zonegroup_id) override;
+  virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                          optional_yield y,
+                                          std::string_view realm_id) override;
+
+  virtual int create_zonegroup(const DoutPrefixProvider* dpp,
+                               optional_yield y, bool exclusive,
+                               const RGWZoneGroup& info,
+                               std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+                                   optional_yield y,
+                                   std::string_view zonegroup_id,
+                                   RGWZoneGroup& info,
+                                   std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view zonegroup_name,
+                                     RGWZoneGroup& info,
+                                     std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int read_default_zonegroup(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view realm_id,
+                                     RGWZoneGroup& info,
+                                     std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int list_zonegroup_names(const DoutPrefixProvider* dpp,
+                                   optional_yield y, const std::string& marker,
+                                   std::span<std::string> entries,
+                                   sal::ListResult<std::string>& result) override;
+
+  // Zone
+  virtual int write_default_zone_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y, bool exclusive,
+                                    std::string_view realm_id,
+                                    std::string_view zone_id) override;
+  virtual int read_default_zone_id(const DoutPrefixProvider* dpp,
+                                   optional_yield y,
+                                   std::string_view realm_id,
+                                   std::string& zone_id) override;
+  virtual int delete_default_zone_id(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view realm_id) override;
+
+  virtual int create_zone(const DoutPrefixProvider* dpp,
+                          optional_yield y, bool exclusive,
+                          const RGWZoneParams& info,
+                          std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int read_zone_by_id(const DoutPrefixProvider* dpp,
+                              optional_yield y,
+                              std::string_view zone_id,
+                              RGWZoneParams& info,
+                              std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int read_zone_by_name(const DoutPrefixProvider* dpp,
+                                optional_yield y,
+                                std::string_view zone_name,
+                                RGWZoneParams& info,
+                                std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int read_default_zone(const DoutPrefixProvider* dpp,
+                                optional_yield y,
+                                std::string_view realm_id,
+                                RGWZoneParams& info,
+                                std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int list_zone_names(const DoutPrefixProvider* dpp,
+                              optional_yield y, const std::string& marker,
+                              std::span<std::string> entries,
+                              sal::ListResult<std::string>& result) override;
+
+  // PeriodConfig
+  virtual int read_period_config(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 std::string_view realm_id,
+                                 RGWPeriodConfig& info) override;
+  virtual int write_period_config(const DoutPrefixProvider* dpp,
+                                  optional_yield y, bool exclusive,
+                                  std::string_view realm_id,
+                                  const RGWPeriodConfig& info) override;
+
+ private:
+  std::unique_ptr<ConfigImpl> impl;
+}; // RadosConfigStore
+
+
+/// RadosConfigStore factory function
+auto create_config_store(const DoutPrefixProvider* dpp)
+    -> std::unique_ptr<RadosConfigStore>;
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/zone.cc b/src/rgw/driver/rados/config/zone.cc
new file mode 100644 (file)
index 0000000..e06c160
--- /dev/null
@@ -0,0 +1,312 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// zone oids
+constexpr std::string_view zone_info_oid_prefix = "zone_info.";
+constexpr std::string_view zone_names_oid_prefix = "zone_names.";
+
+std::string zone_info_oid(std::string_view zone_id)
+{
+  return string_cat_reserve(zone_info_oid_prefix, zone_id);
+}
+std::string zone_name_oid(std::string_view zone_id)
+{
+  return string_cat_reserve(zone_names_oid_prefix, zone_id);
+}
+std::string default_zone_oid(const ceph::common::ConfigProxy& conf,
+                             std::string_view realm_id)
+{
+  return fmt::format("{}.{}", conf->rgw_default_zone_info_oid, realm_id);
+}
+
+
+int RadosConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            bool exclusive,
+                                            std::string_view realm_id,
+                                            std::string_view zone_id)
+{
+  const auto& pool = impl->zone_pool;
+  const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+  RGWDefaultSystemMetaObjInfo default_info;
+  default_info.default_id = zone_id;
+
+  return impl->write(dpp, y, pool, default_oid, create, default_info, nullptr);
+}
+
+int RadosConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp,
+                                           optional_yield y,
+                                           std::string_view realm_id,
+                                           std::string& zone_id)
+{
+  const auto& pool = impl->zone_pool;
+  const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
+
+  RGWDefaultSystemMetaObjInfo default_info;
+  int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
+  if (r >= 0) {
+    zone_id = default_info.default_id;
+  }
+  return r;
+}
+
+int RadosConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string_view realm_id)
+{
+  const auto& pool = impl->zone_pool;
+  const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
+
+  return impl->remove(dpp, y, pool, default_oid, nullptr);
+}
+
+
+class RadosZoneWriter : public sal::ZoneWriter {
+  ConfigImpl* impl;
+  RGWObjVersionTracker objv;
+  std::string zone_id;
+  std::string zone_name;
+ public:
+  RadosZoneWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
+                  std::string_view zone_id, std::string_view zone_name)
+      : impl(impl), objv(std::move(objv)),
+        zone_id(zone_id), zone_name(zone_name)
+  {
+  }
+
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWZoneParams& info) override
+  {
+    if (zone_id != info.get_id() || zone_name != info.get_name()) {
+      return -EINVAL; // can't modify zone id or name directly
+    }
+
+    const auto& pool = impl->zone_pool;
+    const auto info_oid = zone_info_oid(info.get_id());
+    return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+  }
+
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWZoneParams& info, std::string_view new_name) override
+  {
+    if (zone_id != info.get_id() || zone_name != info.get_name()) {
+      return -EINVAL; // can't modify zone id or name directly
+    }
+    if (new_name.empty()) {
+      ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
+      return -EINVAL;
+    }
+
+    const auto& pool = impl->zone_pool;
+    const auto name = RGWNameToId{info.get_id()};
+    const auto info_oid = zone_info_oid(info.get_id());
+    const auto old_oid = zone_name_oid(info.get_name());
+    const auto new_oid = zone_name_oid(new_name);
+
+    // link the new name
+    RGWObjVersionTracker new_objv;
+    new_objv.generate_new_write_ver(dpp->get_cct());
+    int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
+                        name, &new_objv);
+    if (r < 0) {
+      return r;
+    }
+
+    // write the info with updated name
+    info.set_name(std::string{new_name});
+    r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+    if (r < 0) {
+      // on failure, unlink the new name
+      (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
+      return r;
+    }
+
+    // unlink the old name
+    (void) impl->remove(dpp, y, pool, old_oid, nullptr);
+
+    zone_name = new_name;
+    return 0;
+  }
+
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    const auto& pool = impl->zone_pool;
+    const auto info_oid = zone_info_oid(zone_id);
+    int r = impl->remove(dpp, y, pool, info_oid, &objv);
+    if (r < 0) {
+      return r;
+    }
+    const auto name_oid = zone_name_oid(zone_name);
+    (void) impl->remove(dpp, y, pool, name_oid, nullptr);
+    return 0;
+  }
+}; // RadosZoneWriter
+
+
+int RadosConfigStore::create_zone(const DoutPrefixProvider* dpp,
+                                  optional_yield y, bool exclusive,
+                                  const RGWZoneParams& info,
+                                  std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  if (info.get_id().empty()) {
+    ldpp_dout(dpp, 0) << "zone cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.get_name().empty()) {
+    ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
+    return -EINVAL;
+  }
+
+  const auto& pool = impl->zone_pool;
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+  // write the zone info
+  const auto info_oid = zone_info_oid(info.get_id());
+  RGWObjVersionTracker objv;
+  objv.generate_new_write_ver(dpp->get_cct());
+
+  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // write the zone name
+  const auto name_oid = zone_name_oid(info.get_name());
+  const auto name = RGWNameToId{info.get_id()};
+  RGWObjVersionTracker name_objv;
+  name_objv.generate_new_write_ver(dpp->get_cct());
+
+  r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
+  if (r < 0) {
+    (void) impl->remove(dpp, y, pool, info_oid, &objv);
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosZoneWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp,
+                                      optional_yield y,
+                                      std::string_view zone_id,
+                                      RGWZoneParams& info,
+                                      std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  const auto& pool = impl->zone_pool;
+  const auto info_oid = zone_info_oid(zone_id);
+  RGWObjVersionTracker objv;
+
+  int r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosZoneWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp,
+                                        optional_yield y,
+                                        std::string_view zone_name,
+                                        RGWZoneParams& info,
+                                        std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  const auto& pool = impl->zone_pool;
+
+  // look up zone id by name
+  const auto name_oid = zone_name_oid(zone_name);
+  RGWNameToId name;
+  int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  const auto info_oid = zone_info_oid(name.obj_id);
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosZoneWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_default_zone(const DoutPrefixProvider* dpp,
+                                        optional_yield y,
+                                        std::string_view realm_id,
+                                        RGWZoneParams& info,
+                                        std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  const auto& pool = impl->zone_pool;
+
+  // read default zone id
+  const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
+  RGWDefaultSystemMetaObjInfo default_info;
+  int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  const auto info_oid = zone_info_oid(default_info.default_id);
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosZoneWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::list_zone_names(const DoutPrefixProvider* dpp,
+                                      optional_yield y,
+                                      const std::string& marker,
+                                      std::span<std::string> entries,
+                                      sal::ListResult<std::string>& result)
+{
+  const auto& pool = impl->zone_pool;
+  constexpr auto prefix = [] (std::string oid) -> std::string {
+      if (!oid.starts_with(zone_names_oid_prefix)) {
+        return {};
+      }
+      return oid.substr(zone_names_oid_prefix.size());
+    };
+  return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/zonegroup.cc b/src/rgw/driver/rados/config/zonegroup.cc
new file mode 100644 (file)
index 0000000..1766a68
--- /dev/null
@@ -0,0 +1,315 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// zonegroup oids
+constexpr std::string_view zonegroup_names_oid_prefix = "zonegroups_names.";
+constexpr std::string_view zonegroup_info_oid_prefix = "zonegroup_info.";
+constexpr std::string_view default_zonegroup_info_oid = "default.zonegroup";
+
+static std::string zonegroup_info_oid(std::string_view zonegroup_id)
+{
+  return string_cat_reserve(zonegroup_info_oid_prefix, zonegroup_id);
+}
+static std::string zonegroup_name_oid(std::string_view zonegroup_id)
+{
+  return string_cat_reserve(zonegroup_names_oid_prefix, zonegroup_id);
+}
+static std::string default_zonegroup_oid(const ceph::common::ConfigProxy& conf,
+                                         std::string_view realm_id)
+{
+  const auto prefix = name_or_default(conf->rgw_default_zonegroup_info_oid,
+                                      default_zonegroup_info_oid);
+  return fmt::format("{}.{}", prefix, realm_id);
+}
+
+
+int RadosConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                 optional_yield y,
+                                                 bool exclusive,
+                                                 std::string_view realm_id,
+                                                 std::string_view zonegroup_id)
+{
+  const auto& pool = impl->zonegroup_pool;
+  const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+  RGWDefaultSystemMetaObjInfo default_info;
+  default_info.default_id = zonegroup_id;
+
+  return impl->write(dpp, y, pool, oid, create, default_info, nullptr);
+}
+
+int RadosConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                optional_yield y,
+                                                std::string_view realm_id,
+                                                std::string& zonegroup_id)
+{
+  const auto& pool = impl->zonegroup_pool;
+  const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+
+  RGWDefaultSystemMetaObjInfo default_info;
+  int r = impl->read(dpp, y, pool, oid, default_info, nullptr);
+  if (r >= 0) {
+    zonegroup_id = default_info.default_id;
+  }
+  return r;
+}
+
+int RadosConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                  optional_yield y,
+                                                  std::string_view realm_id)
+{
+  const auto& pool = impl->zonegroup_pool;
+  const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+  return impl->remove(dpp, y, pool, oid, nullptr);
+}
+
+
+class RadosZoneGroupWriter : public sal::ZoneGroupWriter {
+  ConfigImpl* impl;
+  RGWObjVersionTracker objv;
+  std::string zonegroup_id;
+  std::string zonegroup_name;
+ public:
+  RadosZoneGroupWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
+                       std::string_view zonegroup_id,
+                       std::string_view zonegroup_name)
+    : impl(impl), objv(std::move(objv)),
+      zonegroup_id(zonegroup_id), zonegroup_name(zonegroup_name)
+  {
+  }
+
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWZoneGroup& info) override
+  {
+    if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
+      return -EINVAL; // can't modify zonegroup id or name directly
+    }
+
+    const auto& pool = impl->zonegroup_pool;
+    const auto info_oid = zonegroup_info_oid(info.get_id());
+    return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+  }
+
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWZoneGroup& info, std::string_view new_name) override
+  {
+    if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
+      return -EINVAL; // can't modify zonegroup id or name directly
+    }
+    if (new_name.empty()) {
+      ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+      return -EINVAL;
+    }
+
+    const auto& pool = impl->zonegroup_pool;
+    const auto name = RGWNameToId{info.get_id()};
+    const auto info_oid = zonegroup_info_oid(info.get_id());
+    const auto old_oid = zonegroup_name_oid(info.get_name());
+    const auto new_oid = zonegroup_name_oid(new_name);
+
+    // link the new name
+    RGWObjVersionTracker new_objv;
+    new_objv.generate_new_write_ver(dpp->get_cct());
+    int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
+                        name, &new_objv);
+    if (r < 0) {
+      return r;
+    }
+
+    // write the info with updated name
+    info.set_name(std::string{new_name});
+    r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+    if (r < 0) {
+      // on failure, unlink the new name
+      (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
+      return r;
+    }
+
+    // unlink the old name
+    (void) impl->remove(dpp, y, pool, old_oid, nullptr);
+
+    zonegroup_name = new_name;
+    return 0;
+  }
+
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    const auto& pool = impl->zonegroup_pool;
+    const auto info_oid = zonegroup_info_oid(zonegroup_id);
+    int r = impl->remove(dpp, y, pool, info_oid, &objv);
+    if (r < 0) {
+      return r;
+    }
+    const auto name_oid = zonegroup_name_oid(zonegroup_name);
+    (void) impl->remove(dpp, y, pool, name_oid, nullptr);
+    return 0;
+  }
+}; // RadosZoneGroupWriter
+
+
+int RadosConfigStore::create_zonegroup(const DoutPrefixProvider* dpp,
+                                       optional_yield y, bool exclusive,
+                                       const RGWZoneGroup& info,
+                                       std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  if (info.get_id().empty()) {
+    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.get_name().empty()) {
+    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+    return -EINVAL;
+  }
+
+  const auto& pool = impl->zonegroup_pool;
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+  // write the zonegroup info
+  const auto info_oid = zonegroup_info_oid(info.get_id());
+  RGWObjVersionTracker objv;
+  objv.generate_new_write_ver(dpp->get_cct());
+
+  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // write the zonegroup name
+  const auto name_oid = zonegroup_name_oid(info.get_name());
+  const auto name = RGWNameToId{info.get_id()};
+  RGWObjVersionTracker name_objv;
+  name_objv.generate_new_write_ver(dpp->get_cct());
+
+  r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
+  if (r < 0) {
+    (void) impl->remove(dpp, y, pool, info_oid, &objv);
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosZoneGroupWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+                                           optional_yield y,
+                                           std::string_view zonegroup_id,
+                                           RGWZoneGroup& info,
+                                           std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  const auto& pool = impl->zonegroup_pool;
+  const auto info_oid = zonegroup_info_oid(zonegroup_id);
+  RGWObjVersionTracker objv;
+
+  int r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosZoneGroupWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string_view zonegroup_name,
+                                             RGWZoneGroup& info,
+                                             std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  const auto& pool = impl->zonegroup_pool;
+
+  // look up zonegroup id by name
+  RGWNameToId name;
+  const auto name_oid = zonegroup_name_oid(zonegroup_name);
+  int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  const auto info_oid = zonegroup_info_oid(name.obj_id);
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosZoneGroupWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string_view realm_id,
+                                             RGWZoneGroup& info,
+                                             std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  const auto& pool = impl->zonegroup_pool;
+
+  // read default zonegroup id
+  RGWDefaultSystemMetaObjInfo default_info;
+  const auto default_oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+  int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  const auto info_oid = zonegroup_info_oid(default_info.default_id);
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, info_oid, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosZoneGroupWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp,
+                                           optional_yield y,
+                                           const std::string& marker,
+                                           std::span<std::string> entries,
+                                           sal::ListResult<std::string>& result)
+{
+  const auto& pool = impl->zonegroup_pool;
+  constexpr auto prefix = [] (std::string oid) -> std::string {
+      if (!oid.starts_with(zonegroup_names_oid_prefix)) {
+        return {};
+      }
+      return oid.substr(zonegroup_names_oid_prefix.size());
+    };
+  return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/rgw_bucket.cc b/src/rgw/driver/rados/rgw_bucket.cc
new file mode 100644 (file)
index 0000000..7f600fe
--- /dev/null
@@ -0,0 +1,2971 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_acl_s3.h"
+#include "rgw_tag_s3.h"
+
+#include "rgw_bucket.h"
+#include "rgw_op.h"
+#include "rgw_bucket_sync.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_bucket.h"
+#include "services/svc_user.h"
+
+#include "rgw_reshard.h"
+
+// stolen from src/cls/version/cls_version.cc
+#define VERSION_ATTR "ceph.objclass.version"
+
+#include "cls/user/cls_user_types.h"
+
+#include "rgw_sal_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+// seconds for timeout during RGWBucket::check_object_index
+constexpr uint64_t BUCKET_TAG_QUICK_TIMEOUT = 30;
+
+using namespace std;
+
+// default number of entries to list with each bucket listing call
+// (use marker to bridge between calls)
+static constexpr size_t listing_max_entries = 1000;
+
+/*
+ * The tenant_name is always returned on purpose. May be empty, of course.
+ */
+static void parse_bucket(const string& bucket,
+                         string *tenant_name,
+                         string *bucket_name,
+                         string *bucket_instance = nullptr /* optional */)
+{
+  /*
+   * expected format: [tenant/]bucket:bucket_instance
+   */
+  int pos = bucket.find('/');
+  if (pos >= 0) {
+    *tenant_name = bucket.substr(0, pos);
+  } else {
+    tenant_name->clear();
+  }
+  string bn = bucket.substr(pos + 1);
+  pos = bn.find (':');
+  if (pos < 0) {
+    *bucket_name = std::move(bn);
+    return;
+  }
+  *bucket_name = bn.substr(0, pos);
+  if (bucket_instance) {
+    *bucket_instance = bn.substr(pos + 1);
+  }
+
+  /*
+   * deal with the possible tenant:bucket:bucket_instance case
+   */
+  if (tenant_name->empty()) {
+    pos = bucket_instance->find(':');
+    if (pos >= 0) {
+      *tenant_name = *bucket_name;
+      *bucket_name = bucket_instance->substr(0, pos);
+      *bucket_instance = bucket_instance->substr(pos + 1);
+    }
+  }
+}
+
+// Emit one "object" entry per index key slated for unlinking.
+static void dump_mulipart_index_results(list<rgw_obj_index_key>& objs_to_unlink,
+        Formatter *f)
+{
+  for (auto iter = objs_to_unlink.cbegin(); iter != objs_to_unlink.cend(); ++iter) {
+    f->dump_string("object", iter->name);
+  }
+}
+
+void check_bad_user_bucket_mapping(rgw::sal::Driver* driver, rgw::sal::User* user,
+                                  bool fix,
+                                  optional_yield y,
+                                   const DoutPrefixProvider *dpp)
+{
+  rgw::sal::BucketList user_buckets;
+  string marker;
+
+  CephContext *cct = driver->ctx();
+
+  size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
+
+  do {
+    int ret = user->list_buckets(dpp, marker, string(), max_entries, false, user_buckets, y);
+    if (ret < 0) {
+      ldout(driver->ctx(), 0) << "failed to read user buckets: "
+                            << cpp_strerror(-ret) << dendl;
+      return;
+    }
+
+    map<string, std::unique_ptr<rgw::sal::Bucket>>& buckets = user_buckets.get_buckets();
+    for (auto i = buckets.begin();
+         i != buckets.end();
+         ++i) {
+      marker = i->first;
+
+      auto& bucket = i->second;
+
+      std::unique_ptr<rgw::sal::Bucket> actual_bucket;
+      int r = driver->get_bucket(dpp, user, user->get_tenant(), bucket->get_name(), &actual_bucket, null_yield);
+      if (r < 0) {
+        ldout(driver->ctx(), 0) << "could not get bucket info for bucket=" << bucket << dendl;
+        continue;
+      }
+
+      if (actual_bucket->get_name().compare(bucket->get_name()) != 0 ||
+          actual_bucket->get_tenant().compare(bucket->get_tenant()) != 0 ||
+          actual_bucket->get_marker().compare(bucket->get_marker()) != 0 ||
+          actual_bucket->get_bucket_id().compare(bucket->get_bucket_id()) != 0) {
+        cout << "bucket info mismatch: expected " << actual_bucket << " got " << bucket << std::endl;
+        if (fix) {
+          cout << "fixing" << std::endl;
+         r = actual_bucket->chown(dpp, user, nullptr, null_yield);
+          if (r < 0) {
+            cerr << "failed to fix bucket: " << cpp_strerror(-r) << std::endl;
+          }
+        }
+      }
+    }
+  } while (user_buckets.is_truncated());
+}
+
+// returns true if entry is in the empty namespace. note: function
+// type conforms to type RGWBucketListNameFilter
+bool rgw_bucket_object_check_filter(const std::string& oid)
+{
+  rgw_obj_key parsed; // discarded; oid_to_key_in_ns() requires an out param
+  return rgw_obj_key::oid_to_key_in_ns(oid, &parsed, std::string());
+}
+
+// Delete a single object from 'bucket'.  An empty instance is mapped to
+// the explicit "null" instance so the null version is targeted.
+int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, rgw_obj_key& key)
+{
+  if (key.instance.empty()) {
+    key.instance = "null";
+  }
+
+  std::unique_ptr<rgw::sal::Object> object = bucket->get_object(key);
+
+  return object->delete_object(dpp, null_yield);
+}
+
+static void set_err_msg(std::string *sink, std::string msg)
+{
+  if (sink && !msg.empty())
+    *sink = msg;
+}
+
+// Bind this RGWBucket helper to a driver, resolve the bucket named in
+// op_state (accepting a "tenant/name" form), and load the acting user.
+// On success op_state receives a clone of the bucket plus the user's
+// display name.  Returns 0 or a negative error; *err_msg gets a
+// human-readable failure reason.
+int RGWBucket::init(rgw::sal::Driver* _driver, RGWBucketAdminOpState& op_state,
+                    optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+  if (!_driver) {
+    set_err_msg(err_msg, "no storage!");
+    return -EINVAL;
+  }
+
+  driver = _driver;
+
+  std::string bucket_name = op_state.get_bucket_name();
+
+  // need at least one of bucket name / user id to do anything useful
+  if (bucket_name.empty() && op_state.get_user_id().empty())
+    return -EINVAL;
+
+  user = driver->get_user(op_state.get_user_id());
+  std::string tenant = user->get_tenant();
+
+  // split possible tenant/name
+  auto pos = bucket_name.find('/');
+  if (pos != string::npos) {
+    tenant = bucket_name.substr(0, pos);
+    bucket_name = bucket_name.substr(pos + 1);
+  }
+
+  int r = driver->get_bucket(dpp, user.get(), tenant, bucket_name, &bucket, y);
+  if (r < 0) {
+      set_err_msg(err_msg, "failed to fetch bucket info for bucket=" + bucket_name);
+      return r;
+  }
+
+  op_state.set_bucket(bucket->clone());
+
+  // only hit the user store when a concrete user id was given
+  if (!rgw::sal::User::empty(user.get())) {
+    r = user->load_user(dpp, y);
+    if (r < 0) {
+      set_err_msg(err_msg, "failed to fetch user info");
+      return r;
+    }
+  }
+
+  op_state.display_name = user->get_display_name();
+
+  clear_failure();
+  return 0;
+}
+
+bool rgw_find_bucket_by_id(const DoutPrefixProvider *dpp, CephContext *cct, rgw::sal::Driver* driver,
+                           const string& marker, const string& bucket_id, rgw_bucket* bucket_out)
+{
+  void *handle = NULL;
+  bool truncated = false;
+  string s;
+
+  int ret = driver->meta_list_keys_init(dpp, "bucket.instance", marker, &handle);
+  if (ret < 0) {
+    cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+    driver->meta_list_keys_complete(handle);
+    return -ret;
+  }
+  do {
+      list<string> keys;
+      ret = driver->meta_list_keys_next(dpp, handle, 1000, keys, &truncated);
+      if (ret < 0) {
+        cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+        driver->meta_list_keys_complete(handle);
+        return -ret;
+      }
+      for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+        s = *iter;
+        ret = rgw_bucket_parse_bucket_key(cct, s, bucket_out, nullptr);
+        if (ret < 0) {
+          continue;
+        }
+        if (bucket_id == bucket_out->bucket_id) {
+          driver->meta_list_keys_complete(handle);
+          return true;
+        }
+      }
+  } while (truncated);
+  driver->meta_list_keys_complete(handle);
+  return false;
+}
+
+// Transfer ownership of the bucket's objects to this helper's user,
+// resuming from 'marker'.  NOTE(review): 'user' is passed as both the
+// new owner and the acting user — confirm against
+// rgw::sal::Bucket::chown's parameter meaning.
+int RGWBucket::chown(RGWBucketAdminOpState& op_state, const string& marker,
+                     optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+  int ret = bucket->chown(dpp, user.get(), user.get(), y, &marker);
+  if (ret < 0) {
+    set_err_msg(err_msg, "Failed to change object ownership: " + cpp_strerror(-ret));
+  }
+  
+  return ret;
+}
+
+// Apply the quota carried in op_state to the bucket and persist the
+// updated bucket instance info (non-exclusive write, epoch mtime).
+int RGWBucket::set_quota(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+  bucket = op_state.get_bucket()->clone();
+
+  bucket->get_info().quota = op_state.quota;
+  int r = bucket->put_info(dpp, false, real_time());
+  if (r < 0) {
+    set_err_msg(err_msg, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r));
+    return r;
+  }
+  return r;
+}
+
+// Remove the object named in op_state from the bucket held in op_state.
+int RGWBucket::remove_object(const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, std::string *err_msg)
+{
+  std::string object_name = op_state.get_object_name();
+
+  rgw_obj_key key(object_name);
+
+  bucket = op_state.get_bucket()->clone();
+
+  int ret = rgw_remove_object(dpp, driver, bucket.get(), key);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to remove object" + cpp_strerror(-ret));
+    return ret;
+  }
+
+  return 0;
+}
+
+// Emit one "object" entry per listed bucket-index key.
+static void dump_bucket_index(const vector<rgw_bucket_dir_entry>& objs,  Formatter *f)
+{
+  for (const auto& entry : objs) {
+    f->dump_string("object", entry.key.name);
+  }
+}
+
+// Render a "usage" section with one subsection per storage category.
+static void dump_bucket_usage(map<RGWObjCategory, RGWStorageStats>& stats, Formatter *formatter)
+{
+  formatter->open_object_section("usage");
+  for (auto& [category, s] : stats) {
+    formatter->open_object_section(to_string(category));
+    s.dump(formatter);
+    formatter->close_section();
+  }
+  formatter->close_section();
+}
+
+// Render existing vs. freshly calculated bucket stats side by side
+// inside a "check_result" section.
+// NOTE(review): both maps are taken by value, copying every entry —
+// const& would avoid this but requires dump_bucket_usage to accept
+// const& as well.
+static void dump_index_check(map<RGWObjCategory, RGWStorageStats> existing_stats,
+        map<RGWObjCategory, RGWStorageStats> calculated_stats,
+        Formatter *formatter)
+{
+  formatter->open_object_section("check_result");
+  formatter->open_object_section("existing_header");
+  dump_bucket_usage(existing_stats, formatter);
+  formatter->close_section();
+  formatter->open_object_section("calculated_header");
+  dump_bucket_usage(calculated_stats, formatter);
+  formatter->close_section();
+  formatter->close_section();
+}
+
+// Scan the multipart namespace of the bucket index for orphaned upload
+// parts: any "<name>.<suffix>" entry whose "<name>.meta" object is
+// missing is considered invalid.  The invalid keys are dumped through
+// the flusher and, with --fix-index, removed from the index in batches.
+// NOTE(review): params.marker is never advanced between iterations —
+// confirm bucket->list() maintains its own cursor via 'results',
+// otherwise this loop would re-list the same page.
+int RGWBucket::check_bad_index_multipart(RGWBucketAdminOpState& op_state,
+                                        RGWFormatterFlusher& flusher,
+                                        const DoutPrefixProvider *dpp,
+                                        std::string *err_msg)
+{
+  const bool fix_index = op_state.will_fix_index();
+
+  bucket = op_state.get_bucket()->clone();
+
+  rgw::sal::Bucket::ListParams params;
+  params.list_versions = true;
+  params.ns = RGW_OBJ_NS_MULTIPART;
+
+  // meta_objs: names for which a ".meta" object exists
+  // all_objs: every non-meta entry, keyed by index key, valued by base name
+  std::map<std::string, bool> meta_objs;
+  std::map<rgw_obj_index_key, std::string> all_objs;
+  bool is_truncated;
+  do {
+    rgw::sal::Bucket::ListResults results;
+    int r = bucket->list(dpp, params, listing_max_entries, results, null_yield);
+    if (r < 0) {
+      set_err_msg(err_msg, "failed to list objects in bucket=" + bucket->get_name() +
+              " err=" +  cpp_strerror(-r));
+
+      return r;
+    }
+    is_truncated = results.is_truncated;
+
+    for (const auto& o : results.objs) {
+      rgw_obj_index_key key = o.key;
+      rgw_obj obj(bucket->get_key(), key);
+      std::string oid = obj.get_oid();
+
+      // split on the last '.' to separate the part suffix from the name
+      int pos = oid.find_last_of('.');
+      if (pos < 0) {
+        /* obj has no suffix */
+        all_objs[key] = oid;
+      } else {
+        /* obj has suffix */
+       std::string name = oid.substr(0, pos);
+       std::string suffix = oid.substr(pos + 1);
+
+        if (suffix.compare("meta") == 0) {
+          meta_objs[name] = true;
+        } else {
+          all_objs[key] = name;
+        }
+      }
+    }
+  } while (is_truncated);
+
+  std::list<rgw_obj_index_key> objs_to_unlink;
+  Formatter *f =  flusher.get_formatter();
+
+  f->open_array_section("invalid_multipart_entries");
+
+  for (const auto& o : all_objs) {
+    const std::string& name = o.second;
+    // an entry with no matching ".meta" object is an orphan
+    if (meta_objs.find(name) == meta_objs.end()) {
+      objs_to_unlink.push_back(o.first);
+    }
+
+    // flush/remove in batches to bound memory usage
+    if (objs_to_unlink.size() > listing_max_entries) {
+      if (fix_index) {
+       // note: under rados this removes directly from rados index objects
+       int r = bucket->remove_objs_from_index(dpp, objs_to_unlink);
+       if (r < 0) {
+         set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " +
+                     cpp_strerror(-r));
+         return r;
+       }
+      }
+
+      dump_mulipart_index_results(objs_to_unlink, f);
+      flusher.flush();
+      objs_to_unlink.clear();
+    }
+  }
+
+  // handle the final partial batch
+  if (fix_index) {
+    // note: under rados this removes directly from rados index objects
+    int r = bucket->remove_objs_from_index(dpp, objs_to_unlink);
+    if (r < 0) {
+      set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " +
+              cpp_strerror(-r));
+
+      return r;
+    }
+  }
+
+  dump_mulipart_index_results(objs_to_unlink, f);
+  f->close_section();
+  flusher.flush();
+
+  return 0;
+}
+
+int RGWBucket::check_object_index(const DoutPrefixProvider *dpp, 
+                                  RGWBucketAdminOpState& op_state,
+                                  RGWFormatterFlusher& flusher,
+                                  optional_yield y,
+                                  std::string *err_msg)
+{
+
+  bool fix_index = op_state.will_fix_index();
+
+  if (!fix_index) {
+    set_err_msg(err_msg, "check-objects flag requires fix index enabled");
+    return -EINVAL;
+  }
+
+  // use a quicker/shorter tag timeout during this process
+  bucket->set_tag_timeout(dpp, BUCKET_TAG_QUICK_TIMEOUT);
+
+  rgw::sal::Bucket::ListResults results;
+  results.is_truncated = true;
+
+  Formatter *formatter = flusher.get_formatter();
+  formatter->open_object_section("objects");
+
+  while (results.is_truncated) {
+    rgw::sal::Bucket::ListParams params;
+    params.marker = results.next_marker;
+    params.force_check_filter = rgw_bucket_object_check_filter;
+
+    int r = bucket->list(dpp, params, listing_max_entries, results, y);
+
+    if (r == -ENOENT) {
+      break;
+    } else if (r < 0) {
+      set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r));
+    }
+
+    dump_bucket_index(results.objs, formatter);
+    flusher.flush();
+  }
+
+  formatter->close_section();
+
+  // restore normal tag timeout for bucket
+  bucket->set_tag_timeout(dpp, 0);
+
+  return 0;
+}
+
+
+// Compare the stats recorded in the bucket index against freshly
+// calculated ones; with --fix-index also rebuild the index afterwards.
+int RGWBucket::check_index(const DoutPrefixProvider *dpp,
+        RGWBucketAdminOpState& op_state,
+        map<RGWObjCategory, RGWStorageStats>& existing_stats,
+        map<RGWObjCategory, RGWStorageStats>& calculated_stats,
+        std::string *err_msg)
+{
+  bool fix_index = op_state.will_fix_index();
+
+  int r = bucket->check_index(dpp, existing_stats, calculated_stats);
+  if (r < 0) {
+    set_err_msg(err_msg, "failed to check index error=" + cpp_strerror(-r));
+    return r;
+  }
+
+  if (fix_index) {
+    r = bucket->rebuild_index(dpp);
+    if (r < 0) {
+      set_err_msg(err_msg, "failed to rebuild index err=" + cpp_strerror(-r));
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+// Enable or disable data sync for the bucket by toggling the
+// BUCKET_DATASYNC_DISABLED flag, then persist the instance info.
+// Only permitted on the metadata master zone.
+int RGWBucket::sync(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+  if (!driver->is_meta_master()) {
+    set_err_msg(err_msg, "ERROR: failed to update bucket sync: only allowed on meta master zone");
+    return -EINVAL;
+  }
+  bool sync = op_state.will_sync_bucket();
+  if (sync) {
+    bucket->get_info().flags &= ~BUCKET_DATASYNC_DISABLED;
+  } else {
+    bucket->get_info().flags |= BUCKET_DATASYNC_DISABLED;
+  }
+
+  // when writing this metadata, RGWSI_BucketIndex_RADOS::handle_overwrite()
+  // will write the corresponding datalog and bilog entries
+  int r = bucket->put_info(dpp, false, real_time());
+  if (r < 0) {
+    set_err_msg(err_msg, "ERROR: failed writing bucket instance info:" + cpp_strerror(-r));
+    return r;
+  }
+
+  return 0;
+}
+
+
+// Decode an encoded ACL policy from 'bl' and serialize it as S3 XML
+// onto 'o'.  Returns 0 on success or the decode error.
+int RGWBucket::policy_bl_to_stream(bufferlist& bl, ostream& o)
+{
+  RGWAccessControlPolicy_S3 policy(g_ceph_context);
+  int ret = decode_bl(bl, policy);
+  if (ret < 0) {
+    ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
+    // bug fix: previously a default-constructed policy was rendered and
+    // 0 returned, masking the decode failure from the caller
+    return ret;
+  }
+  policy.to_xml(o);
+  return 0;
+}
+
+// Read a single xattr from an object via a ReadOp; the attribute value
+// is returned in out_bl.
+int rgw_object_get_attr(const DoutPrefixProvider *dpp,
+                       rgw::sal::Driver* driver, rgw::sal::Object* obj,
+                       const char* attr_name, bufferlist& out_bl, optional_yield y)
+{
+  std::unique_ptr<rgw::sal::Object::ReadOp> rop = obj->get_read_op();
+
+  return rop->get_attr(dpp, attr_name, out_bl, y);
+}
+
+// Fetch the ACL policy for either the object named in op_state (from
+// its RGW_ATTR_ACL xattr) or, when no object is named, from the
+// bucket's own attrs.  Returns -ENOENT when the bucket carries no ACL
+// attribute.
+int RGWBucket::get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy, optional_yield y, const DoutPrefixProvider *dpp)
+{
+  int ret;
+  std::string object_name = op_state.get_object_name();
+
+  bucket = op_state.get_bucket()->clone();
+
+  if (!object_name.empty()) {
+    // object-level ACL
+    bufferlist bl;
+    std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(rgw_obj_key(object_name));
+
+    ret = rgw_object_get_attr(dpp, driver, obj.get(), RGW_ATTR_ACL, bl, y);
+    if (ret < 0){
+      return ret;
+    }
+
+    ret = decode_bl(bl, policy);
+    if (ret < 0) {
+      ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
+    }
+    return ret;
+  }
+
+  // bucket-level ACL
+  map<string, bufferlist>::iterator aiter = bucket->get_attrs().find(RGW_ATTR_ACL);
+  if (aiter == bucket->get_attrs().end()) {
+    return -ENOENT;
+  }
+
+  ret = decode_bl(aiter->second, policy);
+  if (ret < 0) {
+    ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
+  }
+
+  return ret;
+}
+
+
+// Resolve the bucket described by op_state and load its (or its
+// object's) ACL policy into 'policy'.
+int RGWBucketAdminOp::get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                  RGWAccessControlPolicy& policy, const DoutPrefixProvider *dpp)
+{
+  RGWBucket bucket;
+
+  int ret = bucket.init(driver, op_state, null_yield, dpp);
+  if (ret == 0) {
+    ret = bucket.get_policy(op_state, policy, null_yield, dpp);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+  return 0;
+}
+
+/* Wrappers to facilitate RESTful interface */
+
+
+// RESTful wrapper: fetch the ACL and stream it through the flusher as
+// a "policy" section.
+int RGWBucketAdminOp::get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                  RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp)
+{
+  RGWAccessControlPolicy policy(driver->ctx());
+
+  const int ret = get_policy(driver, op_state, policy, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  Formatter *formatter = flusher.get_formatter();
+  flusher.start(0);
+
+  formatter->open_object_section("policy");
+  policy.dump(formatter);
+  formatter->close_section();
+  flusher.flush();
+
+  return 0;
+}
+
+// RESTful wrapper: fetch the ACL as an S3 policy and serialize it as
+// XML onto 'os'.
+int RGWBucketAdminOp::dump_s3_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                  ostream& os, const DoutPrefixProvider *dpp)
+{
+  RGWAccessControlPolicy_S3 policy(driver->ctx());
+
+  if (const int ret = get_policy(driver, op_state, policy, dpp); ret < 0) {
+    return ret;
+  }
+
+  policy.to_xml(os);
+  return 0;
+}
+
+// Remove the user->bucket link for the bucket in op_state.
+// Rados-specific: goes straight through RGWBucketCtl on the RadosStore.
+int RGWBucketAdminOp::unlink(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
+{
+  RGWBucket bucket;
+
+  int ret = bucket.init(driver, op_state, null_yield, dpp);
+  if (ret < 0)
+    return ret;
+
+  return static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->unlink_bucket(op_state.get_user_id(), op_state.get_bucket()->get_info().bucket, null_yield, dpp, true);
+}
+
+// Link (or re-link, possibly under a new tenant/name) a bucket to the
+// user in op_state: unlink from the current ACL owner, reset the ACL to
+// the new owner, rewrite the bucket instance info, re-create the
+// entrypoint, and — on a rename — remove the old entrypoint and
+// instance objects.  Rados-specific (uses RGWBucketCtl directly).
+int RGWBucketAdminOp::link(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, string *err)
+{
+  if (!op_state.is_user_op()) {
+    set_err_msg(err, "empty user id");
+    return -EINVAL;
+  }
+
+  RGWBucket bucket;
+  int ret = bucket.init(driver, op_state, null_yield, dpp, err);
+  if (ret < 0)
+    return ret;
+
+  string bucket_id = op_state.get_bucket_id();
+  std::string display_name = op_state.get_user_display_name();
+  std::unique_ptr<rgw::sal::Bucket> loc_bucket;
+  std::unique_ptr<rgw::sal::Bucket> old_bucket;
+
+  loc_bucket = op_state.get_bucket()->clone();
+
+  // a caller-supplied bucket id must match the resolved bucket
+  if (!bucket_id.empty() && bucket_id != loc_bucket->get_bucket_id()) {
+    set_err_msg(err,
+       "specified bucket id does not match " + loc_bucket->get_bucket_id());
+    return -EINVAL;
+  }
+
+  // keep a copy of the pre-rename identity for cleanup below
+  old_bucket = loc_bucket->clone();
+
+  loc_bucket->get_key().tenant = op_state.get_user_id().tenant;
+
+  // optional rename, possibly with an explicit "tenant/name"
+  if (!op_state.new_bucket_name.empty()) {
+    auto pos = op_state.new_bucket_name.find('/');
+    if (pos != string::npos) {
+      loc_bucket->get_key().tenant = op_state.new_bucket_name.substr(0, pos);
+      loc_bucket->get_key().name = op_state.new_bucket_name.substr(pos + 1);
+    } else {
+      loc_bucket->get_key().name = op_state.new_bucket_name;
+    }
+  }
+
+  RGWObjVersionTracker objv_tracker;
+  RGWObjVersionTracker old_version = loc_bucket->get_info().objv_tracker;
+
+  // the current owner is recovered from the bucket's ACL attribute
+  map<string, bufferlist>::iterator aiter = loc_bucket->get_attrs().find(RGW_ATTR_ACL);
+  if (aiter == loc_bucket->get_attrs().end()) {
+       // should never happen; only pre-argonaut buckets lacked this.
+    ldpp_dout(dpp, 0) << "WARNING: can't bucket link because no acl on bucket=" << old_bucket << dendl;
+    set_err_msg(err,
+       "While crossing the Anavros you have displeased the goddess Hera."
+       "  You must sacrifice your ancient bucket " + loc_bucket->get_bucket_id());
+    return -EINVAL;
+  }
+  bufferlist& aclbl = aiter->second;
+  RGWAccessControlPolicy policy;
+  ACLOwner owner;
+  try {
+   auto iter = aclbl.cbegin();
+   decode(policy, iter);
+   owner = policy.get_owner();
+  } catch (buffer::error& e) {
+    set_err_msg(err, "couldn't decode policy");
+    return -EIO;
+  }
+
+  // detach the bucket from its current owner before re-linking
+  int r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->unlink_bucket(owner.get_id(), old_bucket->get_info().bucket, null_yield, dpp, false);
+  if (r < 0) {
+    set_err_msg(err, "could not unlink policy from user " + owner.get_id().to_str());
+    return r;
+  }
+
+  // now update the user for the bucket...
+  if (display_name.empty()) {
+    ldpp_dout(dpp, 0) << "WARNING: user " << op_state.get_user_id() << " has no display name set" << dendl;
+  }
+
+  // rebuild a default ACL owned by the target user
+  RGWAccessControlPolicy policy_instance;
+  policy_instance.create_default(op_state.get_user_id(), display_name);
+  owner = policy_instance.get_owner();
+
+  aclbl.clear();
+  policy_instance.encode(aclbl);
+
+  // on a rename, write the instance exclusively under the new key
+  bool exclusive = false;
+  loc_bucket->get_info().owner = op_state.get_user_id();
+  if (*loc_bucket != *old_bucket) {
+    loc_bucket->get_info().bucket = loc_bucket->get_key();
+    loc_bucket->get_info().objv_tracker.version_for_read()->ver = 0;
+    exclusive = true;
+  }
+
+  r = loc_bucket->put_info(dpp, exclusive, ceph::real_time());
+  if (r < 0) {
+    set_err_msg(err, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r));
+    return r;
+  }
+
+  /* link to user */
+  RGWBucketEntryPoint ep;
+  ep.bucket = loc_bucket->get_info().bucket;
+  ep.owner = op_state.get_user_id();
+  ep.creation_time = loc_bucket->get_info().creation_time;
+  ep.linked = true;
+  rgw::sal::Attrs ep_attrs;
+  rgw_ep_info ep_data{ep, ep_attrs};
+
+  r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->link_bucket(op_state.get_user_id(), loc_bucket->get_info().bucket, loc_bucket->get_info().creation_time, null_yield, dpp, true, &ep_data);
+  if (r < 0) {
+    set_err_msg(err, "failed to relink bucket");
+    return r;
+  }
+
+  if (*loc_bucket != *old_bucket) {
+    // like RGWRados::delete_bucket -- excepting no bucket_index work.
+    r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->remove_bucket_entrypoint_info(
+                                       old_bucket->get_key(), null_yield, dpp,
+                                       RGWBucketCtl::Bucket::RemoveParams()
+                                       .set_objv_tracker(&ep_data.ep_objv));
+    if (r < 0) {
+      set_err_msg(err, "failed to unlink old bucket " + old_bucket->get_tenant() + "/" + old_bucket->get_name());
+      return r;
+    }
+    r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->remove_bucket_instance_info(
+                                       old_bucket->get_key(), old_bucket->get_info(),
+                                       null_yield, dpp,
+                                       RGWBucketCtl::BucketInstance::RemoveParams()
+                                       .set_objv_tracker(&ep_data.ep_objv));
+    if (r < 0) {
+      set_err_msg(err, "failed to unlink old bucket " + old_bucket->get_tenant() + "/" + old_bucket->get_name());
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+// Initialize the bucket helper from op_state, then delegate the
+// ownership change (resuming from 'marker') to RGWBucket::chown.
+int RGWBucketAdminOp::chown(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const string& marker, const DoutPrefixProvider *dpp, string *err)
+{
+  RGWBucket bucket;
+
+  if (int ret = bucket.init(driver, op_state, null_yield, dpp, err); ret < 0) {
+    return ret;
+  }
+
+  return bucket.chown(op_state, marker, null_yield, dpp, err);
+}
+
+// Admin entry point for bucket index checks: runs the bad-multipart
+// scan, the object index check, and the stats consistency check, then
+// dumps existing vs. calculated stats through the flusher.
+int RGWBucketAdminOp::check_index(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                  RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp)
+{
+  int ret;
+  map<RGWObjCategory, RGWStorageStats> existing_stats;
+  map<RGWObjCategory, RGWStorageStats> calculated_stats;
+
+
+  RGWBucket bucket;
+
+  ret = bucket.init(driver, op_state, null_yield, dpp);
+  if (ret < 0)
+    return ret;
+
+  Formatter *formatter = flusher.get_formatter();
+  flusher.start(0);
+
+  ret = bucket.check_bad_index_multipart(op_state, flusher, dpp);
+  if (ret < 0)
+    return ret;
+
+  ret = bucket.check_object_index(dpp, op_state, flusher, y);
+  if (ret < 0)
+    return ret;
+
+  ret = bucket.check_index(dpp, op_state, existing_stats, calculated_stats);
+  if (ret < 0)
+    return ret;
+
+  dump_index_check(existing_stats, calculated_stats, formatter);
+  flusher.flush();
+
+  return 0;
+}
+
+// Delete a bucket on behalf of the admin API.  With bypass_gc the
+// objects are purged directly (optionally keeping the index
+// consistent); otherwise a regular bucket removal is performed,
+// deleting children if op_state requests it.
+int RGWBucketAdminOp::remove_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                                   optional_yield y, const DoutPrefixProvider *dpp, 
+                                    bool bypass_gc, bool keep_index_consistent)
+{
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(op_state.get_user_id());
+
+  int ret = driver->get_bucket(dpp, user.get(), user->get_tenant(), op_state.get_bucket_name(),
+                             &bucket, y);
+  if (ret < 0)
+    return ret;
+
+  if (bypass_gc)
+    ret = bucket->remove_bucket_bypass_gc(op_state.get_max_aio(), keep_index_consistent, y, dpp);
+  else
+    ret = bucket->remove_bucket(dpp, op_state.will_delete_children(),
+                               false, nullptr, y);
+
+  return ret;
+}
+
+// Bind the helper to the bucket in op_state and delete the named object.
+int RGWBucketAdminOp::remove_object(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
+{
+  RGWBucket bucket;
+
+  if (int ret = bucket.init(driver, op_state, null_yield, dpp); ret < 0) {
+    return ret;
+  }
+
+  return bucket.remove_object(dpp, op_state);
+}
+
+// Toggle the bucket's datasync state per op_state and persist it.
+int RGWBucketAdminOp::sync_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, string *err_msg)
+{
+  RGWBucket bucket;
+
+  if (int ret = bucket.init(driver, op_state, null_yield, dpp, err_msg); ret < 0) {
+    return ret;
+  }
+
+  return bucket.sync(op_state, dpp, err_msg);
+}
+
+// Print a full "stats" section for one bucket: shard layout, ids,
+// versions, per-category usage, quota and tags.  Indexless buckets are
+// rejected since they maintain no stats.
+static int bucket_stats(rgw::sal::Driver* driver,
+                       const std::string& tenant_name,
+                       const std::string& bucket_name,
+                       Formatter *formatter,
+                        const DoutPrefixProvider *dpp)
+{
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  map<RGWObjCategory, RGWStorageStats> stats;
+
+  real_time mtime;  // never assigned below, so "mtime" is dumped as the epoch
+  int ret = driver->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield);
+  if (ret < 0) {
+    return ret;
+  }
+
+  const auto& index = bucket->get_info().get_current_index();
+  if (is_layout_indexless(index)) {
+    cerr << "error, indexless buckets do not maintain stats; bucket=" <<
+      bucket->get_name() << std::endl;
+    return -EINVAL;
+  }
+
+  std::string bucket_ver, master_ver;
+  std::string max_marker;
+  ret = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, &max_marker);
+  if (ret < 0) {
+    cerr << "error getting bucket stats bucket=" << bucket->get_name() << " ret=" << ret << std::endl;
+    return ret;
+  }
+
+  utime_t ut(mtime);
+  utime_t ctime_ut(bucket->get_creation_time());
+
+  formatter->open_object_section("stats");
+  formatter->dump_string("bucket", bucket->get_name());
+  formatter->dump_int("num_shards",
+                     bucket->get_info().layout.current_index.layout.normal.num_shards);
+  formatter->dump_string("tenant", bucket->get_tenant());
+  formatter->dump_string("zonegroup", bucket->get_info().zonegroup);
+  formatter->dump_string("placement_rule", bucket->get_info().placement_rule.to_str());
+  ::encode_json("explicit_placement", bucket->get_key().explicit_placement, formatter);
+  formatter->dump_string("id", bucket->get_bucket_id());
+  formatter->dump_string("marker", bucket->get_marker());
+  formatter->dump_stream("index_type") << bucket->get_info().layout.current_index.layout.type;
+  ::encode_json("owner", bucket->get_info().owner, formatter);
+  formatter->dump_string("ver", bucket_ver);
+  formatter->dump_string("master_ver", master_ver);
+  ut.gmtime(formatter->dump_stream("mtime"));
+  ctime_ut.gmtime(formatter->dump_stream("creation_time"));
+  formatter->dump_string("max_marker", max_marker);
+  dump_bucket_usage(stats, formatter);
+  encode_json("bucket_quota", bucket->get_info().quota, formatter);
+
+  // bucket tags
+  auto iter = bucket->get_attrs().find(RGW_ATTR_TAGS);
+  if (iter != bucket->get_attrs().end()) {
+    RGWObjTagSet_S3 tagset;
+    bufferlist::const_iterator piter{&iter->second};
+    try {
+      tagset.decode(piter);
+      tagset.dump(formatter); 
+    } catch (buffer::error& err) {
+      // best-effort: report but keep the rest of the stats output intact
+      cerr << "ERROR: caught buffer:error, couldn't decode TagSet" << std::endl;
+    }
+  }
+
+  // TODO: bucket CORS
+  // TODO: bucket LC
+  formatter->close_section();
+
+  return 0;
+}
+
+// For each given user, walk all of their buckets and report the
+// per-shard object fill level against rgw_safe_max_objects_per_shard.
+// With warnings_only set, only buckets at or above the warning
+// threshold (rgw_shard_warning_threshold, capped at 90%) are listed.
+int RGWBucketAdminOp::limit_check(rgw::sal::Driver* driver,
+                                 RGWBucketAdminOpState& op_state,
+                                 const std::list<std::string>& user_ids,
+                                 RGWFormatterFlusher& flusher, optional_yield y,
+                                  const DoutPrefixProvider *dpp,
+                                 bool warnings_only)
+{
+  int ret = 0;
+  const size_t max_entries =
+    driver->ctx()->_conf->rgw_list_buckets_max_chunk;
+
+  const size_t safe_max_objs_per_shard =
+    driver->ctx()->_conf->rgw_safe_max_objects_per_shard;
+
+  uint16_t shard_warn_pct =
+    driver->ctx()->_conf->rgw_shard_warning_threshold;
+  if (shard_warn_pct > 100)
+    shard_warn_pct = 90;
+
+  Formatter *formatter = flusher.get_formatter();
+  flusher.start(0);
+
+  formatter->open_array_section("users");
+
+  for (const auto& user_id : user_ids) {
+
+    formatter->open_object_section("user");
+    formatter->dump_string("user_id", user_id);
+    formatter->open_array_section("buckets");
+
+    string marker;
+    rgw::sal::BucketList buckets;
+    do {
+      std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(user_id));
+
+      ret = user->list_buckets(dpp, marker, string(), max_entries, false, buckets, y);
+
+      if (ret < 0)
+        return ret;
+
+      map<string, std::unique_ptr<rgw::sal::Bucket>>& m_buckets = buckets.get_buckets();
+
+      for (const auto& iter : m_buckets) {
+       auto& bucket = iter.second;
+       uint64_t num_objects = 0;
+
+       marker = bucket->get_name(); /* Casey's location for marker update,
+                                    * as we may now not reach the end of
+                                    * the loop body */
+
+       ret = bucket->load_bucket(dpp, null_yield);
+       if (ret < 0)
+         continue;
+
+       const auto& index = bucket->get_info().get_current_index();
+       if (is_layout_indexless(index)) {
+         continue; // indexless buckets don't have stats
+       }
+
+       /* need stats for num_entries */
+       string bucket_ver, master_ver;
+       std::map<RGWObjCategory, RGWStorageStats> stats;
+       ret = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, nullptr);
+
+       if (ret < 0)
+         continue;
+
+       // total objects across all storage categories
+       for (const auto& s : stats) {
+         num_objects += s.second.num_objects;
+       }
+
+       const uint32_t num_shards = rgw::num_shards(index.layout.normal);
+       uint64_t objs_per_shard =
+         (num_shards) ? num_objects/num_shards : num_objects;
+       {
+         bool warn;
+         stringstream ss;
+         uint64_t fill_pct = objs_per_shard * 100 / safe_max_objs_per_shard;
+         if (fill_pct > 100) {
+           ss << "OVER " << fill_pct << "%";
+           warn = true;
+         } else if (fill_pct >= shard_warn_pct) {
+           ss << "WARN " << fill_pct << "%";
+           warn = true;
+         } else {
+           ss << "OK";
+           warn = false;
+         }
+
+         // emit the entry unless we're filtering for warnings only
+         if (warn || !warnings_only) {
+           formatter->open_object_section("bucket");
+           formatter->dump_string("bucket", bucket->get_name());
+           formatter->dump_string("tenant", bucket->get_tenant());
+           formatter->dump_int("num_objects", num_objects);
+           formatter->dump_int("num_shards", num_shards);
+           formatter->dump_int("objects_per_shard", objs_per_shard);
+           formatter->dump_string("fill_status", ss.str());
+           formatter->close_section();
+         }
+       }
+      }
+      formatter->flush(cout);
+    } while (buckets.is_truncated()); /* foreach: bucket */
+
+    formatter->close_section();
+    formatter->close_section();
+    formatter->flush(cout);
+
+  } /* foreach: user_id */
+
+  formatter->close_section();
+  formatter->flush(cout);
+
+  return ret;
+} /* RGWBucketAdminOp::limit_check */
+
// Dump bucket information through the flusher as JSON.  Three modes are
// selected from op_state:
//  - user op: list the user's buckets (optionally with stats), filtered down
//    to one bucket when a bucket name was also supplied;
//  - bucket name only: emit stats for that single bucket;
//  - neither: enumerate every "bucket" key in the metadata backend.
// Returns 0 on success, -ERR_NO_SUCH_BUCKET if a named bucket is missing,
// or another negative error code.
int RGWBucketAdminOp::info(rgw::sal::Driver* driver,
                          RGWBucketAdminOpState& op_state,
                          RGWFormatterFlusher& flusher,
                          optional_yield y,
                           const DoutPrefixProvider *dpp)
{
  RGWBucket bucket;
  int ret = 0;
  const std::string& bucket_name = op_state.get_bucket_name();
  // validate the named bucket up front so we can fail fast
  if (!bucket_name.empty()) {
    ret = bucket.init(driver, op_state, null_yield, dpp);
    if (-ENOENT == ret)
      return -ERR_NO_SUCH_BUCKET;
    else if (ret < 0)
      return ret;
  }

  Formatter *formatter = flusher.get_formatter();
  flusher.start(0);

  CephContext *cct = driver->ctx();

  // page size for each listing round-trip
  const size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;

  const bool show_stats = op_state.will_fetch_stats();
  const rgw_user& user_id = op_state.get_user_id();
  if (op_state.is_user_op()) {
    formatter->open_array_section("buckets");

    rgw::sal::BucketList buckets;
    std::unique_ptr<rgw::sal::User> user = driver->get_user(op_state.get_user_id());
    std::string marker;
    const std::string empty_end_marker;
    constexpr bool no_need_stats = false; // set need_stats to false

    // paginate through the user's buckets using 'marker' as the resume point
    do {
      ret = user->list_buckets(dpp, marker, empty_end_marker, max_entries,
                             no_need_stats, buckets, y);
      if (ret < 0) {
        return ret;
      }

      const std::string* marker_cursor = nullptr;
      map<string, std::unique_ptr<rgw::sal::Bucket>>& m = buckets.get_buckets();

      for (const auto& i : m) {
        const std::string& obj_name = i.first;
        // when a bucket name was given, only report that one bucket
        if (!bucket_name.empty() && bucket_name != obj_name) {
          continue;
        }

        if (show_stats) {
          bucket_stats(driver, user_id.tenant, obj_name, formatter, dpp);
       } else {
          formatter->dump_string("bucket", obj_name);
       }

        marker_cursor = &obj_name;
      } // for loop
      // advance the marker past the last bucket we saw in this page
      if (marker_cursor) {
       marker = *marker_cursor;
      }

      flusher.flush();
    } while (buckets.is_truncated());

    formatter->close_section();
  } else if (!bucket_name.empty()) {
    // single-bucket mode: just dump its stats
    ret = bucket_stats(driver, user_id.tenant, bucket_name, formatter, dpp);
    if (ret < 0) {
      return ret;
    }
  } else {
    // no user, no bucket: walk every bucket known to the metadata backend
    void *handle = nullptr;
    bool truncated = true;

    formatter->open_array_section("buckets");
    ret = driver->meta_list_keys_init(dpp, "bucket", string(), &handle);
    while (ret == 0 && truncated) {
      std::list<std::string> buckets;
      constexpr int max_keys = 1000;
      ret = driver->meta_list_keys_next(dpp, handle, max_keys, buckets,
                                                  &truncated);
      for (auto& bucket_name : buckets) {
        if (show_stats) {
          bucket_stats(driver, user_id.tenant, bucket_name, formatter, dpp);
       } else {
          formatter->dump_string("bucket", bucket_name);
       }
      }
    }
    driver->meta_list_keys_complete(handle);

    formatter->close_section();
  }

  flusher.flush();

  return 0;
}
+
+int RGWBucketAdminOp::set_quota(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
+{
+  RGWBucket bucket;
+
+  int ret = bucket.init(driver, op_state, null_yield, dpp);
+  if (ret < 0)
+    return ret;
+  return bucket.set_quota(op_state, dpp);
+}
+
// Split a "tenant/bucket" key at the first '/' into its (tenant, bucket)
// components.  When no separator is present the tenant part is empty and
// the whole input is returned as the bucket name.
inline auto split_tenant(const std::string& bucket_name){
  const auto sep = bucket_name.find('/');
  if (sep == std::string::npos) {
    return std::make_pair(std::string(), bucket_name);
  }
  return std::make_pair(bucket_name.substr(0, sep), bucket_name.substr(sep + 1));
}
+
+using bucket_instance_ls = std::vector<RGWBucketInfo>;
+void get_stale_instances(rgw::sal::Driver* driver, const std::string& bucket_name,
+                         const vector<std::string>& lst,
+                         bucket_instance_ls& stale_instances,
+                         const DoutPrefixProvider *dpp)
+{
+
+  bucket_instance_ls other_instances;
+// first iterate over the entries, and pick up the done buckets; these
+// are guaranteed to be stale
+  for (const auto& bucket_instance : lst){
+    RGWBucketInfo binfo;
+    std::unique_ptr<rgw::sal::Bucket> bucket;
+    rgw_bucket rbucket;
+    rgw_bucket_parse_bucket_key(driver->ctx(), bucket_instance, &rbucket, nullptr);
+    int r = driver->get_bucket(dpp, nullptr, rbucket, &bucket, null_yield);
+    if (r < 0){
+      // this can only happen if someone deletes us right when we're processing
+      ldpp_dout(dpp, -1) << "Bucket instance is invalid: " << bucket_instance
+                          << cpp_strerror(-r) << dendl;
+      continue;
+    }
+    binfo = bucket->get_info();
+    if (binfo.reshard_status == cls_rgw_reshard_status::DONE)
+      stale_instances.emplace_back(std::move(binfo));
+    else {
+      other_instances.emplace_back(std::move(binfo));
+    }
+  }
+
+  // Read the cur bucket info, if the bucket doesn't exist we can simply return
+  // all the instances
+  auto [tenant, bname] = split_tenant(bucket_name);
+  RGWBucketInfo cur_bucket_info;
+  std::unique_ptr<rgw::sal::Bucket> cur_bucket;
+  int r = driver->get_bucket(dpp, nullptr, tenant, bname, &cur_bucket, null_yield);
+  if (r < 0) {
+    if (r == -ENOENT) {
+      // bucket doesn't exist, everything is stale then
+      stale_instances.insert(std::end(stale_instances),
+                             std::make_move_iterator(other_instances.begin()),
+                             std::make_move_iterator(other_instances.end()));
+    } else {
+      // all bets are off if we can't read the bucket, just return the sureshot stale instances
+      ldpp_dout(dpp, -1) << "error: reading bucket info for bucket: "
+                          << bname << cpp_strerror(-r) << dendl;
+    }
+    return;
+  }
+
+  // Don't process further in this round if bucket is resharding
+  cur_bucket_info = cur_bucket->get_info();
+  if (cur_bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS)
+    return;
+
+  other_instances.erase(std::remove_if(other_instances.begin(), other_instances.end(),
+                                       [&cur_bucket_info](const RGWBucketInfo& b){
+                                         return (b.bucket.bucket_id == cur_bucket_info.bucket.bucket_id ||
+                                                 b.bucket.bucket_id == cur_bucket_info.new_bucket_instance_id);
+                                       }),
+                        other_instances.end());
+
+  // check if there are still instances left
+  if (other_instances.empty()) {
+    return;
+  }
+
+  // Now we have a bucket with instances where the reshard status is none, this
+  // usually happens when the reshard process couldn't complete, lockdown the
+  // bucket and walk through these instances to make sure no one else interferes
+  // with these
+  {
+    RGWBucketReshardLock reshard_lock(static_cast<rgw::sal::RadosStore*>(driver), cur_bucket->get_info(), true);
+    r = reshard_lock.lock(dpp);
+    if (r < 0) {
+      // most likely bucket is under reshard, return the sureshot stale instances
+      ldpp_dout(dpp, 5) << __func__
+                             << "failed to take reshard lock; reshard underway likey" << dendl;
+      return;
+    }
+    auto sg = make_scope_guard([&reshard_lock](){ reshard_lock.unlock();} );
+    // this should be fast enough that we may not need to renew locks and check
+    // exit status?, should we read the values of the instances again?
+    stale_instances.insert(std::end(stale_instances),
+                           std::make_move_iterator(other_instances.begin()),
+                           std::make_move_iterator(other_instances.end()));
+  }
+
+  return;
+}
+
+static int process_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                                   RGWFormatterFlusher& flusher,
+                                   const DoutPrefixProvider *dpp,
+                                   std::function<void(const bucket_instance_ls&,
+                                                      Formatter *,
+                                                      rgw::sal::Driver*)> process_f)
+{
+  std::string marker;
+  void *handle;
+  Formatter *formatter = flusher.get_formatter();
+  static constexpr auto default_max_keys = 1000;
+
+  int ret = driver->meta_list_keys_init(dpp, "bucket.instance", marker, &handle);
+  if (ret < 0) {
+    cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+    return ret;
+  }
+
+  bool truncated;
+
+  formatter->open_array_section("keys");
+  auto g = make_scope_guard([&driver, &handle, &formatter]() {
+                              driver->meta_list_keys_complete(handle);
+                              formatter->close_section(); // keys
+                              formatter->flush(cout);
+                            });
+
+  do {
+    list<std::string> keys;
+
+    ret = driver->meta_list_keys_next(dpp, handle, default_max_keys, keys, &truncated);
+    if (ret < 0 && ret != -ENOENT) {
+      cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+      return ret;
+    } if (ret != -ENOENT) {
+      // partition the list of buckets by buckets as the listing is un sorted,
+      // since it would minimize the reads to bucket_info
+      std::unordered_map<std::string, std::vector<std::string>> bucket_instance_map;
+      for (auto &key: keys) {
+        auto pos = key.find(':');
+        if(pos != std::string::npos)
+          bucket_instance_map[key.substr(0,pos)].emplace_back(std::move(key));
+      }
+      for (const auto& kv: bucket_instance_map) {
+        bucket_instance_ls stale_lst;
+        get_stale_instances(driver, kv.first, kv.second, stale_lst, dpp);
+        process_f(stale_lst, formatter, driver);
+      }
+    }
+  } while (truncated);
+
+  return 0;
+}
+
+int RGWBucketAdminOp::list_stale_instances(rgw::sal::Driver* driver,
+                                           RGWBucketAdminOpState& op_state,
+                                           RGWFormatterFlusher& flusher,
+                                           const DoutPrefixProvider *dpp)
+{
+  auto process_f = [](const bucket_instance_ls& lst,
+                      Formatter *formatter,
+                      rgw::sal::Driver*){
+                     for (const auto& binfo: lst)
+                       formatter->dump_string("key", binfo.bucket.get_key());
+                   };
+  return process_stale_instances(driver, op_state, flusher, dpp, process_f);
+}
+
+
+int RGWBucketAdminOp::clear_stale_instances(rgw::sal::Driver* driver,
+                                            RGWBucketAdminOpState& op_state,
+                                            RGWFormatterFlusher& flusher,
+                                            const DoutPrefixProvider *dpp)
+{
+  auto process_f = [dpp](const bucket_instance_ls& lst,
+                      Formatter *formatter,
+                      rgw::sal::Driver* driver){
+                     for (const auto &binfo: lst) {
+                      std::unique_ptr<rgw::sal::Bucket> bucket;
+                      driver->get_bucket(nullptr, binfo, &bucket);
+                      int ret = bucket->purge_instance(dpp);
+                       if (ret == 0){
+                         auto md_key = "bucket.instance:" + binfo.bucket.get_key();
+                         ret = driver->meta_remove(dpp, md_key, null_yield);
+                       }
+                       formatter->open_object_section("delete_status");
+                       formatter->dump_string("bucket_instance", binfo.bucket.get_key());
+                       formatter->dump_int("status", -ret);
+                       formatter->close_section();
+                     }
+                   };
+
+  return process_stale_instances(driver, op_state, flusher, dpp, process_f);
+}
+
+static int fix_single_bucket_lc(rgw::sal::Driver* driver,
+                                const std::string& tenant_name,
+                                const std::string& bucket_name,
+                                const DoutPrefixProvider *dpp)
+{
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  int ret = driver->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield);
+  if (ret < 0) {
+    // TODO: Should we handle the case where the bucket could've been removed between
+    // listing and fetching?
+    return ret;
+  }
+
+  return rgw::lc::fix_lc_shard_entry(dpp, driver, driver->get_rgwlc()->get_lc(), bucket.get());
+}
+
+static void format_lc_status(Formatter* formatter,
+                             const std::string& tenant_name,
+                             const std::string& bucket_name,
+                             int status)
+{
+  formatter->open_object_section("bucket_entry");
+  std::string entry = tenant_name.empty() ? bucket_name : tenant_name + "/" + bucket_name;
+  formatter->dump_string("bucket", entry);
+  formatter->dump_int("status", status);
+  formatter->close_section(); // bucket_entry
+}
+
+static void process_single_lc_entry(rgw::sal::Driver* driver,
+                                   Formatter *formatter,
+                                    const std::string& tenant_name,
+                                    const std::string& bucket_name,
+                                    const DoutPrefixProvider *dpp)
+{
+  int ret = fix_single_bucket_lc(driver, tenant_name, bucket_name, dpp);
+  format_lc_status(formatter, tenant_name, bucket_name, -ret);
+}
+
+int RGWBucketAdminOp::fix_lc_shards(rgw::sal::Driver* driver,
+                                    RGWBucketAdminOpState& op_state,
+                                    RGWFormatterFlusher& flusher,
+                                    const DoutPrefixProvider *dpp)
+{
+  std::string marker;
+  void *handle;
+  Formatter *formatter = flusher.get_formatter();
+  static constexpr auto default_max_keys = 1000;
+
+  bool truncated;
+  if (const std::string& bucket_name = op_state.get_bucket_name();
+      ! bucket_name.empty()) {
+    const rgw_user user_id = op_state.get_user_id();
+    process_single_lc_entry(driver, formatter, user_id.tenant, bucket_name, dpp);
+    formatter->flush(cout);
+  } else {
+    int ret = driver->meta_list_keys_init(dpp, "bucket", marker, &handle);
+    if (ret < 0) {
+      std::cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+      return ret;
+    }
+
+    {
+      formatter->open_array_section("lc_fix_status");
+      auto sg = make_scope_guard([&driver, &handle, &formatter](){
+                                   driver->meta_list_keys_complete(handle);
+                                   formatter->close_section(); // lc_fix_status
+                                   formatter->flush(cout);
+                                 });
+      do {
+        list<std::string> keys;
+        ret = driver->meta_list_keys_next(dpp, handle, default_max_keys, keys, &truncated);
+        if (ret < 0 && ret != -ENOENT) {
+          std::cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+          return ret;
+        } if (ret != -ENOENT) {
+          for (const auto &key:keys) {
+            auto [tenant_name, bucket_name] = split_tenant(key);
+            process_single_lc_entry(driver, formatter, tenant_name, bucket_name, dpp);
+          }
+        }
+        formatter->flush(cout); // regularly flush every 1k entries
+      } while (truncated);
+    }
+
+  }
+  return 0;
+
+}
+
+static bool has_object_expired(const DoutPrefixProvider *dpp,
+                              rgw::sal::Driver* driver,
+                              rgw::sal::Bucket* bucket,
+                              const rgw_obj_key& key, utime_t& delete_at)
+{
+  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
+  bufferlist delete_at_bl;
+
+  int ret = rgw_object_get_attr(dpp, driver, obj.get(), RGW_ATTR_DELETE_AT, delete_at_bl, null_yield);
+  if (ret < 0) {
+    return false;  // no delete at attr, proceed
+  }
+
+  ret = decode_bl(delete_at_bl, delete_at);
+  if (ret < 0) {
+    return false;  // failed to parse
+  }
+
+  if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) {
+    return true;
+  }
+
+  return false;
+}
+
// Scan a resharded bucket for objects whose delete-at timestamp has passed
// and, unless dry_run is set, remove them; results are reported into an
// "expired_deletion_status" array on the flusher's formatter.
// Returns 0 on success (including the not-resharded no-op case) or a
// negative error code from listing.
static int fix_bucket_obj_expiry(const DoutPrefixProvider *dpp,
                                rgw::sal::Driver* driver,
                                rgw::sal::Bucket* bucket,
                                RGWFormatterFlusher& flusher, bool dry_run)
{
  // bucket_id == marker indicates the bucket was never resharded; nothing to fix
  if (bucket->get_key().bucket_id == bucket->get_key().marker) {
    ldpp_dout(dpp, -1) << "Not a resharded bucket skipping" << dendl;
    return 0;  // not a resharded bucket, move along
  }

  Formatter *formatter = flusher.get_formatter();
  formatter->open_array_section("expired_deletion_status");
  // close the section and flush on every exit path
  auto sg = make_scope_guard([&formatter] {
                              formatter->close_section();
                              formatter->flush(std::cout);
                            });

  rgw::sal::Bucket::ListParams params;
  rgw::sal::Bucket::ListResults results;

  // include all versions on versioned buckets; unordered listing is cheaper
  params.list_versions = bucket->versioned();
  params.allow_unordered = true;

  do {
    int ret = bucket->list(dpp, params, listing_max_entries, results, null_yield);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "ERROR failed to list objects in the bucket" << dendl;
      return ret;
    }
    for (const auto& obj : results.objs) {
      rgw_obj_key key(obj.key);
      utime_t delete_at;
      if (has_object_expired(dpp, driver, bucket, key, delete_at)) {
       formatter->open_object_section("object_status");
       formatter->dump_string("object", key.name);
       formatter->dump_stream("delete_at") << delete_at;

       // dry_run only reports; otherwise delete and record the status
       if (!dry_run) {
         ret = rgw_remove_object(dpp, driver, bucket, key);
         formatter->dump_int("status", ret);
       }

       formatter->close_section();  // object_status
      }
    }
    formatter->flush(cout); // regularly flush every 1k entries
  } while (results.is_truncated);

  return 0;
}
+
+int RGWBucketAdminOp::fix_obj_expiry(rgw::sal::Driver* driver,
+                                    RGWBucketAdminOpState& op_state,
+                                    RGWFormatterFlusher& flusher,
+                                     const DoutPrefixProvider *dpp, bool dry_run)
+{
+  RGWBucket admin_bucket;
+  int ret = admin_bucket.init(driver, op_state, null_yield, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "failed to initialize bucket" << dendl;
+    return ret;
+  }
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  ret = driver->get_bucket(nullptr, admin_bucket.get_bucket_info(), &bucket);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return fix_bucket_obj_expiry(dpp, driver, bucket.get(), flusher, dry_run);
+}
+
// JSON-serialize the bucket info together with its xattrs.
void RGWBucketCompleteInfo::dump(Formatter *f) const {
  encode_json("bucket_info", info, f);
  encode_json("attrs", attrs, f);
}
+
// Inverse of dump(): populate bucket info and xattrs from JSON.
void RGWBucketCompleteInfo::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("bucket_info", info, obj);
  JSONDecoder::decode_json("attrs", attrs, obj);
}
+
// Metadata handler for "bucket" entries (bucket entrypoints).  Bridges the
// generic metadata backend (get/put/remove of JSON-encoded RGWBucketEntryPoint
// records) to the bucket service/controller layers.
class RGWBucketMetadataHandler : public RGWBucketMetadataHandlerBase {
public:
  // service dependencies, injected via init()
  struct Svc {
    RGWSI_Bucket *bucket{nullptr};
  } svc;

  // controller dependencies, injected via init()
  struct Ctl {
    RGWBucketCtl *bucket{nullptr};
  } ctl;

  RGWBucketMetadataHandler() {}

  // Wire up the handler to the bucket service's entrypoint backend handler.
  void init(RGWSI_Bucket *bucket_svc,
            RGWBucketCtl *bucket_ctl) override {
    base_init(bucket_svc->ctx(),
              bucket_svc->get_ep_be_handler().get());
    svc.bucket = bucket_svc;
    ctl.bucket = bucket_ctl;
  }

  // metadata section name for this handler
  string get_type() override { return "bucket"; }

  // Decode a JSON metadata object into an entrypoint wrapper; returns
  // nullptr on decode failure.
  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
    RGWBucketEntryPoint be;

    try {
      decode_json_obj(be, jo);
    } catch (JSONDecoder::err& e) {
      return nullptr;
    }

    return new RGWBucketEntryMetadataObject(be, objv, mtime);
  }

  // Read the entrypoint for 'entry' and return it (with version, mtime and
  // attrs) as a freshly allocated metadata object owned by the caller.
  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
    RGWObjVersionTracker ot;
    RGWBucketEntryPoint be;

    real_time mtime;
    map<string, bufferlist> attrs;

    RGWSI_Bucket_EP_Ctx ctx(op->ctx());

    int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &ot, &mtime, &attrs, y, dpp);
    if (ret < 0)
      return ret;

    RGWBucketEntryMetadataObject *mdo = new RGWBucketEntryMetadataObject(be, ot.read_version, mtime, std::move(attrs));

    *obj = mdo;

    return 0;
  }

  // Store an entrypoint; implemented below via RGWMetadataHandlerPut_Bucket.
  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
             RGWMetadataObject *obj,
             RGWObjVersionTracker& objv_tracker,
             optional_yield y,
             const DoutPrefixProvider *dpp,
             RGWMDLogSyncType type, bool from_remote_zone) override;

  // Unlink the bucket from its owner and delete the entrypoint.  Failures
  // are logged but the operation reports success (idempotent removal).
  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
                optional_yield y, const DoutPrefixProvider *dpp) override {
    RGWBucketEntryPoint be;

    real_time orig_mtime;

    RGWSI_Bucket_EP_Ctx ctx(op->ctx());

    int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &objv_tracker, &orig_mtime, nullptr, y, dpp);
    if (ret < 0)
      return ret;

    /*
     * We're unlinking the bucket but we don't want to update the entrypoint here - we're removing
     * it immediately and don't want to invalidate our cached objv_version or the bucket obj removal
     * will incorrectly fail.
     */
    ret = ctl.bucket->unlink_bucket(be.owner, be.bucket, y, dpp, false);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
    }

    ret = svc.bucket->remove_bucket_entrypoint_info(ctx, entry, &objv_tracker, y, dpp);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "could not delete bucket=" << entry << dendl;
    }
    /* idempotent */
    return 0;
  }

  // Run f inside a backend op context with default context params.
  int call(std::function<int(RGWSI_Bucket_EP_Ctx& ctx)> f) {
    return call(nullopt, f);
  }

  // Run f inside a backend op context, optionally overriding context params.
  int call(std::optional<RGWSI_MetaBackend_CtxParams> bectx_params,
           std::function<int(RGWSI_Bucket_EP_Ctx& ctx)> f) {
    return be_handler->call(bectx_params, [&](RGWSI_MetaBackend_Handler::Op *op) {
      RGWSI_Bucket_EP_Ctx ctx(op->ctx());
      return f(ctx);
    });
  }
};
+
+class RGWMetadataHandlerPut_Bucket : public RGWMetadataHandlerPut_SObj
+{
+  RGWBucketMetadataHandler *bhandler;
+  RGWBucketEntryMetadataObject *obj;
+public:
+  RGWMetadataHandlerPut_Bucket(RGWBucketMetadataHandler *_handler,
+                               RGWSI_MetaBackend_Handler::Op *op, string& entry,
+                               RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+                              optional_yield y,
+                               RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, op, entry, obj, objv_tracker, y, type, from_remote_zone),
+                                                        bhandler(_handler) {
+    obj = static_cast<RGWBucketEntryMetadataObject *>(_obj);
+  }
+  ~RGWMetadataHandlerPut_Bucket() {}
+
+  void encode_obj(bufferlist *bl) override {
+    obj->get_ep().encode(*bl);
+  }
+
+  int put_checked(const DoutPrefixProvider *dpp) override;
+  int put_post(const DoutPrefixProvider *dpp) override;
+};
+
// Store a bucket entrypoint via the generic put pipeline: wrap the incoming
// metadata object in a RGWMetadataHandlerPut_Bucket and run it through
// do_put_operate() (which drives put_checked/put_post).
int RGWBucketMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
                                     RGWMetadataObject *obj,
                                     RGWObjVersionTracker& objv_tracker,
                                    optional_yield y,
                                     const DoutPrefixProvider *dpp,
                                     RGWMDLogSyncType type, bool from_remote_zone)
{
  RGWMetadataHandlerPut_Bucket put_op(this, op, entry, obj, objv_tracker, y, type, from_remote_zone);
  return do_put_operate(&put_op, dpp);
}
+
// Persist the (already conflict-checked) entrypoint.  If a previous version
// of the object exists, carry its attrs over to the new entrypoint.
int RGWMetadataHandlerPut_Bucket::put_checked(const DoutPrefixProvider *dpp)
{
  RGWBucketEntryMetadataObject *orig_obj = static_cast<RGWBucketEntryMetadataObject *>(old_obj);

  if (orig_obj) {
    // preserve the existing object's attrs on overwrite
    obj->set_pattrs(&orig_obj->get_attrs());
  }

  auto& be = obj->get_ep();
  auto mtime = obj->get_mtime();
  auto pattrs = obj->get_pattrs();

  RGWSI_Bucket_EP_Ctx ctx(op->ctx());

  // 'false' here presumably disables exclusive-create so overwrites are
  // allowed (the archive path passes 'true') — TODO confirm against
  // store_bucket_entrypoint_info()'s signature
  return bhandler->svc.bucket->store_bucket_entrypoint_info(ctx, entry,
                                                           be,
                                                           false,
                                                           mtime,
                                                           pattrs,
                                                           &objv_tracker,
                                                          y,
                                                           dpp);
}
+
+int RGWMetadataHandlerPut_Bucket::put_post(const DoutPrefixProvider *dpp)
+{
+  auto& be = obj->get_ep();
+
+  int ret;
+
+  /* link bucket */
+  if (be.linked) {
+    ret = bhandler->ctl.bucket->link_bucket(be.owner, be.bucket, be.creation_time, y, dpp, false);
+  } else {
+    ret = bhandler->ctl.bucket->unlink_bucket(be.owner, be.bucket, y, dpp, false);
+  }
+
+  return ret;
+}
+
+static void get_md5_digest(const RGWBucketEntryPoint *be, string& md5_digest) {
+
+   char md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+   unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+   bufferlist bl;
+
+   Formatter *f = new JSONFormatter(false);
+   be->dump(f);
+   f->flush(bl);
+
+   MD5 hash;
+   // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+   hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+   hash.Update((const unsigned char *)bl.c_str(), bl.length());
+   hash.Final(m);
+
+   buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, md5);
+
+   delete f;
+
+   md5_digest = md5;
+}
+
#define ARCHIVE_META_ATTR RGW_ATTR_PREFIX "zone.archive.info"

// Payload stored under ARCHIVE_META_ATTR on archived bucket instances,
// remembering the bucket's original name across archive renames.
struct archive_meta_info {
  rgw_bucket orig_bucket;

  // Load from an attr map; returns false if the attr is absent or fails to
  // decode (a decode failure is logged).
  bool from_attrs(CephContext *cct, map<string, bufferlist>& attrs) {
    auto iter = attrs.find(ARCHIVE_META_ATTR);
    if (iter == attrs.end()) {
      return false;
    }

    auto bliter = iter->second.cbegin();
    try {
      // unqualified call resolves to the member decode() below
      decode(bliter);
    } catch (buffer::error& err) {
      ldout(cct, 0) << "ERROR: failed to decode archive meta info" << dendl;
      return false;
    }

    return true;
  }

  // Serialize into the attr map under ARCHIVE_META_ATTR.
  void store_in_attrs(map<string, bufferlist>& attrs) const {
    encode(attrs[ARCHIVE_META_ATTR]);
  }

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(orig_bucket, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
     DECODE_START(1, bl);
     decode(orig_bucket, bl);
     DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER(archive_meta_info)
+
+class RGWArchiveBucketMetadataHandler : public RGWBucketMetadataHandler {
+public:
+  RGWArchiveBucketMetadataHandler() {}
+
+  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+                optional_yield y, const DoutPrefixProvider *dpp) override {
+    auto cct = svc.bucket->ctx();
+
+    RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+
+    ldpp_dout(dpp, 5) << "SKIP: bucket removal is not allowed on archive zone: bucket:" << entry << " ... proceeding to rename" << dendl;
+
+    string tenant_name, bucket_name;
+    parse_bucket(entry, &tenant_name, &bucket_name);
+    rgw_bucket entry_bucket;
+    entry_bucket.tenant = tenant_name;
+    entry_bucket.name = bucket_name;
+
+    real_time mtime;
+
+    /* read original entrypoint */
+
+    RGWBucketEntryPoint be;
+    map<string, bufferlist> attrs;
+    int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &objv_tracker, &mtime, &attrs, y, dpp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    string bi_meta_name = RGWSI_Bucket::get_bi_meta_key(be.bucket);
+
+    /* read original bucket instance info */
+
+    map<string, bufferlist> attrs_m;
+    ceph::real_time orig_mtime;
+    RGWBucketInfo old_bi;
+
+    ret = ctl.bucket->read_bucket_instance_info(be.bucket, &old_bi, y, dpp, RGWBucketCtl::BucketInstance::GetParams()
+                                                                    .set_mtime(&orig_mtime)
+                                                                    .set_attrs(&attrs_m));
+    if (ret < 0) {
+        return ret;
+    }
+
+    archive_meta_info ami;
+
+    if (!ami.from_attrs(svc.bucket->ctx(), attrs_m)) {
+      ami.orig_bucket = old_bi.bucket;
+      ami.store_in_attrs(attrs_m);
+    }
+
+    /* generate a new bucket instance. We could have avoided this if we could just point a new
+     * bucket entry point to the old bucket instance, however, due to limitation in the way
+     * we index buckets under the user, bucket entrypoint and bucket instance of the same
+     * bucket need to have the same name, so we need to copy the old bucket instance into
+     * to a new entry with the new name
+     */
+
+    string new_bucket_name;
+
+    RGWBucketInfo new_bi = old_bi;
+    RGWBucketEntryPoint new_be = be;
+
+    string md5_digest;
+
+    get_md5_digest(&new_be, md5_digest);
+    new_bucket_name = ami.orig_bucket.name + "-deleted-" + md5_digest;
+
+    new_bi.bucket.name = new_bucket_name;
+    new_bi.objv_tracker.clear();
+
+    new_be.bucket.name = new_bucket_name;
+
+    ret = ctl.bucket->store_bucket_instance_info(be.bucket, new_bi, y, dpp, RGWBucketCtl::BucketInstance::PutParams()
+                                                                    .set_exclusive(false)
+                                                                    .set_mtime(orig_mtime)
+                                                                    .set_attrs(&attrs_m)
+                                                                    .set_orig_info(&old_bi));
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket instance info for bucket=" << new_bi.bucket << " ret=" << ret << dendl;
+      return ret;
+    }
+
+    /* store a new entrypoint */
+
+    RGWObjVersionTracker ot;
+    ot.generate_new_write_ver(cct);
+
+    ret = svc.bucket->store_bucket_entrypoint_info(ctx, RGWSI_Bucket::get_entrypoint_meta_key(new_be.bucket),
+                                                   new_be, true, mtime, &attrs, nullptr, y, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl;
+      return ret;
+    }
+
+    /* link new bucket */
+
+    ret = ctl.bucket->link_bucket(new_be.owner, new_be.bucket, new_be.creation_time, y, dpp, false);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to link new bucket for bucket=" << new_be.bucket << " ret=" << ret << dendl;
+      return ret;
+    }
+
+    /* clean up old stuff */
+
+    ret = ctl.bucket->unlink_bucket(be.owner, entry_bucket, y, dpp, false);
+    if (ret < 0) {
+        ldpp_dout(dpp, -1) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
+    }
+
+    // if (ret == -ECANCELED) it means that there was a race here, and someone
+    // wrote to the bucket entrypoint just before we removed it. The question is
+    // whether it was a newly created bucket entrypoint ...  in which case we
+    // should ignore the error and move forward, or whether it is a higher version
+    // of the same bucket instance ... in which we should retry
+    ret = svc.bucket->remove_bucket_entrypoint_info(ctx,
+                                                    RGWSI_Bucket::get_entrypoint_meta_key(be.bucket),
+                                                    &objv_tracker,
+                                                    y,
+                                                    dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl;
+      return ret;
+    }
+
+    ret = ctl.bucket->remove_bucket_instance_info(be.bucket, old_bi, y, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "could not delete bucket=" << entry << dendl;
+    }
+
+
+    /* idempotent */
+
+    return 0;
+  }
+
+  // Archive zone: if this put targets one of the "-deleted-" renamed entry
+  // points we created while archiving, remove the stale archived copy first
+  // so the incoming entry can be stored cleanly, then delegate to the
+  // regular bucket metadata put.
+  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+             RGWMetadataObject *obj,
+             RGWObjVersionTracker& objv_tracker,
+             optional_yield y, const DoutPrefixProvider *dpp,
+             RGWMDLogSyncType type, bool from_remote_zone) override {
+    if (entry.find("-deleted-") != string::npos) {
+      RGWObjVersionTracker ot;
+      RGWMetadataObject *robj;
+      int ret = do_get(op, entry, &robj, y, dpp);
+      if (ret != -ENOENT) {
+        if (ret < 0) {
+          return ret;
+        }
+        // remove at the version we just read so a racing writer is detected
+        ot.read_version = robj->get_version();
+        delete robj;
+
+        ret = do_remove(op, entry, ot, y, dpp);
+        if (ret < 0) {
+          return ret;
+        }
+      }
+    }
+
+    return RGWBucketMetadataHandler::do_put(op, entry, obj,
+                                            objv_tracker, y, dpp, type, from_remote_zone);
+  }
+
+};
+
+// Metadata handler for "bucket.instance" entries: the per-instance bucket
+// info (RGWBucketInfo plus its attrs) stored under the bucket-instance
+// metadata key.
+class RGWBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandlerBase {
+  // Read one bucket-instance entry (info + attrs) through the bucket service.
+  int read_bucket_instance_entry(RGWSI_Bucket_BI_Ctx& ctx,
+                                 const string& entry,
+                                 RGWBucketCompleteInfo *bi,
+                                 ceph::real_time *pmtime,
+                                 optional_yield y,
+                                 const DoutPrefixProvider *dpp) {
+    return svc.bucket->read_bucket_instance_info(ctx,
+                                                 entry,
+                                                 &bi->info,
+                                                 pmtime, &bi->attrs,
+                                                 y,
+                                                 dpp);
+  }
+
+public:
+  // Services wired in via init(); all are borrowed, not owned.
+  struct Svc {
+    RGWSI_Zone *zone{nullptr};
+    RGWSI_Bucket *bucket{nullptr};
+    RGWSI_BucketIndex *bi{nullptr};
+  } svc;
+
+  rgw::sal::Driver* driver;
+
+  RGWBucketInstanceMetadataHandler(rgw::sal::Driver* driver)
+    : driver(driver) {}
+
+  void init(RGWSI_Zone *zone_svc,
+           RGWSI_Bucket *bucket_svc,
+           RGWSI_BucketIndex *bi_svc) override {
+    base_init(bucket_svc->ctx(),
+              bucket_svc->get_bi_be_handler().get());
+    svc.zone = zone_svc;
+    svc.bucket = bucket_svc;
+    svc.bi = bi_svc;
+  }
+
+  string get_type() override { return "bucket.instance"; }
+
+  // Decode a metadata object from JSON; returns nullptr on decode failure.
+  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
+    RGWBucketCompleteInfo bci;
+
+    try {
+      decode_json_obj(bci, jo);
+    } catch (JSONDecoder::err& e) {
+      return nullptr;
+    }
+
+    return new RGWBucketInstanceMetadataObject(bci, objv, mtime);
+  }
+
+  // Fetch the entry and wrap it in a metadata object (caller owns *obj).
+  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
+    RGWBucketCompleteInfo bci;
+    real_time mtime;
+
+    RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+
+    int ret = svc.bucket->read_bucket_instance_info(ctx, entry, &bci.info, &mtime, &bci.attrs, y, dpp);
+    if (ret < 0)
+      return ret;
+
+    RGWBucketInstanceMetadataObject *mdo = new RGWBucketInstanceMetadataObject(bci, bci.info.objv_tracker.read_version, mtime);
+
+    *obj = mdo;
+
+    return 0;
+  }
+
+  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+             RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+            optional_yield y, const DoutPrefixProvider *dpp,
+             RGWMDLogSyncType sync_type, bool from_remote_zone) override;
+
+  // Removal is idempotent: a missing entry (-ENOENT) still proceeds so the
+  // remove call below can clean up any partial state.
+  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+                optional_yield y, const DoutPrefixProvider *dpp) override {
+    RGWBucketCompleteInfo bci;
+
+    RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+
+    int ret = read_bucket_instance_entry(ctx, entry, &bci, nullptr, y, dpp);
+    if (ret < 0 && ret != -ENOENT)
+      return ret;
+
+    return svc.bucket->remove_bucket_instance_info(ctx, entry, bci.info, &bci.info.objv_tracker, y, dpp);
+  }
+
+  // Run f inside a backend op context (optionally with explicit ctx params).
+  int call(std::function<int(RGWSI_Bucket_BI_Ctx& ctx)> f) {
+    return call(nullopt, f);
+  }
+
+  int call(std::optional<RGWSI_MetaBackend_CtxParams> bectx_params,
+           std::function<int(RGWSI_Bucket_BI_Ctx& ctx)> f) {
+    return be_handler->call(bectx_params, [&](RGWSI_MetaBackend_Handler::Op *op) {
+      RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+      return f(ctx);
+    });
+  }
+};
+
+class RGWMetadataHandlerPut_BucketInstance : public RGWMetadataHandlerPut_SObj
+{
+  CephContext *cct;
+  RGWBucketInstanceMetadataHandler *bihandler;
+  RGWBucketInstanceMetadataObject *obj;
+public:
+  RGWMetadataHandlerPut_BucketInstance(CephContext *_cct,
+                                       RGWBucketInstanceMetadataHandler *_handler,
+                                       RGWSI_MetaBackend_Handler::Op *_op, string& entry,
+                                       RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+                                      optional_yield y,
+                                       RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, _op, entry, obj, objv_tracker, y, type, from_remote_zone),
+                                       cct(_cct), bihandler(_handler) {
+    obj = static_cast<RGWBucketInstanceMetadataObject *>(_obj);
+
+    auto& bci = obj->get_bci();
+    obj->set_pattrs(&bci.attrs);
+  }
+
+  void encode_obj(bufferlist *bl) override {
+    obj->get_bucket_info().encode(*bl);
+  }
+
+  int put_check(const DoutPrefixProvider *dpp) override;
+  int put_checked(const DoutPrefixProvider *dpp) override;
+  int put_post(const DoutPrefixProvider *dpp) override;
+};
+
+// Apply one bucket.instance metadata entry by running the full put state
+// machine (check -> store -> post) via a dedicated put-op object.
+int RGWBucketInstanceMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op,
+                                             string& entry,
+                                             RGWMetadataObject *obj,
+                                             RGWObjVersionTracker& objv_tracker,
+                                             optional_yield y,
+                                             const DoutPrefixProvider *dpp,
+                                             RGWMDLogSyncType type, bool from_remote_zone)
+{
+  RGWMetadataHandlerPut_BucketInstance put_op(svc.bucket->ctx(), this, op, entry, obj,
+                                              objv_tracker, y, type, from_remote_zone);
+  return do_put_operate(&put_op, dpp);
+}
+
+void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
+                               const RGWZone& zone,
+                               std::optional<uint32_t> shards,
+                               std::optional<rgw::BucketIndexType> type) {
+  layout.current_index.gen = 0;
+  layout.current_index.layout.normal.hash_type = rgw::BucketHashType::Mod;
+
+  layout.current_index.layout.type =
+    type.value_or(rgw::BucketIndexType::Normal);
+
+  if (shards) {
+    layout.current_index.layout.normal.num_shards = *shards;
+  } else if (cct->_conf->rgw_override_bucket_index_max_shards > 0) {
+    layout.current_index.layout.normal.num_shards =
+      cct->_conf->rgw_override_bucket_index_max_shards;
+  } else {
+    layout.current_index.layout.normal.num_shards =
+      zone.bucket_index_max_shards;
+  }
+
+  if (layout.current_index.layout.type == rgw::BucketIndexType::Normal) {
+    layout.logs.push_back(log_layout_from_index(0, layout.current_index));
+  }
+}
+
+// Pre-store validation/fixup for a bucket.instance put: reconcile the layout
+// for entries arriving from a remote zone, pick placement for new buckets,
+// preserve placement for existing ones, and carry over object versions.
+int RGWMetadataHandlerPut_BucketInstance::put_check(const DoutPrefixProvider *dpp)
+{
+  int ret;
+
+  RGWBucketCompleteInfo& bci = obj->get_bci();
+
+  RGWBucketInstanceMetadataObject *orig_obj = static_cast<RGWBucketInstanceMetadataObject *>(old_obj);
+
+  RGWBucketCompleteInfo *old_bci = (orig_obj ? &orig_obj->get_bci() : nullptr);
+
+  const bool exists = (!!orig_obj);
+
+  if (from_remote_zone) {
+    // don't sync bucket layout changes
+    if (!exists) {
+      // replace peer's layout with default-constructed, then apply our defaults
+      bci.info.layout = rgw::BucketLayout{};
+      init_default_bucket_layout(cct, bci.info.layout,
+                                bihandler->svc.zone->get_zone(),
+                                std::nullopt, std::nullopt);
+    } else {
+      bci.info.layout = old_bci->info.layout;
+    }
+  }
+
+  if (!exists || old_bci->info.bucket.bucket_id != bci.info.bucket.bucket_id) {
+    /* a new bucket, we need to select a new bucket placement for it */
+    string tenant_name;
+    string bucket_name;
+    string bucket_instance;
+    parse_bucket(entry, &tenant_name, &bucket_name, &bucket_instance);
+
+    RGWZonePlacementInfo rule_info;
+    bci.info.bucket.name = bucket_name;
+    bci.info.bucket.bucket_id = bucket_instance;
+    bci.info.bucket.tenant = tenant_name;
+    // if the sync module never writes data, don't require the zone to specify all placement targets
+    if (bihandler->svc.zone->sync_module_supports_writes()) {
+      ret = bihandler->svc.zone->select_bucket_location_by_rule(dpp, bci.info.placement_rule, &rule_info, y);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: select_bucket_placement() returned " << ret << dendl;
+        return ret;
+      }
+    }
+    // NOTE(review): when sync_module_supports_writes() is false, rule_info is
+    // left default-constructed here — presumably its default index_type is the
+    // intended value in that case; confirm against RGWZonePlacementInfo.
+    bci.info.layout.current_index.layout.type = rule_info.index_type;
+  } else {
+    /* existing bucket, keep its placement */
+    bci.info.bucket.explicit_placement = old_bci->info.bucket.explicit_placement;
+    bci.info.placement_rule = old_bci->info.placement_rule;
+  }
+
+  /* record the read version (if any), store the new version */
+  bci.info.objv_tracker.read_version = objv_tracker.read_version;
+  bci.info.objv_tracker.write_version = objv_tracker.write_version;
+
+  return 0;
+}
+
+int RGWMetadataHandlerPut_BucketInstance::put_checked(const DoutPrefixProvider *dpp)
+{
+  RGWBucketInstanceMetadataObject *orig_obj = static_cast<RGWBucketInstanceMetadataObject *>(old_obj);
+
+  RGWBucketInfo *orig_info = (orig_obj ? &orig_obj->get_bucket_info() : nullptr);
+
+  auto& info = obj->get_bucket_info();
+  auto mtime = obj->get_mtime();
+  auto pattrs = obj->get_pattrs();
+
+  RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+
+  return bihandler->svc.bucket->store_bucket_instance_info(ctx,
+                                                         entry,
+                                                         info,
+                                                         orig_info,
+                                                         false,
+                                                         mtime,
+                                                         pattrs,
+                                                        y,
+                                                         dpp);
+}
+
+// Post-store step: publish the stored version to the caller's tracker,
+// initialize the bucket index, and sync the bucket's lifecycle config with
+// the RGW_ATTR_LC attr (set when present, remove when absent).
+int RGWMetadataHandlerPut_BucketInstance::put_post(const DoutPrefixProvider *dpp)
+{
+  RGWBucketCompleteInfo& bci = obj->get_bci();
+
+  objv_tracker = bci.info.objv_tracker;
+
+  int ret = bihandler->svc.bi->init_index(dpp, bci.info, bci.info.layout.current_index);
+  if (ret < 0) {
+    return ret;
+  }
+
+  /* update lifecyle policy */
+  {
+    std::unique_ptr<rgw::sal::Bucket> bucket;
+    ret = bihandler->driver->get_bucket(nullptr, bci.info, &bucket);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << __func__ << " failed to get_bucket(...) for "
+                       << bci.info.bucket.name
+                       << dendl;
+      return ret;
+    }
+
+    auto lc = bihandler->driver->get_rgwlc();
+
+    auto lc_it = bci.attrs.find(RGW_ATTR_LC);
+    if (lc_it != bci.attrs.end()) {
+      ldpp_dout(dpp, 20) << "set lc config for " << bci.info.bucket.name << dendl;
+      ret = lc->set_bucket_config(bucket.get(), bci.attrs, nullptr);
+      if (ret < 0) {
+             ldpp_dout(dpp, 0) << __func__ << " failed to set lc config for "
+                       << bci.info.bucket.name
+                       << dendl;
+             return ret;
+      }
+
+    } else {
+      ldpp_dout(dpp, 20) << "remove lc config for " << bci.info.bucket.name << dendl;
+      ret = lc->remove_bucket_config(bucket.get(), bci.attrs, false /* cannot merge attrs */);
+      if (ret < 0) {
+             ldpp_dout(dpp, 0) << __func__ << " failed to remove lc config for "
+                       << bci.info.bucket.name
+                       << dendl;
+             return ret;
+      }
+    }
+  } /* update lc */
+
+  // STATUS_APPLIED tells the metadata sync machinery the entry took effect.
+  return STATUS_APPLIED;
+}
+
+// Archive-zone variant: bucket instances are never removed, so deleted data
+// remains recoverable from the archive.
+class RGWArchiveBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandler {
+public:
+  RGWArchiveBucketInstanceMetadataHandler(rgw::sal::Driver* driver)
+    : RGWBucketInstanceMetadataHandler(driver) {}
+
+  // N.B. replication of lifecycle policy relies on logic in RGWBucketInstanceMetadataHandler::do_put(...), override with caution
+
+  // Intentionally a no-op: log and report success without deleting anything.
+  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) override {
+    ldpp_dout(dpp, 0) << "SKIP: bucket instance removal is not allowed on archive zone: bucket.instance:" << entry << dendl;
+    return 0;
+  }
+};
+
+// Wire the borrowed service pointers into the controller; handlers are
+// attached later via init().
+RGWBucketCtl::RGWBucketCtl(RGWSI_Zone *zone_svc,
+                           RGWSI_Bucket *bucket_svc,
+                           RGWSI_Bucket_Sync *bucket_sync_svc,
+                           RGWSI_BucketIndex *bi_svc,
+                           RGWSI_User* user_svc)
+  : cct(zone_svc->ctx())
+{
+  svc.zone = zone_svc;
+  svc.bucket = bucket_svc;
+  svc.bucket_sync = bucket_sync_svc;
+  svc.bi = bi_svc;
+  svc.user = user_svc;
+}
+
+// Attach the metadata handlers and register a datalog filter so only buckets
+// that export data generate data-change entries.
+void RGWBucketCtl::init(RGWUserCtl *user_ctl,
+                        RGWBucketMetadataHandler *_bm_handler,
+                        RGWBucketInstanceMetadataHandler *_bmi_handler,
+                        RGWDataChangesLog *datalog,
+                        const DoutPrefixProvider *dpp)
+{
+  ctl.user = user_ctl;
+
+  bm_handler = _bm_handler;
+  bmi_handler = _bmi_handler;
+
+  bucket_be_handler = bm_handler->get_be_handler();
+  bi_be_handler = bmi_handler->get_be_handler();
+
+  // captures `this`; RGWBucketCtl must outlive the datalog's use of the filter
+  datalog->set_bucket_filter(
+    [this](const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp) {
+      return bucket_exports_data(bucket, y, dpp);
+    });
+}
+
+// Run f with a combined context: nests an entrypoint (EP) op inside a
+// bucket-instance (BI) op so both backends are available at once.
+int RGWBucketCtl::call(std::function<int(RGWSI_Bucket_X_Ctx& ctx)> f) {
+  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ep_ctx) {
+    return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& bi_ctx) {
+      RGWSI_Bucket_X_Ctx ctx{ep_ctx, bi_ctx};
+      return f(ctx);
+    });
+  });
+}
+
+// Read a bucket entrypoint inside an EP backend op, forwarding the optional
+// outputs (objv tracker, mtime, attrs, cache info) requested via params.
+int RGWBucketCtl::read_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                              RGWBucketEntryPoint *info,
+                                              optional_yield y, const DoutPrefixProvider *dpp,
+                                              const Bucket::GetParams& params)
+{
+  return bm_handler->call(params.bectx_params, [&](RGWSI_Bucket_EP_Ctx& ctx) {
+    return svc.bucket->read_bucket_entrypoint_info(ctx,
+                                                   RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+                                                   info,
+                                                   params.objv_tracker,
+                                                   params.mtime,
+                                                   params.attrs,
+                                                  y,
+                                                   dpp,
+                                                   params.cache_info,
+                                                   params.refresh_version);
+  });
+}
+
+// Store a bucket entrypoint inside an EP backend op; exclusivity, mtime,
+// attrs and version tracking come from params.
+int RGWBucketCtl::store_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                               RGWBucketEntryPoint& info,
+                                               optional_yield y,
+                                               const DoutPrefixProvider *dpp,
+                                               const Bucket::PutParams& params)
+{
+  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
+    return svc.bucket->store_bucket_entrypoint_info(ctx,
+                                                    RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+                                                    info,
+                                                    params.exclusive,
+                                                    params.mtime,
+                                                    params.attrs,
+                                                    params.objv_tracker,
+                                                    y,
+                                                    dpp);
+  });
+}
+
+// Remove a bucket entrypoint inside an EP backend op, honoring the caller's
+// version tracker for conditional removal.
+int RGWBucketCtl::remove_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                                optional_yield y,
+                                                const DoutPrefixProvider *dpp,
+                                                const Bucket::RemoveParams& params)
+{
+  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
+    return svc.bucket->remove_bucket_entrypoint_info(ctx,
+                                                     RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+                                                     params.objv_tracker,
+                                                    y,
+                                                     dpp);
+  });
+}
+
+// Read bucket-instance info inside a BI backend op. On success, copy the
+// info's version tracker out to params.objv_tracker if the caller asked.
+int RGWBucketCtl::read_bucket_instance_info(const rgw_bucket& bucket,
+                                            RGWBucketInfo *info,
+                                            optional_yield y,
+                                            const DoutPrefixProvider *dpp,
+                                            const BucketInstance::GetParams& params)
+{
+  int ret = bmi_handler->call(params.bectx_params, [&](RGWSI_Bucket_BI_Ctx& ctx) {
+    return svc.bucket->read_bucket_instance_info(ctx,
+                                                 RGWSI_Bucket::get_bi_meta_key(bucket),
+                                                 info,
+                                                 params.mtime,
+                                                 params.attrs,
+                                                y,
+                                                 dpp,
+                                                 params.cache_info,
+                                                 params.refresh_version);
+  });
+
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (params.objv_tracker) {
+    *params.objv_tracker = info->objv_tracker;
+  }
+
+  return 0;
+}
+
+// Read full bucket info. If the caller's rgw_bucket has no bucket_id, first
+// resolve it through the entrypoint (which names the current instance), then
+// read that instance's info inside a BI backend op.
+int RGWBucketCtl::read_bucket_info(const rgw_bucket& bucket,
+                                   RGWBucketInfo *info,
+                                   optional_yield y,
+                                   const DoutPrefixProvider *dpp,
+                                   const BucketInstance::GetParams& params,
+                                   RGWObjVersionTracker *ep_objv_tracker)
+{
+  const rgw_bucket *b = &bucket;
+
+  std::optional<RGWBucketEntryPoint> ep;
+
+  if (b->bucket_id.empty()) {
+    ep.emplace();
+
+    int r = read_bucket_entrypoint_info(*b, &(*ep), y, dpp, RGWBucketCtl::Bucket::GetParams()
+                                                    .set_bectx_params(params.bectx_params)
+                                                    .set_objv_tracker(ep_objv_tracker));
+    if (r < 0) {
+      return r;
+    }
+
+    // from here on, look up by the fully-qualified bucket the entrypoint names
+    b = &ep->bucket;
+  }
+
+  int ret = bmi_handler->call(params.bectx_params, [&](RGWSI_Bucket_BI_Ctx& ctx) {
+    return svc.bucket->read_bucket_instance_info(ctx,
+                                                 RGWSI_Bucket::get_bi_meta_key(*b),
+                                                 info,
+                                                 params.mtime,
+                                                 params.attrs,
+                                                y, dpp,
+                                                 params.cache_info,
+                                                 params.refresh_version);
+  });
+
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (params.objv_tracker) {
+    *params.objv_tracker = info->objv_tracker;
+  }
+
+  return 0;
+}
+
+// Store bucket-instance info within an already-open BI context. A caller-
+// supplied objv tracker overwrites the one embedded in info before the write.
+int RGWBucketCtl::do_store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+                                                const rgw_bucket& bucket,
+                                                RGWBucketInfo& info,
+                                                optional_yield y,
+                                                const DoutPrefixProvider *dpp,
+                                                const BucketInstance::PutParams& params)
+{
+  if (params.objv_tracker) {
+    info.objv_tracker = *params.objv_tracker;
+  }
+
+  return svc.bucket->store_bucket_instance_info(ctx,
+                                                RGWSI_Bucket::get_bi_meta_key(bucket),
+                                                info,
+                                                params.orig_info,
+                                                params.exclusive,
+                                                params.mtime,
+                                                params.attrs,
+                                                y,
+                                                dpp);
+}
+
+// Public wrapper: open a BI backend op and delegate to
+// do_store_bucket_instance_info().
+int RGWBucketCtl::store_bucket_instance_info(const rgw_bucket& bucket,
+                                            RGWBucketInfo& info,
+                                            optional_yield y,
+                                            const DoutPrefixProvider *dpp,
+                                            const BucketInstance::PutParams& params)
+{
+  return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& ctx) {
+    return do_store_bucket_instance_info(ctx, bucket, info, y, dpp, params);
+  });
+}
+
+// Remove bucket-instance info inside a BI backend op. A caller-supplied objv
+// tracker overwrites info's embedded tracker so the conditional delete uses it.
+int RGWBucketCtl::remove_bucket_instance_info(const rgw_bucket& bucket,
+                                              RGWBucketInfo& info,
+                                              optional_yield y,
+                                              const DoutPrefixProvider *dpp,
+                                              const BucketInstance::RemoveParams& params)
+{
+  if (params.objv_tracker) {
+    info.objv_tracker = *params.objv_tracker;
+  }
+
+  return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& ctx) {
+    return svc.bucket->remove_bucket_instance_info(ctx,
+                                                   RGWSI_Bucket::get_bi_meta_key(bucket),
+                                                   info,
+                                                   &info.objv_tracker,
+                                                   y,
+                                                   dpp);
+  });
+}
+
+// Store bucket-instance info and, when needed, (re)create the linked
+// entrypoint that points at it. The entrypoint ("head") is written when the
+// info had no instance object yet or the caller explicitly requested it.
+int RGWBucketCtl::do_store_linked_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+                                              RGWBucketInfo& info,
+                                              RGWBucketInfo *orig_info,
+                                              bool exclusive, real_time mtime,
+                                              obj_version *pep_objv,
+                                              map<string, bufferlist> *pattrs,
+                                              bool create_entry_point,
+                                             optional_yield y, const DoutPrefixProvider *dpp)
+{
+  bool create_head = !info.has_instance_obj || create_entry_point;
+
+  int ret = svc.bucket->store_bucket_instance_info(ctx.bi,
+                                                   RGWSI_Bucket::get_bi_meta_key(info.bucket),
+                                                   info,
+                                                   orig_info,
+                                                   exclusive,
+                                                   mtime, pattrs,
+                                                  y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (!create_head)
+    return 0; /* done! */
+
+  // build the entrypoint from the instance info and link it
+  RGWBucketEntryPoint entry_point;
+  entry_point.bucket = info.bucket;
+  entry_point.owner = info.owner;
+  entry_point.creation_time = info.creation_time;
+  entry_point.linked = true;
+  RGWObjVersionTracker ot;
+  // reuse the caller's entrypoint version if given, else mint a new one
+  // (and report it back through pep_objv when requested)
+  if (pep_objv && !pep_objv->tag.empty()) {
+    ot.write_version = *pep_objv;
+  } else {
+    ot.generate_new_write_ver(cct);
+    if (pep_objv) {
+      *pep_objv = ot.write_version;
+    }
+  }
+  ret = svc.bucket->store_bucket_entrypoint_info(ctx.ep,
+                                                 RGWSI_Bucket::get_entrypoint_meta_key(info.bucket),
+                                                 entry_point,
+                                                 exclusive,
+                                                 mtime,
+                                                 pattrs,
+                                                 &ot,
+                                                y,
+                                                 dpp);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+// Upgrade a legacy bucket whose info is embedded in the entrypoint: extract
+// the old info and rewrite it as a separate, linked bucket-instance object.
+// No-op (returns 0) if the bucket was already converted.
+int RGWBucketCtl::convert_old_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+                                          const rgw_bucket& bucket,
+                                          optional_yield y,
+                                          const DoutPrefixProvider *dpp)
+{
+  RGWBucketEntryPoint entry_point;
+  real_time ep_mtime;
+  RGWObjVersionTracker ot;
+  map<string, bufferlist> attrs;
+  RGWBucketInfo info;
+  auto cct = svc.bucket->ctx();
+
+  ldpp_dout(dpp, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket << dendl;
+
+  int ret = svc.bucket->read_bucket_entrypoint_info(ctx.ep,
+                                                    RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+                                                    &entry_point, &ot, &ep_mtime, &attrs, y, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket << dendl;
+    return ret;
+  }
+
+  if (!entry_point.has_bucket_info) {
+    /* already converted! */
+    return 0;
+  }
+
+  info = entry_point.old_bucket_info;
+
+  ot.generate_new_write_ver(cct);
+
+  ret = do_store_linked_bucket_info(ctx, info, nullptr, false, ep_mtime, &ot.write_version, &attrs, true, y, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Persist new attrs on a bucket instance, transparently converting legacy
+// (pre-instance-object) buckets first so there is an instance to write to.
+int RGWBucketCtl::set_bucket_instance_attrs(RGWBucketInfo& bucket_info,
+                                            map<string, bufferlist>& attrs,
+                                            RGWObjVersionTracker *objv_tracker,
+                                            optional_yield y,
+                                            const DoutPrefixProvider *dpp)
+{
+  return call([&](RGWSI_Bucket_X_Ctx& ctx) {
+    rgw_bucket& bucket = bucket_info.bucket;
+
+    if (!bucket_info.has_instance_obj) {
+      /* an old bucket object, need to convert it */
+        int ret = convert_old_bucket_info(ctx, bucket, y, dpp);
+        if (ret < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed converting old bucket info: " << ret << dendl;
+          return ret;
+        }
+    }
+
+    // orig_info is the same object: overwrite in place with the new attrs
+    return do_store_bucket_instance_info(ctx.bi,
+                                         bucket,
+                                         bucket_info,
+                                         y,
+                                         dpp,
+                                         BucketInstance::PutParams().set_attrs(&attrs)
+                                                                    .set_objv_tracker(objv_tracker)
+                                                                    .set_orig_info(&bucket_info));
+    });
+}
+
+
+// Link a bucket to a user inside an EP backend op; see do_link_bucket() for
+// the actual user-index update and (optional) entrypoint rewrite.
+int RGWBucketCtl::link_bucket(const rgw_user& user_id,
+                              const rgw_bucket& bucket,
+                              ceph::real_time creation_time,
+                             optional_yield y,
+                              const DoutPrefixProvider *dpp,
+                              bool update_entrypoint,
+                              rgw_ep_info *pinfo)
+{
+  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
+    return do_link_bucket(ctx, user_id, bucket, creation_time,
+                          update_entrypoint, pinfo, y, dpp);
+  });
+}
+
+// Add the bucket to the user's bucket directory and, unless told otherwise,
+// mark the entrypoint as linked to that user. On failure after the user
+// index was touched, roll back by unlinking (goto-based cleanup).
+int RGWBucketCtl::do_link_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+                                 const rgw_user& user_id,
+                                 const rgw_bucket& bucket,
+                                 ceph::real_time creation_time,
+                                 bool update_entrypoint,
+                                 rgw_ep_info *pinfo,
+                                optional_yield y,
+                                 const DoutPrefixProvider *dpp)
+{
+  int ret;
+
+  RGWBucketEntryPoint ep;
+  RGWObjVersionTracker ot;
+  RGWObjVersionTracker& rot = (pinfo) ? pinfo->ep_objv : ot;
+  map<string, bufferlist> attrs, *pattrs = nullptr;
+  string meta_key;
+
+  if (update_entrypoint) {
+    meta_key = RGWSI_Bucket::get_entrypoint_meta_key(bucket);
+    if (pinfo) {
+      // caller already read the entrypoint; reuse it and its attrs
+      ep = pinfo->ep;
+      pattrs = &pinfo->attrs;
+    } else {
+      // best-effort read: -ENOENT is fine (new entrypoint), other errors are
+      // logged but do not abort the link
+      ret = svc.bucket->read_bucket_entrypoint_info(ctx,
+                                                    meta_key,
+                                                    &ep, &rot,
+                                                    nullptr, &attrs,
+                                                    y, dpp);
+      if (ret < 0 && ret != -ENOENT) {
+        ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() returned: "
+                      << cpp_strerror(-ret) << dendl;
+      }
+      pattrs = &attrs;
+    }
+  }
+
+  ret = svc.user->add_bucket(dpp, user_id, bucket, creation_time, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: error adding bucket to user directory:"
+                 << " user=" << user_id
+                  << " bucket=" << bucket
+                 << " err=" << cpp_strerror(-ret)
+                 << dendl;
+    goto done_err;
+  }
+
+  if (!update_entrypoint)
+    return 0;
+
+  ep.linked = true;
+  ep.owner = user_id;
+  ep.bucket = bucket;
+  ret = svc.bucket->store_bucket_entrypoint_info(
+    ctx, meta_key, ep, false, real_time(), pattrs, &rot, y, dpp);
+  if (ret < 0)
+    goto done_err;
+
+  return 0;
+
+done_err:
+  // undo the user-directory addition; report the original error, not r
+  int r = do_unlink_bucket(ctx, user_id, bucket, true, y, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed unlinking bucket on error cleanup: "
+                           << cpp_strerror(-r) << dendl;
+  }
+  return ret;
+}
+
+// Public entry point for unlinking a bucket from a user: acquires a
+// bucket-entrypoint backend context from bm_handler and delegates the
+// actual work to do_unlink_bucket().
+int RGWBucketCtl::unlink_bucket(const rgw_user& user_id, const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp, bool update_entrypoint)
+{
+  auto op = [&](RGWSI_Bucket_EP_Ctx& ctx) {
+    return do_unlink_bucket(ctx, user_id, bucket, update_entrypoint, y, dpp);
+  };
+  return bm_handler->call(op);
+}
+
+// Remove `bucket` from `user_id`'s bucket directory and, when
+// `update_entrypoint` is set, clear the `linked` flag on the bucket's
+// entrypoint object.  Refuses (-EINVAL) when the entrypoint is linked to a
+// different owner.  Returns 0 on success or when there is nothing to do.
+int RGWBucketCtl::do_unlink_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+                                   const rgw_user& user_id,
+                                   const rgw_bucket& bucket,
+                                   bool update_entrypoint,
+                                  optional_yield y,
+                                   const DoutPrefixProvider *dpp)
+{
+  // directory removal failure is logged but deliberately not fatal; we
+  // still try to unlink the entrypoint below
+  int ret = svc.user->remove_bucket(dpp, user_id, bucket, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: error removing bucket from directory: "
+        << cpp_strerror(-ret)<< dendl;
+  }
+
+  if (!update_entrypoint)
+    return 0;
+
+  RGWBucketEntryPoint ep;
+  RGWObjVersionTracker ot;
+  map<string, bufferlist> attrs;
+  string meta_key = RGWSI_Bucket::get_entrypoint_meta_key(bucket);
+  ret = svc.bucket->read_bucket_entrypoint_info(ctx, meta_key, &ep, &ot, nullptr, &attrs, y, dpp);
+  // no entrypoint at all: nothing left to unlink
+  if (ret == -ENOENT)
+    return 0;
+  if (ret < 0)
+    return ret;
+
+  // already unlinked: done
+  if (!ep.linked)
+    return 0;
+
+  if (ep.owner != user_id) {
+    ldpp_dout(dpp, 0) << "bucket entry point user mismatch, can't unlink bucket: " << ep.owner << " != " << user_id << dendl;
+    return -EINVAL;
+  }
+
+  // rewrite the entrypoint with linked=false, guarded by the version we read
+  ep.linked = false;
+  return svc.bucket->store_bucket_entrypoint_info(ctx, meta_key, ep, false, real_time(), &attrs, &ot, y, dpp);
+}
+
+// TODO: remove RGWRados dependency for bucket listing
+int RGWBucketCtl::chown(rgw::sal::Driver* driver, rgw::sal::Bucket* bucket,
+                        const rgw_user& user_id, const std::string& display_name,
+                        const std::string& marker, optional_yield y, const DoutPrefixProvider *dpp)
+{
+  map<string, bool> common_prefixes;
+
+  rgw::sal::Bucket::ListParams params;
+  rgw::sal::Bucket::ListResults results;
+
+  params.list_versions = true;
+  params.allow_unordered = true;
+  params.marker = marker;
+
+  int count = 0;
+  int max_entries = 1000;
+
+  //Loop through objects and update object acls to point to bucket owner
+
+  do {
+    RGWObjectCtx obj_ctx(driver);
+    results.objs.clear();
+    int ret = bucket->list(dpp, params, max_entries, results, y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: list objects failed: " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    params.marker = results.next_marker;
+    count += results.objs.size();
+
+    for (const auto& obj : results.objs) {
+      std::unique_ptr<rgw::sal::Object> r_obj = bucket->get_object(obj.key);
+
+      ret = r_obj->get_obj_attrs(y, dpp);
+      if (ret < 0){
+        ldpp_dout(dpp, 0) << "ERROR: failed to read object " << obj.key.name << cpp_strerror(-ret) << dendl;
+        continue;
+      }
+      const auto& aiter = r_obj->get_attrs().find(RGW_ATTR_ACL);
+      if (aiter == r_obj->get_attrs().end()) {
+        ldpp_dout(dpp, 0) << "ERROR: no acls found for object " << obj.key.name << " .Continuing with next object." << dendl;
+        continue;
+      } else {
+        bufferlist& bl = aiter->second;
+        RGWAccessControlPolicy policy(driver->ctx());
+        ACLOwner owner;
+        try {
+          decode(policy, bl);
+          owner = policy.get_owner();
+        } catch (buffer::error& err) {
+          ldpp_dout(dpp, 0) << "ERROR: decode policy failed" << err.what()
+                                << dendl;
+          return -EIO;
+        }
+
+        //Get the ACL from the policy
+        RGWAccessControlList& acl = policy.get_acl();
+
+        //Remove grant that is set to old owner
+        acl.remove_canon_user_grant(owner.get_id());
+
+        //Create a grant and add grant
+        ACLGrant grant;
+        grant.set_canon(user_id, display_name, RGW_PERM_FULL_CONTROL);
+        acl.add_grant(&grant);
+
+        //Update the ACL owner to the new user
+        owner.set_id(user_id);
+        owner.set_name(display_name);
+        policy.set_owner(owner);
+
+        bl.clear();
+        encode(policy, bl);
+
+       r_obj->set_atomic();
+       map<string, bufferlist> attrs;
+       attrs[RGW_ATTR_ACL] = bl;
+       ret = r_obj->set_obj_attrs(dpp, &attrs, nullptr, y);
+        if (ret < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: modify attr failed " << cpp_strerror(-ret) << dendl;
+          return ret;
+        }
+      }
+    }
+    cerr << count << " objects processed in " << bucket
+        << ". Next marker " << params.marker.name << std::endl;
+  } while(results.is_truncated);
+  return 0;
+}
+
+int RGWBucketCtl::read_bucket_stats(const rgw_bucket& bucket,
+                                    RGWBucketEnt *result,
+                                    optional_yield y,
+                                    const DoutPrefixProvider *dpp)
+{
+  return call([&](RGWSI_Bucket_X_Ctx& ctx) {
+    return svc.bucket->read_bucket_stats(ctx, bucket, result, y, dpp);
+  });
+}
+
+// Read usage stats for every bucket in `m` (keyed by bucket name),
+// delegating to the bucket service under a combined backend context.
+int RGWBucketCtl::read_buckets_stats(map<string, RGWBucketEnt>& m,
+                                     optional_yield y, const DoutPrefixProvider *dpp)
+{
+  auto op = [&](RGWSI_Bucket_X_Ctx& ctx) {
+    return svc.bucket->read_buckets_stats(ctx, m, y, dpp);
+  };
+  return call(op);
+}
+
+int RGWBucketCtl::sync_user_stats(const DoutPrefixProvider *dpp, 
+                                  const rgw_user& user_id,
+                                  const RGWBucketInfo& bucket_info,
+                                 optional_yield y,
+                                  RGWBucketEnt* pent)
+{
+  RGWBucketEnt ent;
+  if (!pent) {
+    pent = &ent;
+  }
+  int r = svc.bi->read_stats(dpp, bucket_info, pent, null_yield);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << __func__ << "(): failed to read bucket stats (r=" << r << ")" << dendl;
+    return r;
+  }
+
+  return svc.user->flush_bucket_stats(dpp, user_id, *pent, y);
+}
+
+int RGWBucketCtl::get_sync_policy_handler(std::optional<rgw_zone_id> zone,
+                                          std::optional<rgw_bucket> bucket,
+                                          RGWBucketSyncPolicyHandlerRef *phandler,
+                                          optional_yield y,
+                                          const DoutPrefixProvider *dpp)
+{
+  int r = call([&](RGWSI_Bucket_X_Ctx& ctx) {
+    return svc.bucket_sync->get_policy_handler(ctx, zone, bucket, phandler, y, dpp);
+  });
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << __func__ << "(): failed to get policy handler for bucket=" << bucket << " (r=" << r << ")" << dendl;
+    return r;
+  }
+  return 0;
+}
+
+int RGWBucketCtl::bucket_exports_data(const rgw_bucket& bucket,
+                                      optional_yield y,
+                                      const DoutPrefixProvider *dpp)
+{
+
+  RGWBucketSyncPolicyHandlerRef handler;
+
+  int r = get_sync_policy_handler(std::nullopt, bucket, &handler, y, dpp);
+  if (r < 0) {
+    return r;
+  }
+
+  return handler->bucket_exports_data();
+}
+
+// Query the bucket's sync policy to determine whether this bucket is a
+// destination of sync data.  Returns the handler's answer, or a negative
+// errno if the policy handler could not be resolved.
+int RGWBucketCtl::bucket_imports_data(const rgw_bucket& bucket,
+                                      optional_yield y, const DoutPrefixProvider *dpp)
+{
+  RGWBucketSyncPolicyHandlerRef handler;
+  if (int r = get_sync_policy_handler(std::nullopt, bucket, &handler, y, dpp); r < 0) {
+    return r;
+  }
+  return handler->bucket_imports_data();
+}
+
+// Factory implementations: each returns a heap-allocated concrete handler
+// (ownership passes to the caller).
+
+// entrypoint metadata handler
+RGWBucketMetadataHandlerBase* RGWBucketMetaHandlerAllocator::alloc()
+{
+  return new RGWBucketMetadataHandler();
+}
+
+// bucket-instance metadata handler
+RGWBucketInstanceMetadataHandlerBase* RGWBucketInstanceMetaHandlerAllocator::alloc(rgw::sal::Driver* driver)
+{
+  return new RGWBucketInstanceMetadataHandler(driver);
+}
+
+// archive-zone variant of the entrypoint handler
+RGWBucketMetadataHandlerBase* RGWArchiveBucketMetaHandlerAllocator::alloc()
+{
+  return new RGWArchiveBucketMetadataHandler();
+}
+
+// archive-zone variant of the bucket-instance handler
+RGWBucketInstanceMetadataHandlerBase* RGWArchiveBucketInstanceMetaHandlerAllocator::alloc(rgw::sal::Driver* driver)
+{
+  return new RGWArchiveBucketInstanceMetadataHandler(driver);
+}
+
+
+// Produce sample instances (one populated, one default-constructed) for
+// encode/decode round-trip testing.  The heap-allocated entries are owned
+// by the caller.
+void RGWBucketEntryPoint::generate_test_instances(list<RGWBucketEntryPoint*>& o)
+{
+  RGWBucketEntryPoint *bp = new RGWBucketEntryPoint();
+  init_bucket(&bp->bucket, "tenant", "bucket", "pool", ".index.pool", "marker", "10");
+  bp->owner = "owner";
+  bp->creation_time = ceph::real_clock::from_ceph_timespec({ceph_le32(2), ceph_le32(3)});
+
+  o.push_back(bp);
+  o.push_back(new RGWBucketEntryPoint);
+}
+
+// JSON-serialize this entrypoint; must stay symmetric with decode_json().
+// "old_bucket_info" is emitted only when has_bucket_info is set.
+void RGWBucketEntryPoint::dump(Formatter *f) const
+{
+  encode_json("bucket", bucket, f);
+  encode_json("owner", owner, f);
+  // creation_time is serialized through utime_t for JSON compatibility
+  utime_t ut(creation_time);
+  encode_json("creation_time", ut, f);
+  encode_json("linked", linked, f);
+  encode_json("has_bucket_info", has_bucket_info, f);
+  if (has_bucket_info) {
+    encode_json("old_bucket_info", old_bucket_info, f);
+  }
+}
+
+// JSON-deserialize this entrypoint; mirror of dump().  "old_bucket_info"
+// is read only when the decoded has_bucket_info flag is set.
+void RGWBucketEntryPoint::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("bucket", bucket, obj);
+  JSONDecoder::decode_json("owner", owner, obj);
+  // creation_time travels as utime_t in JSON; convert back to real_time
+  utime_t ut;
+  JSONDecoder::decode_json("creation_time", ut, obj);
+  creation_time = ut.to_real_time();
+  JSONDecoder::decode_json("linked", linked, obj);
+  JSONDecoder::decode_json("has_bucket_info", has_bucket_info, obj);
+  if (has_bucket_info) {
+    JSONDecoder::decode_json("old_bucket_info", old_bucket_info, obj);
+  }
+}
+
diff --git a/src/rgw/driver/rados/rgw_bucket.h b/src/rgw/driver/rados/rgw_bucket.h
new file mode 100644 (file)
index 0000000..636a1f2
--- /dev/null
@@ -0,0 +1,765 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <memory>
+#include <variant>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_tools.h"
+#include "rgw_metadata.h"
+
+#include "rgw_string.h"
+#include "rgw_sal.h"
+
+#include "common/Formatter.h"
+#include "common/lru_map.h"
+#include "common/ceph_time.h"
+
+#include "rgw_formats.h"
+
+#include "services/svc_bucket_types.h"
+#include "services/svc_bucket_sync.h"
+
+// define as static when RGWBucket implementation completes
+extern void rgw_get_buckets_obj(const rgw_user& user_id, std::string& buckets_obj_id);
+
+class RGWSI_Meta;
+class RGWBucketMetadataHandler;
+class RGWBucketInstanceMetadataHandler;
+class RGWUserCtl;
+class RGWBucketCtl;
+class RGWZone;
+struct RGWZoneParams;
+
+extern void init_bucket(rgw_bucket *b, const char *t, const char *n, const char *dp, const char *ip, const char *m, const char *id);
+extern int rgw_bucket_parse_bucket_key(CephContext *cct, const std::string& key,
+                                       rgw_bucket* bucket, int *shard_id);
+
+extern std::string rgw_make_bucket_entry_name(const std::string& tenant_name,
+                                              const std::string& bucket_name);
+
+extern void rgw_parse_url_bucket(const std::string& bucket,
+                                 const std::string& auth_tenant,
+                                 std::string &tenant_name, std::string &bucket_name);
+
+// this is used as a filter to RGWRados::cls_bucket_list_ordered; it
+// conforms to the type RGWBucketListNameFilter
+extern bool rgw_bucket_object_check_filter(const std::string& oid);
+
+void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
+                               const RGWZone& zone,
+                               std::optional<uint32_t> shards,
+                               std::optional<rgw::BucketIndexType> type);
+
+// A bucket-instance record together with its stored xattrs, so both can be
+// round-tripped through JSON as one unit.
+struct RGWBucketCompleteInfo {
+  RGWBucketInfo info;                        // the bucket instance metadata
+  std::map<std::string, bufferlist> attrs;   // raw xattrs stored alongside it
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+
+// Metadata-object wrapper around a bucket entrypoint, carrying the object
+// version, mtime, and (optionally moved-in) xattrs for the generic
+// metadata machinery.
+class RGWBucketEntryMetadataObject : public RGWMetadataObject {
+  RGWBucketEntryPoint ep;
+  std::map<std::string, bufferlist> attrs;
+public:
+  // Construct with an empty attrs map; set_pattrs() exposes our own attrs
+  // member to the base class.
+  RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, const obj_version& v, real_time m) : ep(_ep) {
+    objv = v;
+    mtime = m;
+    set_pattrs (&attrs);
+  }
+  // Construct taking ownership of the caller's attrs map.
+  RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, const obj_version& v, real_time m, std::map<std::string, bufferlist>&& _attrs) :
+    ep(_ep), attrs(std::move(_attrs)) {
+    objv = v;
+    mtime = m;
+    set_pattrs (&attrs);
+  }
+
+  // Serialize the wrapped entrypoint only.
+  void dump(Formatter *f) const override {
+    ep.dump(f);
+  }
+
+  RGWBucketEntryPoint& get_ep() {
+    return ep;
+  }
+
+  std::map<std::string, bufferlist>& get_attrs() {
+    return attrs;
+  }
+};
+
+// Metadata-object wrapper around a complete bucket-instance record
+// (info + attrs), carrying object version and mtime.
+class RGWBucketInstanceMetadataObject : public RGWMetadataObject {
+  RGWBucketCompleteInfo info;
+public:
+  RGWBucketInstanceMetadataObject() {}
+  RGWBucketInstanceMetadataObject(RGWBucketCompleteInfo& i, const obj_version& v, real_time m) : info(i) {
+    objv = v;
+    mtime = m;
+  }
+
+  // Serialize the wrapped instance record (info + attrs).
+  void dump(Formatter *f) const override {
+    info.dump(f);
+  }
+
+  void decode_json(JSONObj *obj) {
+    info.decode_json(obj);
+  }
+
+  // Full record: bucket info plus attrs.
+  RGWBucketCompleteInfo& get_bci() {
+    return info;
+  }
+  // Just the bucket instance metadata.
+  RGWBucketInfo& get_bucket_info() {
+    return info.info;
+  }
+};
+
+/**
+ * store a list of the user's buckets, with associated functinos.
+ */
+class RGWUserBuckets {
+  std::map<std::string, RGWBucketEnt> buckets;
+
+public:
+  RGWUserBuckets() = default;
+  RGWUserBuckets(RGWUserBuckets&&) = default;
+
+  RGWUserBuckets& operator=(const RGWUserBuckets&) = default;
+
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(buckets, bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    decode(buckets, bl);
+  }
+  /**
+   * Check if the user owns a bucket by the given name.
+   */
+  bool owns(std::string& name) {
+    std::map<std::string, RGWBucketEnt>::iterator iter;
+    iter = buckets.find(name);
+    return (iter != buckets.end());
+  }
+
+  /**
+   * Add a (created) bucket to the user's bucket list.
+   */
+  void add(const RGWBucketEnt& bucket) {
+    buckets[bucket.bucket.name] = bucket;
+  }
+
+  /**
+   * Remove a bucket from the user's list by name.
+   */
+  void remove(const std::string& name) {
+    std::map<std::string, RGWBucketEnt>::iterator iter;
+    iter = buckets.find(name);
+    if (iter != buckets.end()) {
+      buckets.erase(iter);
+    }
+  }
+
+  /**
+   * Get the user's buckets as a map.
+   */
+  std::map<std::string, RGWBucketEnt>& get_buckets() { return buckets; }
+
+  /**
+   * Cleanup data structure
+   */
+  void clear() { buckets.clear(); }
+
+  size_t count() { return buckets.size(); }
+};
+WRITE_CLASS_ENCODER(RGWUserBuckets)
+
+// Interface for the bucket-entrypoint metadata handler.  Implementations
+// are constructed by an allocator and then wired to their services via
+// init() (two-phase initialization).
+class RGWBucketMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
+public:
+  virtual ~RGWBucketMetadataHandlerBase() {}
+  // Attach the bucket service and bucket controller after construction.
+  virtual void init(RGWSI_Bucket *bucket_svc,
+                    RGWBucketCtl *bucket_ctl) = 0;
+
+};
+
+// Interface for the bucket-instance metadata handler; like its entrypoint
+// counterpart, it is wired to its services via init() after construction.
+class RGWBucketInstanceMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
+public:
+  virtual ~RGWBucketInstanceMetadataHandlerBase() {}
+  // Attach the zone, bucket, and bucket-index services after construction.
+  virtual void init(RGWSI_Zone *zone_svc,
+                    RGWSI_Bucket *bucket_svc,
+                    RGWSI_BucketIndex *bi_svc) = 0;
+};
+
+// Allocator (factory) types that hide the concrete metadata-handler
+// implementations from callers.  Each alloc() returns a heap-allocated
+// handler; the caller takes ownership.
+
+class RGWBucketMetaHandlerAllocator {
+public:
+  static RGWBucketMetadataHandlerBase *alloc();
+};
+
+class RGWBucketInstanceMetaHandlerAllocator {
+public:
+  static RGWBucketInstanceMetadataHandlerBase *alloc(rgw::sal::Driver* driver);
+};
+
+// Archive-zone variants of the two allocators above.
+class RGWArchiveBucketMetaHandlerAllocator {
+public:
+  static RGWBucketMetadataHandlerBase *alloc();
+};
+
+class RGWArchiveBucketInstanceMetaHandlerAllocator {
+public:
+  static RGWBucketInstanceMetadataHandlerBase *alloc(rgw::sal::Driver* driver);
+};
+
+extern int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, rgw_obj_key& key);
+
+extern int rgw_object_get_attr(rgw::sal::Driver* driver, rgw::sal::Object* obj,
+                              const char* attr_name, bufferlist& out_bl,
+                              optional_yield y);
+
+extern void check_bad_user_bucket_mapping(rgw::sal::Driver* driver, rgw::sal::User* user, bool fix, optional_yield y, const DoutPrefixProvider *dpp);
+
+struct RGWBucketAdminOpState {
+  rgw_user uid;
+  std::string display_name;
+  std::string bucket_name;
+  std::string bucket_id;
+  std::string object_name;
+  std::string new_bucket_name;
+
+  bool list_buckets;
+  bool stat_buckets;
+  bool check_objects;
+  bool fix_index;
+  bool delete_child_objects;
+  bool bucket_stored;
+  bool sync_bucket;
+  int max_aio = 0;
+
+  std::unique_ptr<rgw::sal::Bucket>  bucket;
+
+  RGWQuotaInfo quota;
+  RGWRateLimitInfo ratelimit_info;
+
+  void set_fetch_stats(bool value) { stat_buckets = value; }
+  void set_check_objects(bool value) { check_objects = value; }
+  void set_fix_index(bool value) { fix_index = value; }
+  void set_delete_children(bool value) { delete_child_objects = value; }
+
+  void set_max_aio(int value) { max_aio = value; }
+
+  void set_user_id(const rgw_user& user_id) {
+    if (!user_id.empty())
+      uid = user_id;
+  }
+  void set_tenant(const std::string& tenant_str) {
+    uid.tenant = tenant_str;
+  }
+  void set_bucket_name(const std::string& bucket_str) {
+    bucket_name = bucket_str; 
+  }
+  void set_object(std::string& object_str) {
+    object_name = object_str;
+  }
+  void set_new_bucket_name(std::string& new_bucket_str) {
+    new_bucket_name = new_bucket_str;
+  }
+  void set_quota(RGWQuotaInfo& value) {
+    quota = value;
+  }
+  void set_bucket_ratelimit(RGWRateLimitInfo& value) {
+    ratelimit_info = value;
+  }
+
+
+  void set_sync_bucket(bool value) { sync_bucket = value; }
+
+  rgw_user& get_user_id() { return uid; }
+  std::string& get_user_display_name() { return display_name; }
+  std::string& get_bucket_name() { return bucket_name; }
+  std::string& get_object_name() { return object_name; }
+  std::string& get_tenant() { return uid.tenant; }
+
+  rgw::sal::Bucket* get_bucket() { return bucket.get(); }
+  void set_bucket(std::unique_ptr<rgw::sal::Bucket> _bucket) {
+    bucket = std::move(_bucket);
+    bucket_stored = true;
+  }
+
+  void set_bucket_id(const std::string& bi) {
+    bucket_id = bi;
+  }
+  const std::string& get_bucket_id() { return bucket_id; }
+
+  bool will_fetch_stats() { return stat_buckets; }
+  bool will_fix_index() { return fix_index; }
+  bool will_delete_children() { return delete_child_objects; }
+  bool will_check_objects() { return check_objects; }
+  bool is_user_op() { return !uid.empty(); }
+  bool is_system_op() { return uid.empty(); }
+  bool has_bucket_stored() { return bucket_stored; }
+  int get_max_aio() { return max_aio; }
+  bool will_sync_bucket() { return sync_bucket; }
+
+  RGWBucketAdminOpState() : list_buckets(false), stat_buckets(false), check_objects(false), 
+                            fix_index(false), delete_child_objects(false),
+                            bucket_stored(false), sync_bucket(true)  {}
+};
+
+
+/*
+ * A simple wrapper class for administrative bucket operations.
+ * Call init() with an op-state first; the other operations act on the
+ * bucket/user resolved there.
+ */
+class RGWBucket {
+  RGWUserBuckets buckets;
+  rgw::sal::Driver* driver;
+  RGWAccessHandle handle;
+
+  std::unique_ptr<rgw::sal::Bucket> bucket;  // resolved by init()
+  std::unique_ptr<rgw::sal::User> user;      // resolved by init()
+
+  bool failure;  // sticky error flag; reset via clear_failure()
+
+  RGWObjVersionTracker ep_objv; // entrypoint object version
+
+public:
+  RGWBucket() : driver(NULL), handle(NULL), failure(false) {}
+  // Resolve the bucket/user named in op_state; optional human-readable
+  // error in *err_msg.
+  int init(rgw::sal::Driver* storage, RGWBucketAdminOpState& op_state, optional_yield y,
+             const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  int check_bad_index_multipart(RGWBucketAdminOpState& op_state,
+              RGWFormatterFlusher& flusher,
+              const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  int check_object_index(const DoutPrefixProvider *dpp, 
+                         RGWBucketAdminOpState& op_state,
+                         RGWFormatterFlusher& flusher,
+                         optional_yield y,
+                         std::string *err_msg = NULL);
+
+  // Compare existing vs freshly calculated index stats.
+  int check_index(const DoutPrefixProvider *dpp,
+          RGWBucketAdminOpState& op_state,
+          std::map<RGWObjCategory, RGWStorageStats>& existing_stats,
+          std::map<RGWObjCategory, RGWStorageStats>& calculated_stats,
+          std::string *err_msg = NULL);
+
+  int chown(RGWBucketAdminOpState& op_state, const std::string& marker,
+            optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+  int set_quota(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  int remove_object(const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
+  int policy_bl_to_stream(bufferlist& bl, std::ostream& o);
+  int get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy, optional_yield y, const DoutPrefixProvider *dpp);
+  int sync(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  void clear_failure() { failure = false; }
+
+  const RGWBucketInfo& get_bucket_info() const { return bucket->get_info(); }
+};
+
+// Static facade for radosgw-admin bucket operations; each call constructs
+// what it needs from the driver and the supplied op-state.
+class RGWBucketAdminOp {
+public:
+  static int get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                  RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+  static int get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                  RGWAccessControlPolicy& policy, const DoutPrefixProvider *dpp);
+  static int dump_s3_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                  std::ostream& os, const DoutPrefixProvider *dpp);
+
+  static int unlink(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
+  static int link(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+  static int chown(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const std::string& marker, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  static int check_index(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                  RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp);
+
+  static int remove_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, optional_yield y,
+                          const DoutPrefixProvider *dpp, bool bypass_gc = false, bool keep_index_consistent = true);
+  static int remove_object(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
+  static int info(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp);
+  static int limit_check(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                        const std::list<std::string>& user_ids,
+                        RGWFormatterFlusher& flusher, optional_yield y,
+                         const DoutPrefixProvider *dpp,
+                        bool warnings_only = false);
+  static int set_quota(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
+
+  // maintenance/repair helpers
+  static int list_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                                 RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+
+  static int clear_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                                  RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+  static int fix_lc_shards(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                           RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+  static int fix_obj_expiry(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                           RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp, bool dry_run = false);
+
+  static int sync_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+};
+
+// Pre-read bucket entrypoint state handed into link/unlink operations so
+// the entrypoint does not need to be re-read.  `ep` and `attrs` are
+// *references* to caller-owned data; the caller must keep them alive for
+// the duration of the operation.
+struct rgw_ep_info {
+  RGWBucketEntryPoint &ep;
+  std::map<std::string, buffer::list>& attrs;
+  RGWObjVersionTracker ep_objv;  // version of the entrypoint object as read
+  rgw_ep_info(RGWBucketEntryPoint &ep, std::map<std::string, bufferlist>& attrs)
+    : ep(ep), attrs(attrs) {}
+};
+
+class RGWBucketCtl {
+  CephContext *cct;
+
+  struct Svc {
+    RGWSI_Zone *zone{nullptr};
+    RGWSI_Bucket *bucket{nullptr};
+    RGWSI_Bucket_Sync *bucket_sync{nullptr};
+    RGWSI_BucketIndex *bi{nullptr};
+    RGWSI_User* user = nullptr;
+  } svc;
+
+  struct Ctl {
+    RGWUserCtl *user{nullptr};
+  } ctl;
+
+  RGWBucketMetadataHandler *bm_handler;
+  RGWBucketInstanceMetadataHandler *bmi_handler;
+
+  RGWSI_Bucket_BE_Handler bucket_be_handler; /* bucket backend handler */
+  RGWSI_BucketInstance_BE_Handler bi_be_handler; /* bucket instance backend handler */
+
+  int call(std::function<int(RGWSI_Bucket_X_Ctx& ctx)> f);
+
+public:
+  RGWBucketCtl(RGWSI_Zone *zone_svc,
+               RGWSI_Bucket *bucket_svc,
+               RGWSI_Bucket_Sync *bucket_sync_svc,
+               RGWSI_BucketIndex *bi_svc,
+               RGWSI_User* user_svc);
+
+  void init(RGWUserCtl *user_ctl,
+            RGWBucketMetadataHandler *_bm_handler,
+            RGWBucketInstanceMetadataHandler *_bmi_handler,
+            RGWDataChangesLog *datalog,
+            const DoutPrefixProvider *dpp);
+
+  struct Bucket {
+    struct GetParams {
+      RGWObjVersionTracker *objv_tracker{nullptr};
+      real_time *mtime{nullptr};
+      std::map<std::string, bufferlist> *attrs{nullptr};
+      rgw_cache_entry_info *cache_info{nullptr};
+      boost::optional<obj_version> refresh_version;
+      std::optional<RGWSI_MetaBackend_CtxParams> bectx_params;
+
+      GetParams() {}
+
+      GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+
+      GetParams& set_mtime(ceph::real_time *_mtime) {
+        mtime = _mtime;
+        return *this;
+      }
+
+      GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+        attrs = _attrs;
+        return *this;
+      }
+
+      GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
+        cache_info = _cache_info;
+        return *this;
+      }
+
+      GetParams& set_refresh_version(const obj_version& _refresh_version) {
+        refresh_version = _refresh_version;
+        return *this;
+      }
+
+      GetParams& set_bectx_params(std::optional<RGWSI_MetaBackend_CtxParams> _bectx_params) {
+        bectx_params = _bectx_params;
+        return *this;
+      }
+    };
+
+    struct PutParams {
+      RGWObjVersionTracker *objv_tracker{nullptr};
+      ceph::real_time mtime;
+      bool exclusive{false};
+      std::map<std::string, bufferlist> *attrs{nullptr};
+
+      PutParams() {}
+
+      PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+
+      PutParams& set_mtime(const ceph::real_time& _mtime) {
+        mtime = _mtime;
+        return *this;
+      }
+
+      PutParams& set_exclusive(bool _exclusive) {
+        exclusive = _exclusive;
+        return *this;
+      }
+
+      PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+        attrs = _attrs;
+        return *this;
+      }
+    };
+
+    struct RemoveParams {
+      RGWObjVersionTracker *objv_tracker{nullptr};
+
+      RemoveParams() {}
+
+      RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+    };
+  };
+
+  struct BucketInstance {
+    struct GetParams {
+      real_time *mtime{nullptr};
+      std::map<std::string, bufferlist> *attrs{nullptr};
+      rgw_cache_entry_info *cache_info{nullptr};
+      boost::optional<obj_version> refresh_version;
+      RGWObjVersionTracker *objv_tracker{nullptr};
+      std::optional<RGWSI_MetaBackend_CtxParams> bectx_params;
+
+      GetParams() {}
+
+      GetParams& set_mtime(ceph::real_time *_mtime) {
+        mtime = _mtime;
+        return *this;
+      }
+
+      GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+        attrs = _attrs;
+        return *this;
+      }
+
+      GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
+        cache_info = _cache_info;
+        return *this;
+      }
+
+      GetParams& set_refresh_version(const obj_version& _refresh_version) {
+        refresh_version = _refresh_version;
+        return *this;
+      }
+
+      GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+
+      GetParams& set_bectx_params(std::optional<RGWSI_MetaBackend_CtxParams> _bectx_params) {
+        bectx_params = _bectx_params;
+        return *this;
+      }
+    };
+
+    struct PutParams {
+      std::optional<RGWBucketInfo *> orig_info; /* nullopt: orig_info was not fetched,
+                                                   nullptr: orig_info was not found (new bucket instance */
+      ceph::real_time mtime;
+      bool exclusive{false};
+      std::map<std::string, bufferlist> *attrs{nullptr};
+      RGWObjVersionTracker *objv_tracker{nullptr};
+
+      PutParams() {}
+
+      PutParams& set_orig_info(RGWBucketInfo *pinfo) {
+        orig_info = pinfo;
+        return *this;
+      }
+
+      PutParams& set_mtime(const ceph::real_time& _mtime) {
+        mtime = _mtime;
+        return *this;
+      }
+
+      PutParams& set_exclusive(bool _exclusive) {
+        exclusive = _exclusive;
+        return *this;
+      }
+
+      PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+        attrs = _attrs;
+        return *this;
+      }
+
+      PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+    };
+
+    struct RemoveParams {
+      RGWObjVersionTracker *objv_tracker{nullptr};
+
+      RemoveParams() {}
+
+      RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+    };
+  };
+
+  /* bucket entrypoint */
+  // Read the bucket entrypoint record into *info.
+  int read_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                  RGWBucketEntryPoint *info,
+                                  optional_yield y,
+                                  const DoutPrefixProvider *dpp,
+                                  const Bucket::GetParams& params = {});
+  // Write the bucket entrypoint record (creation/overwrite behavior governed by params).
+  int store_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                   RGWBucketEntryPoint& info,
+                                   optional_yield y,
+                                   const DoutPrefixProvider *dpp,
+                                   const Bucket::PutParams& params = {});
+  // Remove the bucket entrypoint record.
+  int remove_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                    optional_yield y,
+                                    const DoutPrefixProvider *dpp,
+                                    const Bucket::RemoveParams& params = {});
+
+  /* bucket instance */
+  // Read the bucket instance metadata into *info.
+  int read_bucket_instance_info(const rgw_bucket& bucket,
+                                  RGWBucketInfo *info,
+                                  optional_yield y,
+                                  const DoutPrefixProvider *dpp,
+                                  const BucketInstance::GetParams& params = {});
+  // Write the bucket instance metadata.
+  int store_bucket_instance_info(const rgw_bucket& bucket,
+                                 RGWBucketInfo& info,
+                                 optional_yield y,
+                                 const DoutPrefixProvider *dpp,
+                                 const BucketInstance::PutParams& params = {});
+  // Remove the bucket instance metadata.
+  int remove_bucket_instance_info(const rgw_bucket& bucket,
+                                  RGWBucketInfo& info,
+                                  optional_yield y,
+                                  const DoutPrefixProvider *dpp,
+                                  const BucketInstance::RemoveParams& params = {});
+
+  /*
+   * bucket_id may or may not be provided
+   *
+   * ep_objv_tracker might not be populated even if provided. Will only be set if entrypoint is read
+   * (that is: if bucket_id is empty).
+   */
+  int read_bucket_info(const rgw_bucket& bucket,
+                       RGWBucketInfo *info,
+                       optional_yield y,
+                       const DoutPrefixProvider *dpp,
+                       const BucketInstance::GetParams& params = {},
+                      RGWObjVersionTracker *ep_objv_tracker = nullptr);
+
+
+  // Replace the instance's attributes; objv_tracker guards against racing writers.
+  int set_bucket_instance_attrs(RGWBucketInfo& bucket_info,
+                                std::map<std::string, bufferlist>& attrs,
+                                RGWObjVersionTracker *objv_tracker,
+                                optional_yield y,
+                                const DoutPrefixProvider *dpp);
+
+  /* user/bucket */
+  // Associate the bucket with user_id; optionally also updates the entrypoint.
+  int link_bucket(const rgw_user& user_id,
+                  const rgw_bucket& bucket,
+                  ceph::real_time creation_time,
+                 optional_yield y,
+                  const DoutPrefixProvider *dpp,
+                  bool update_entrypoint = true,
+                  rgw_ep_info *pinfo = nullptr);
+
+  // Drop the user->bucket association.
+  int unlink_bucket(const rgw_user& user_id,
+                    const rgw_bucket& bucket,
+                   optional_yield y,
+                    const DoutPrefixProvider *dpp,
+                    bool update_entrypoint = true);
+
+  // Transfer bucket ownership to user_id. 'marker' presumably resumes a
+  // partially-completed chown -- confirm against the implementation.
+  int chown(rgw::sal::Driver* driver, rgw::sal::Bucket* bucket,
+            const rgw_user& user_id, const std::string& display_name,
+            const std::string& marker, optional_yield y, const DoutPrefixProvider *dpp);
+
+  // Fill stats for the buckets named by the keys of m.
+  int read_buckets_stats(std::map<std::string, RGWBucketEnt>& m,
+                         optional_yield y,
+                         const DoutPrefixProvider *dpp);
+
+  // Fill stats for a single bucket into *result.
+  int read_bucket_stats(const rgw_bucket& bucket,
+                        RGWBucketEnt *result,
+                        optional_yield y,
+                        const DoutPrefixProvider *dpp);
+
+  /* quota related */
+  // Propagate this bucket's usage into the owning user's stats; pent, if
+  // non-null, receives the bucket entry.
+  int sync_user_stats(const DoutPrefixProvider *dpp, 
+                      const rgw_user& user_id, const RGWBucketInfo& bucket_info,
+                     optional_yield y,
+                      RGWBucketEnt* pent);
+
+  /* bucket sync */
+  // Build a sync-policy handler for the given (zone, bucket); either may be unset.
+  int get_sync_policy_handler(std::optional<rgw_zone_id> zone,
+                              std::optional<rgw_bucket> bucket,
+                             RGWBucketSyncPolicyHandlerRef *phandler,
+                             optional_yield y,
+                              const DoutPrefixProvider *dpp);
+  // Whether other zones sync data out of this bucket (int return despite the
+  // predicate-style name -- negative values signal errors).
+  int bucket_exports_data(const rgw_bucket& bucket,
+                          optional_yield y,
+                          const DoutPrefixProvider *dpp);
+  // Whether this bucket pulls data in from other zones (same convention).
+  int bucket_imports_data(const rgw_bucket& bucket,
+                          optional_yield y,
+                          const DoutPrefixProvider *dpp);
+
+private:
+  // Upgrade a legacy-format bucket record.
+  int convert_old_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+                              const rgw_bucket& bucket,
+                              optional_yield y,
+                              const DoutPrefixProvider *dpp);
+
+  // Low-level instance write used by the public store path.
+  int do_store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+                                    const rgw_bucket& bucket,
+                                    RGWBucketInfo& info,
+                                    optional_yield y,
+                                    const DoutPrefixProvider *dpp,
+                                    const BucketInstance::PutParams& params);
+
+  // Store instance info and, when create_entry_point is set, the entrypoint too.
+  int do_store_linked_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+                                  RGWBucketInfo& info,
+                                  RGWBucketInfo *orig_info,
+                                  bool exclusive, real_time mtime,
+                                  obj_version *pep_objv,
+                                  std::map<std::string, bufferlist> *pattrs,
+                                  bool create_entry_point,
+                                 optional_yield,
+                                  const DoutPrefixProvider *dpp);
+
+  // Implementation behind link_bucket().
+  int do_link_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+                     const rgw_user& user,
+                     const rgw_bucket& bucket,
+                     ceph::real_time creation_time,
+                     bool update_entrypoint,
+                     rgw_ep_info *pinfo,
+                    optional_yield y,
+                     const DoutPrefixProvider *dpp);
+
+  // Implementation behind unlink_bucket().
+  int do_unlink_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+                       const rgw_user& user_id,
+                       const rgw_bucket& bucket,
+                       bool update_entrypoint,
+                      optional_yield y,
+                       const DoutPrefixProvider *dpp);
+
+};
+
+// Locate a bucket by its id, filling *bucket_out on success. 'marker' is
+// presumably a resume marker for the underlying listing -- confirm in the impl.
+bool rgw_find_bucket_by_id(const DoutPrefixProvider *dpp, CephContext *cct, rgw::sal::Driver* driver, const std::string& marker,
+                           const std::string& bucket_id, rgw_bucket* bucket_out);
diff --git a/src/rgw/driver/rados/rgw_bucket_sync.cc b/src/rgw/driver/rados/rgw_bucket_sync.cc
new file mode 100644 (file)
index 0000000..5fd81c5
--- /dev/null
@@ -0,0 +1,941 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_bucket_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_bucket_sync.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Render a sync bucket entity as "{b=<bucket>,z=<zone>,az=<all_zones>}".
+ostream& operator<<(ostream& os, const rgw_sync_bucket_entity& e) {
+  os << "{b=" << rgw_sync_bucket_entities::bucket_key(e.bucket);
+  os << ",z=" << e.zone.value_or(rgw_zone_id());
+  os << ",az=" << static_cast<int>(e.all_zones) << "}";
+  return os;
+}
+
+// Render a single sync pipe as "{s=<source>,d=<dest>}".
+ostream& operator<<(ostream& os, const rgw_sync_bucket_pipe& pipe) {
+  os << "{s=" << pipe.source;
+  os << ",d=" << pipe.dest;
+  os << "}";
+  return os;
+}
+
+// Render a (possibly multi-zone) entity set as "{b=<bucket>,z=<zone set>}".
+ostream& operator<<(ostream& os, const rgw_sync_bucket_entities& e) {
+  os << "{b=" << rgw_sync_bucket_entities::bucket_key(e.bucket) << ",z=" << e.zones.value_or(std::set<rgw_zone_id>()) << "}";
+  return os;
+}
+
+// Render a pipe set as "{id=<id>,s=<source set>,d=<dest set>}".
+ostream& operator<<(ostream& os, const rgw_sync_bucket_pipes& pipe) {
+  os << "{id=" << pipe.id << ",s=" << pipe.source << ",d=" << pipe.dest << "}";
+  return os;
+}
+
+static std::vector<rgw_sync_bucket_pipe> filter_relevant_pipes(const std::vector<rgw_sync_bucket_pipes>& pipes,
+                                                               const rgw_zone_id& source_zone,
+                                                               const rgw_zone_id& dest_zone)
+{
+  std::vector<rgw_sync_bucket_pipe> relevant_pipes;
+  for (auto& p : pipes) {
+    if (p.source.match_zone(source_zone) &&
+        p.dest.match_zone(dest_zone)) {
+      for (auto pipe : p.expand()) {
+        pipe.source.apply_zone(source_zone);
+        pipe.dest.apply_zone(dest_zone);
+        relevant_pipes.push_back(pipe);
+      }
+    }
+  }
+
+  return relevant_pipes;
+}
+
+// An unnamed bucket stands for "any bucket" in pipe matching.
+static bool is_wildcard_bucket(const rgw_bucket& bucket)
+{
+  return bucket.name.empty();
+}
+
+// Emit this pipe map (zone, bucket key, source pipes, dest pipes) as JSON.
+void rgw_sync_group_pipe_map::dump(ceph::Formatter *f) const
+{
+  encode_json("zone", zone.id, f);
+  encode_json("buckets", rgw_sync_bucket_entities::bucket_key(bucket), f);
+  encode_json("sources", sources, f);
+  encode_json("dests", dests, f);
+}
+
+
+// Shared worker for try_add_source()/try_add_dest(): expand the pipes relevant
+// to {source_zone -> dest_zone} and insert each into *pipe_map, keyed by the
+// entity produced by call_filter_cb. filter_cb(source_zone, src_bkt, dest_zone,
+// dst_bkt) decides admission; the first call with nullopt buckets is a cheap
+// zone-level pre-check before expanding anything.
+template <typename CB1, typename CB2>
+void rgw_sync_group_pipe_map::try_add_to_pipe_map(const rgw_zone_id& source_zone,
+                                                  const rgw_zone_id& dest_zone,
+                                                  const std::vector<rgw_sync_bucket_pipes>& pipes,
+                                                  zb_pipe_map_t *pipe_map,
+                                                  CB1 filter_cb,
+                                                  CB2 call_filter_cb)
+{
+  if (!filter_cb(source_zone, nullopt, dest_zone, nullopt)) {
+    return;
+  }
+  auto relevant_pipes = filter_relevant_pipes(pipes, source_zone, dest_zone);
+
+  for (auto& pipe : relevant_pipes) {
+    rgw_sync_bucket_entity zb;
+    // call_filter_cb both builds the map key (*zb) and re-checks the filter
+    // with the concrete buckets of this expanded pipe.
+    if (!call_filter_cb(pipe, &zb)) {
+      continue;
+    }
+    pipe_map->insert(make_pair(zb, pipe));
+  }
+}
+          
+// Add pipes where some other zone (source_zone) feeds us (dest_zone) into the
+// 'sources' map; keys are {source_zone, source bucket} entities.
+template <typename CB>
+void rgw_sync_group_pipe_map::try_add_source(const rgw_zone_id& source_zone,
+                  const rgw_zone_id& dest_zone,
+                  const std::vector<rgw_sync_bucket_pipes>& pipes,
+                  CB filter_cb)
+{
+  return try_add_to_pipe_map(source_zone, dest_zone, pipes,
+                             &sources,
+                             filter_cb,
+                             [&](const rgw_sync_bucket_pipe& pipe, rgw_sync_bucket_entity *zb) {
+                             *zb = rgw_sync_bucket_entity{source_zone, pipe.source.get_bucket()};
+                             return filter_cb(source_zone, zb->bucket, dest_zone, pipe.dest.get_bucket());
+                             });
+}
+
+// Mirror of try_add_source(): add pipes where we (source_zone) feed another
+// zone (dest_zone) into the 'dests' map, keyed by {dest_zone, dest bucket}.
+template <typename CB>
+void rgw_sync_group_pipe_map::try_add_dest(const rgw_zone_id& source_zone,
+                                           const rgw_zone_id& dest_zone,
+                                           const std::vector<rgw_sync_bucket_pipes>& pipes,
+                                           CB filter_cb)
+{
+  return try_add_to_pipe_map(source_zone, dest_zone, pipes,
+                             &dests,
+                             filter_cb,
+                             [&](const rgw_sync_bucket_pipe& pipe, rgw_sync_bucket_entity *zb) {
+                             *zb = rgw_sync_bucket_entity{dest_zone, pipe.dest.get_bucket()};
+                             return filter_cb(source_zone, pipe.source.get_bucket(), dest_zone, zb->bucket);
+                             });
+}
+
+using zb_pipe_map_t = rgw_sync_group_pipe_map::zb_pipe_map_t;
+
+// Return the range of pipes in m for {zone, b}. If nothing matches the
+// concrete bucket, fall back to the zone's wildcard entry (empty bucket).
+pair<zb_pipe_map_t::const_iterator, zb_pipe_map_t::const_iterator> rgw_sync_group_pipe_map::find_pipes(const zb_pipe_map_t& m,
+                                                                                                       const rgw_zone_id& zone,
+                                                                                                       std::optional<rgw_bucket> b) const
+{
+  if (!b) {
+    // no bucket requested: match only the wildcard entry for this zone
+    return m.equal_range(rgw_sync_bucket_entity{zone, rgw_bucket()});
+  }
+
+  auto zb = rgw_sync_bucket_entity{zone, *b};
+
+  auto range = m.equal_range(zb);
+  if (range.first == range.second &&
+      !is_wildcard_bucket(*b)) {
+    /* couldn't find the specific bucket, try to find by wildcard */
+    zb.bucket = rgw_bucket();
+    range = m.equal_range(zb);
+  }
+
+  return range;
+}
+
+
+template <typename CB>
+void rgw_sync_group_pipe_map::init(const DoutPrefixProvider *dpp,
+                                   CephContext *cct,
+                                   const rgw_zone_id& _zone,
+                                   std::optional<rgw_bucket> _bucket,
+                                   const rgw_sync_policy_group& group,
+                                   rgw_sync_data_flow_group *_default_flow,
+                                   std::set<rgw_zone_id> *_pall_zones,
+                                   CB filter_cb) {
+  zone = _zone;
+  bucket = _bucket;
+  default_flow = _default_flow;
+  pall_zones = _pall_zones;
+
+  rgw_sync_bucket_entity zb(zone, bucket);
+
+  status = group.status;
+
+  std::vector<rgw_sync_bucket_pipes> zone_pipes;
+
+  string bucket_key = (bucket ? bucket->get_key() : "*");
+
+  /* only look at pipes that touch the specific zone and bucket */
+  for (auto& pipe : group.pipes) {
+    if (pipe.contains_zone_bucket(zone, bucket)) {
+      ldpp_dout(dpp, 20) << __func__ << "(): pipe_map (zone=" << zone << " bucket=" << bucket_key << "): adding potential pipe: " << pipe << dendl;
+      zone_pipes.push_back(pipe);
+    }
+  }
+
+  const rgw_sync_data_flow_group *pflow;
+
+  if (!group.data_flow.empty()) {
+    pflow = &group.data_flow;
+  } else {
+    if (!default_flow) {
+      return;
+    }
+    pflow = default_flow;
+  }
+
+  auto& flow = *pflow;
+
+  pall_zones->insert(zone);
+
+  /* symmetrical */
+  for (auto& symmetrical_group : flow.symmetrical) {
+    if (symmetrical_group.zones.find(zone) != symmetrical_group.zones.end()) {
+      for (auto& z : symmetrical_group.zones) {
+        if (z != zone) {
+          pall_zones->insert(z);
+          try_add_source(z, zone, zone_pipes, filter_cb);
+          try_add_dest(zone, z, zone_pipes, filter_cb);
+        }
+      }
+    }
+  }
+
+  /* directional */
+  for (auto& rule : flow.directional) {
+    if (rule.source_zone == zone) {
+      pall_zones->insert(rule.dest_zone);
+      try_add_dest(zone, rule.dest_zone, zone_pipes, filter_cb);
+    } else if (rule.dest_zone == zone) {
+      pall_zones->insert(rule.source_zone);
+      try_add_source(rule.source_zone, zone, zone_pipes, filter_cb);
+    }
+  }
+}
+
+/*
+ * find all relevant pipes in our zone that match {dest_bucket} <- {source_zone, source_bucket}
+ */
+vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_source_pipes(const rgw_zone_id& source_zone,
+                                                                        std::optional<rgw_bucket> source_bucket,
+                                                                        std::optional<rgw_bucket> dest_bucket) const {
+  vector<rgw_sync_bucket_pipe> result;
+
+  auto range = find_pipes(sources, source_zone, source_bucket);
+
+  for (auto iter = range.first; iter != range.second; ++iter) {
+    auto pipe = iter->second;
+    if (pipe.dest.match_bucket(dest_bucket)) {
+      result.push_back(pipe);
+    }
+  }
+  return result;
+}
+
+/*
+ * find all relevant pipes in other zones that pull from a specific
+ * source bucket in our zone {source_bucket} -> {dest_zone, dest_bucket}
+ */
+vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_dest_pipes(std::optional<rgw_bucket> source_bucket,
+                                                                      const rgw_zone_id& dest_zone,
+                                                                      std::optional<rgw_bucket> dest_bucket) const {
+  vector<rgw_sync_bucket_pipe> result;
+
+  auto range = find_pipes(dests, dest_zone, dest_bucket);
+
+  for (auto iter = range.first; iter != range.second; ++iter) {
+    auto pipe = iter->second;
+    if (pipe.source.match_bucket(source_bucket)) {
+      result.push_back(pipe);
+    }
+  }
+
+  return result;
+}
+
+/*
+ * find all relevant pipes from {source_zone, source_bucket} -> {dest_zone, dest_bucket}
+ */
+vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_pipes(const rgw_zone_id& source_zone,
+                                                                 std::optional<rgw_bucket> source_bucket,
+                                                                 const rgw_zone_id& dest_zone,
+                                                                 std::optional<rgw_bucket> dest_bucket) const {
+  if (dest_zone == zone) {
+    return find_source_pipes(source_zone, source_bucket, dest_bucket);
+  }
+
+  if (source_zone == zone) {
+    return find_dest_pipes(source_bucket, dest_zone, dest_bucket);
+  }
+
+  return vector<rgw_sync_bucket_pipe>();
+}
+
+// Register a pipe in this rule set and index it by its source prefix filter
+// and by each of its source tag filters ("key=value").
+void RGWBucketSyncFlowManager::pipe_rules::insert(const rgw_sync_bucket_pipe& pipe)
+{
+  pipes.push_back(pipe);
+
+  // pipes is a std::list (per prefix_refs holding stable pointers), so the
+  // address of the stored element can be kept in the index maps
+  auto ppipe = &pipes.back();
+  auto prefix = ppipe->params.source.filter.prefix.value_or(string());
+
+  prefix_refs.insert(make_pair(prefix, ppipe));
+
+  for (auto& t : ppipe->params.source.filter.tags) {
+    string tag = t.key + "=" + t.value;
+    auto titer = tag_refs.find(tag);
+    // NOTE(review): the else branch overwrites tag_refs[tag] even when an
+    // existing entry has higher priority, which makes the priority comparison
+    // above moot -- looks like the condition was meant to gate the overwrite;
+    // confirm intent upstream before changing.
+    if (titer != tag_refs.end() &&
+        ppipe->params.priority > titer->second->params.priority) {
+      titer->second = ppipe;
+    } else {
+      tag_refs[tag] = ppipe;
+    }
+  }
+}
+
+bool RGWBucketSyncFlowManager::pipe_rules::find_basic_info_without_tags(const rgw_obj_key& key,
+                                                                        std::optional<rgw_user> *user,
+                                                                        std::optional<rgw_user> *acl_translation_owner,
+                                                                        std::optional<string> *storage_class,
+                                                                        rgw_sync_pipe_params::Mode *mode,
+                                                                        bool *need_more_info) const
+{
+  std::optional<string> owner;
+
+  *need_more_info = false;
+
+  if (prefix_refs.empty()) {
+    return false;
+  }
+
+  auto end = prefix_refs.upper_bound(key.name);
+  auto iter = end;
+  if (iter != prefix_refs.begin()) {
+    --iter;
+  }
+  if (iter == prefix_refs.end()) {
+    return false;
+  }
+
+  if (iter != prefix_refs.begin()) {
+    iter = prefix_refs.find(iter->first); /* prefix_refs is multimap, find first element
+                                             holding that key */
+  }
+
+  std::vector<decltype(iter)> iters;
+
+  std::optional<int> priority;
+
+  for (; iter != end; ++iter) {
+    auto& prefix = iter->first;
+    if (!boost::starts_with(key.name, prefix)) {
+      continue;
+    }
+
+    auto& rule_params = iter->second->params;
+    auto& filter = rule_params.source.filter;
+
+    if (rule_params.priority > priority) {
+      priority = rule_params.priority;
+
+      if (!filter.has_tags()) {
+        iters.clear();
+      }
+      iters.push_back(iter);
+
+      *need_more_info = filter.has_tags(); /* if highest priority filter has tags, then
+                                              we can't be sure if it would be used.
+                                              We need to first read the info from the source object */
+    }
+  }
+
+  if (iters.empty()) {
+    return false;
+  }
+
+  std::optional<rgw_user> _user;
+  std::optional<rgw_sync_pipe_acl_translation> _acl_translation;
+  std::optional<string> _storage_class;
+  rgw_sync_pipe_params::Mode _mode{rgw_sync_pipe_params::Mode::MODE_SYSTEM};
+
+  // make sure all params are the same by saving the first one
+  // encountered and comparing all subsequent to it
+  bool first_iter = true;
+  for (auto& iter : iters) {
+    const rgw_sync_pipe_params& rule_params = iter->second->params;
+    if (first_iter) {
+      _user = rule_params.user;
+      _acl_translation = rule_params.dest.acl_translation;
+      _storage_class = rule_params.dest.storage_class;
+      _mode = rule_params.mode;
+      first_iter = false;
+    } else {
+      // note: three of these == operators are comparing std::optional
+      // against std::optional; as one would expect they are equal a)
+      // if both do not contain values or b) if both do and those
+      // contained values are the same
+      const bool conflict =
+       !(_user == rule_params.user &&
+         _acl_translation == rule_params.dest.acl_translation &&
+         _storage_class == rule_params.dest.storage_class &&
+         _mode == rule_params.mode);
+      if (conflict) {
+       *need_more_info = true;
+       return false;
+      }
+    }
+  }
+
+  *user = _user;
+  if (_acl_translation) {
+    *acl_translation_owner = _acl_translation->owner;
+  }
+  *storage_class = _storage_class;
+  *mode = _mode;
+
+  return true;
+}
+
+bool RGWBucketSyncFlowManager::pipe_rules::find_obj_params(const rgw_obj_key& key,
+                                                           const RGWObjTags::tag_map_t& tags,
+                                                           rgw_sync_pipe_params *params) const
+{
+  if (prefix_refs.empty()) {
+    return false;
+  }
+
+  auto iter = prefix_refs.upper_bound(key.name);
+  if (iter != prefix_refs.begin()) {
+    --iter;
+  }
+  if (iter == prefix_refs.end()) {
+    return false;
+  }
+
+  auto end = prefix_refs.upper_bound(key.name);
+  auto max = end;
+
+  std::optional<int> priority;
+
+  for (; iter != end; ++iter) {
+    /* NOTE: this is not the most efficient way to do it,
+     * a trie data structure would be better
+     */
+    auto& prefix = iter->first;
+    if (!boost::starts_with(key.name, prefix)) {
+      continue;
+    }
+
+    auto& rule_params = iter->second->params;
+    auto& filter = rule_params.source.filter;
+
+    if (!filter.check_tags(tags)) {
+      continue;
+    }
+
+    if (rule_params.priority > priority) {
+      priority = rule_params.priority;
+      max = iter;
+    }
+  }
+
+  if (max == end) {
+    return false;
+  }
+
+  *params = max->second->params;
+  return true;
+}
+
+/*
+ * return either the current prefix for s, or the next one if s is not within a prefix
+ */
+
+// Return the entry whose prefix covers s, or the next prefix after s when no
+// entry covers it; end() only when the map is empty.
+RGWBucketSyncFlowManager::pipe_rules::prefix_map_t::const_iterator RGWBucketSyncFlowManager::pipe_rules::prefix_search(const std::string& s) const
+{
+  if (prefix_refs.empty()) {
+    return prefix_refs.end();
+  }
+  auto next = prefix_refs.upper_bound(s);
+  auto iter = next;
+  if (iter != prefix_refs.begin()) {
+    --iter;
+  }
+  // if the preceding prefix doesn't actually prefix s, report the next one
+  if (!boost::starts_with(s, iter->first)) {
+    return next;
+  }
+
+  return iter;
+}
+
+// Add a pipe to this set: index it by id, fold it into the shared rule set
+// for its endpoint pair, and record a handler bound to those rules.
+void RGWBucketSyncFlowManager::pipe_set::insert(const rgw_sync_bucket_pipe& pipe) {
+  pipe_map.insert(make_pair(pipe.id, pipe));
+
+  auto& rules_ref = rules[endpoints_pair(pipe)];
+
+  // first pipe for this endpoint pair creates the shared rule set
+  if (!rules_ref) {
+    rules_ref = make_shared<RGWBucketSyncFlowManager::pipe_rules>();
+  }
+
+  rules_ref->insert(pipe);
+
+  pipe_handler h(rules_ref, pipe);
+
+  handlers.insert(h);
+}
+
+// Emit the id->pipe map as JSON under "pipes".
+void RGWBucketSyncFlowManager::pipe_set::dump(ceph::Formatter *f) const
+{
+  encode_json("pipes", pipe_map, f);
+}
+
+bool RGWBucketSyncFlowManager::allowed_data_flow(const rgw_zone_id& source_zone,
+                                                 std::optional<rgw_bucket> source_bucket,
+                                                 const rgw_zone_id& dest_zone,
+                                                 std::optional<rgw_bucket> dest_bucket,
+                                                 bool check_activated) const
+{
+  bool found = false;
+  bool found_activated = false;
+
+  for (auto m : flow_groups) {
+    auto& fm = m.second;
+    auto pipes = fm.find_pipes(source_zone, source_bucket,
+                               dest_zone, dest_bucket);
+
+    bool is_found = !pipes.empty();
+
+    if (is_found) {
+      switch (fm.status) {
+        case rgw_sync_policy_group::Status::FORBIDDEN:
+          return false;
+        case rgw_sync_policy_group::Status::ENABLED:
+          found = true;
+          found_activated = true;
+          break;
+        case rgw_sync_policy_group::Status::ALLOWED:
+          found = true;
+          break;
+        default:
+          break; /* unknown -- ignore */
+      }
+    }
+  }
+
+  if (check_activated && found_activated) {
+    return true;
+  }
+
+  return found;
+}
+
+// Build one pipe map per policy group. When we have a parent manager, a
+// default all-zones flow is synthesized from the parent's zone set, and each
+// group's filter defers to the parent's allowed_data_flow() so a parent-level
+// FORBIDDEN still blocks us.
+void RGWBucketSyncFlowManager::init(const DoutPrefixProvider *dpp, const rgw_sync_policy_info& sync_policy) {
+  std::optional<rgw_sync_data_flow_group> default_flow;
+  if (parent) {
+    default_flow.emplace();
+    default_flow->init_default(parent->all_zones);
+  }
+
+  for (auto& item : sync_policy.groups) {
+    auto& group = item.second;
+    auto& flow_group_map = flow_groups[group.id];
+
+    flow_group_map.init(dpp, cct, zone_id, bucket, group,
+                        (default_flow ? &(*default_flow) : nullptr),
+                        &all_zones,
+                        [&](const rgw_zone_id& source_zone,
+                            std::optional<rgw_bucket> source_bucket,
+                            const rgw_zone_id& dest_zone,
+                            std::optional<rgw_bucket> dest_bucket) {
+                        if (!parent) {
+                          return true;
+                        }
+                        return parent->allowed_data_flow(source_zone,
+                                                         source_bucket,
+                                                         dest_zone,
+                                                         dest_bucket,
+                                                         false); /* just check that it's not disabled */
+                        });
+  }
+}
+
+// Project this manager's (and, recursively, its parent's) flow groups onto an
+// effective bucket: every source/dest pipe that matches the bucket is pinned
+// to it and added to *source_pipes / *dest_pipes. only_enabled restricts the
+// result to ENABLED groups (otherwise ALLOWED groups are included too).
+void RGWBucketSyncFlowManager::reflect(const DoutPrefixProvider *dpp,
+                                       std::optional<rgw_bucket> effective_bucket,
+                                       RGWBucketSyncFlowManager::pipe_set *source_pipes,
+                                       RGWBucketSyncFlowManager::pipe_set *dest_pipes,
+                                       bool only_enabled) const
+
+{
+  string effective_bucket_key;
+  if (effective_bucket) {
+    effective_bucket_key = effective_bucket->get_key();
+  }
+  // parent first, so zonegroup-level pipes land before bucket-level ones
+  if (parent) {
+    parent->reflect(dpp, effective_bucket, source_pipes, dest_pipes, only_enabled);
+  }
+
+  for (auto& item : flow_groups) {
+    auto& flow_group_map = item.second;
+
+    /* only return enabled groups */
+    if (flow_group_map.status != rgw_sync_policy_group::Status::ENABLED &&
+        (only_enabled || flow_group_map.status != rgw_sync_policy_group::Status::ALLOWED)) {
+      continue;
+    }
+
+    for (auto& entry : flow_group_map.sources) {
+      rgw_sync_bucket_pipe pipe = entry.second;
+      if (!pipe.dest.match_bucket(effective_bucket)) {
+        continue;
+      }
+
+      pipe.source.apply_bucket(effective_bucket);
+      pipe.dest.apply_bucket(effective_bucket);
+
+      ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): adding source pipe: " << pipe << dendl;
+      source_pipes->insert(pipe);
+    }
+
+    for (auto& entry : flow_group_map.dests) {
+      rgw_sync_bucket_pipe pipe = entry.second;
+
+      if (!pipe.source.match_bucket(effective_bucket)) {
+        continue;
+      }
+
+      pipe.source.apply_bucket(effective_bucket);
+      pipe.dest.apply_bucket(effective_bucket);
+
+      ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): adding dest pipe: " << pipe << dendl;
+      dest_pipes->insert(pipe);
+    }
+  }
+}
+
+
+// Construct a flow manager scoped to a zone and (optionally) a bucket; parent,
+// if any, is the enclosing zonegroup-level manager (not owned).
+RGWBucketSyncFlowManager::RGWBucketSyncFlowManager(CephContext *_cct,
+                                                   const rgw_zone_id& _zone_id,
+                                                   std::optional<rgw_bucket> _bucket,
+                                                   const RGWBucketSyncFlowManager *_parent) : cct(_cct),
+                                                                                              zone_id(_zone_id),
+                                                                                              bucket(_bucket),
+                                                                                              parent(_parent) {}
+
+
+// Synthesize a sync policy equivalent to the legacy (pre-policy) zone
+// sync_from configuration: a single ENABLED "default" group with one
+// directional flow per zone pair where z1 syncs from z2, and an all-zones
+// pipe. Leaves *ppolicy untouched if no zone syncs from another.
+// (sync_modules_svc is currently unused here.)
+void RGWSyncPolicyCompat::convert_old_sync_config(RGWSI_Zone *zone_svc,
+                                                  RGWSI_SyncModules *sync_modules_svc,
+                                                  rgw_sync_policy_info *ppolicy)
+{
+  bool found = false;
+
+  rgw_sync_policy_info policy;
+
+  auto& group = policy.groups["default"];
+  auto& zonegroup = zone_svc->get_zonegroup();
+
+  for (const auto& ziter1 : zonegroup.zones) {
+    auto& id1 = ziter1.first;
+    const RGWZone& z1 = ziter1.second;
+
+    for (const auto& ziter2 : zonegroup.zones) {
+      auto& id2 = ziter2.first;
+      const RGWZone& z2 = ziter2.second;
+
+      if (id1 == id2) {
+        continue;
+      }
+
+      if (z1.syncs_from(z2.name)) {
+        found = true;
+        rgw_sync_directional_rule *rule;
+        // called for its creation side effect; the returned rule isn't needed
+        group.data_flow.find_or_create_directional(id2,
+                                                   id1,
+                                                   &rule);
+      }
+    }
+  }
+
+  if (!found) { /* nothing syncs */
+    return;
+  }
+
+  rgw_sync_bucket_pipes pipes;
+  pipes.id = "all";
+  pipes.source.all_zones = true;
+  pipes.dest.all_zones = true;
+
+  group.pipes.emplace_back(std::move(pipes));
+
+
+  group.status = rgw_sync_policy_group::Status::ENABLED;
+
+  *ppolicy = std::move(policy);
+}
+
+// Root (zone-level) handler: takes the zonegroup's sync policy, falling back
+// to a policy converted from the legacy sync_from config when none is set
+// (legacy_config records that fallback).
+RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(RGWSI_Zone *_zone_svc,
+                                                       RGWSI_SyncModules *sync_modules_svc,
+                                                      RGWSI_Bucket_Sync *_bucket_sync_svc,
+                                                       std::optional<rgw_zone_id> effective_zone) : zone_svc(_zone_svc) ,
+                                                                                                    bucket_sync_svc(_bucket_sync_svc) {
+  zone_id = effective_zone.value_or(zone_svc->zone_id());
+  flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
+                                              zone_id,
+                                              nullopt,
+                                              nullptr));
+  sync_policy = zone_svc->get_zonegroup().sync_policy;
+
+  if (sync_policy.empty()) {
+    RGWSyncPolicyCompat::convert_old_sync_config(zone_svc, sync_modules_svc, &sync_policy);
+    legacy_config = true;
+  }
+}
+
+// Bucket-level handler derived from a parent handler plus full bucket info.
+// Copies the bucket's sync policy (if any), defaulting each user-mode pipe's
+// user to the bucket owner, and chains a flow manager under the parent's.
+RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+                                                       const RGWBucketInfo& _bucket_info,
+                                                       map<string, bufferlist>&& _bucket_attrs) : parent(_parent),
+                                                                                                       bucket_info(_bucket_info),
+                                                                                                       bucket_attrs(std::move(_bucket_attrs)) {
+  if (_bucket_info.sync_policy) {
+    sync_policy = *_bucket_info.sync_policy;
+
+    for (auto& entry : sync_policy.groups) {
+      for (auto& pipe : entry.second.pipes) {
+        // user-mode pipes without an explicit user act as the bucket owner
+        if (pipe.params.mode == rgw_sync_pipe_params::MODE_USER &&
+            pipe.params.user.empty()) {
+          pipe.params.user = _bucket_info.owner;
+        }
+      }
+    }
+  }
+  legacy_config = parent->legacy_config;
+  bucket = _bucket_info.bucket;
+  zone_svc = parent->zone_svc;
+  bucket_sync_svc = parent->bucket_sync_svc;
+  flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
+                                              parent->zone_id,
+                                              _bucket_info.bucket,
+                                              parent->flow_mgr.get()));
+}
+
+// Bucket-level handler when only the bucket key and (optionally) an explicit
+// sync policy are known -- no bucket info/attrs, no owner-defaulting.
+RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+                                                       const rgw_bucket& _bucket,
+                                                       std::optional<rgw_sync_policy_info> _sync_policy) : parent(_parent) {
+  if (_sync_policy) {
+    sync_policy = *_sync_policy;
+  }
+  legacy_config = parent->legacy_config;
+  bucket = _bucket;
+  zone_svc = parent->zone_svc;
+  bucket_sync_svc = parent->bucket_sync_svc;
+  flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
+                                              parent->zone_id,
+                                              _bucket,
+                                              parent->flow_mgr.get()));
+}
+
+// Factory for a per-bucket child handler; caller owns the returned pointer.
+RGWBucketSyncPolicyHandler *RGWBucketSyncPolicyHandler::alloc_child(const RGWBucketInfo& bucket_info,
+                                                                    map<string, bufferlist>&& bucket_attrs) const
+{
+  return new RGWBucketSyncPolicyHandler(this, bucket_info, std::move(bucket_attrs));
+}
+
+// Factory for a bucket-only child handler (no bucket info/attrs yet);
+// caller owns the returned pointer.
+RGWBucketSyncPolicyHandler *RGWBucketSyncPolicyHandler::alloc_child(const rgw_bucket& bucket,
+                                                                    std::optional<rgw_sync_policy_info> sync_policy) const
+{
+  return new RGWBucketSyncPolicyHandler(this, bucket, sync_policy);
+}
+
+// Initialize the handler: fetch sync hints for the bucket, build the flow
+// manager from the policy, then reflect the effective pipes/zones into the
+// cached member sets. Returns 0 on success or a negative error code.
+int RGWBucketSyncPolicyHandler::init(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  int r = bucket_sync_svc->get_bucket_sync_hints(dpp, bucket.value_or(rgw_bucket()),
+                                                &source_hints,
+                                                &target_hints,
+                                                y);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to initialize bucket sync policy handler: get_bucket_sync_hints() on bucket="
+      << bucket << " returned r=" << r << dendl;
+    return r;
+  }
+
+  flow_mgr->init(dpp, sync_policy);
+
+  // Populate all cached views; only enabled pipes are taken into account.
+  reflect(dpp, &source_pipes,
+          &target_pipes,
+          &sources,
+          &targets,
+          &source_zones,
+          &target_zones,
+          true);
+
+  return 0;
+}
+
+// Recompute the effective pipe/zone views for this handler's bucket and
+// store them into whichever output pointers are non-null. Results are
+// built in locals and moved out at the end, so outputs are never left
+// half-filled if a caller passes only a subset of pointers.
+void RGWBucketSyncPolicyHandler::reflect(const DoutPrefixProvider *dpp, RGWBucketSyncFlowManager::pipe_set *psource_pipes,
+                                         RGWBucketSyncFlowManager::pipe_set *ptarget_pipes,
+                                         map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *psources,
+                                         map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *ptargets,
+                                         std::set<rgw_zone_id> *psource_zones,
+                                         std::set<rgw_zone_id> *ptarget_zones,
+                                         bool only_enabled) const
+{
+  RGWBucketSyncFlowManager::pipe_set _source_pipes;
+  RGWBucketSyncFlowManager::pipe_set _target_pipes;
+  map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> _sources;
+  map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> _targets;
+  std::set<rgw_zone_id> _source_zones;
+  std::set<rgw_zone_id> _target_zones;
+
+  flow_mgr->reflect(dpp, bucket, &_source_pipes, &_target_pipes, only_enabled);
+
+  // Bucket the flat pipe maps by zone; pipes without a concrete zone
+  // on the relevant side are skipped.
+  for (auto& entry : _source_pipes.pipe_map) {
+    auto& pipe = entry.second;
+    if (!pipe.source.zone) {
+      continue;
+    }
+    _source_zones.insert(*pipe.source.zone);
+    _sources[*pipe.source.zone].insert(pipe);
+  }
+
+  for (auto& entry : _target_pipes.pipe_map) {
+    auto& pipe = entry.second;
+    if (!pipe.dest.zone) {
+      continue;
+    }
+    _target_zones.insert(*pipe.dest.zone);
+    _targets[*pipe.dest.zone].insert(pipe);
+  }
+
+  if (psource_pipes) {
+    *psource_pipes = std::move(_source_pipes);
+  }
+  if (ptarget_pipes) {
+    *ptarget_pipes = std::move(_target_pipes);
+  }
+  if (psources) {
+    *psources = std::move(_sources);
+  }
+  if (ptargets) {
+    *ptargets = std::move(_targets);
+  }
+  if (psource_zones) {
+    *psource_zones = std::move(_source_zones);
+  }
+  if (ptarget_zones) {
+    *ptarget_zones = std::move(_target_zones);
+  }
+}
+
+multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_sources() const
+{
+  multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
+
+  for (auto& source_entry : sources) {
+    auto& zone_id = source_entry.first;
+
+    auto& pipes = source_entry.second.pipe_map;
+
+    for (auto& entry : pipes) {
+      auto& pipe = entry.second;
+      m.insert(make_pair(zone_id, pipe));
+    }
+  }
+
+  for (auto& pipe : resolved_sources) {
+    if (!pipe.source.zone) {
+      continue;
+    }
+
+    m.insert(make_pair(*pipe.source.zone, pipe));
+  }
+
+  return m;
+}
+
+multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_dests() const
+{
+  multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
+
+  for (auto& dest_entry : targets) {
+    auto& zone_id = dest_entry.first;
+
+    auto& pipes = dest_entry.second.pipe_map;
+
+    for (auto& entry : pipes) {
+      auto& pipe = entry.second;
+      m.insert(make_pair(zone_id, pipe));
+    }
+  }
+
+  for (auto& pipe : resolved_dests) {
+    if (!pipe.dest.zone) {
+      continue;
+    }
+
+    m.insert(make_pair(*pipe.dest.zone, pipe));
+  }
+
+  return m;
+}
+
+// Like get_all_dests(), but restricted to a single destination zone:
+// the reflected target set for zone_id plus matching resolved hints.
+multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_dests_in_zone(const rgw_zone_id& zone_id) const
+{
+  multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
+
+  auto iter = targets.find(zone_id);
+  if (iter != targets.end()) {
+    auto& pipes = iter->second.pipe_map;
+
+    for (auto& entry : pipes) {
+      auto& pipe = entry.second;
+      m.insert(make_pair(zone_id, pipe));
+    }
+  }
+
+  // Resolved hints for the same zone only.
+  for (auto& pipe : resolved_dests) {
+    if (!pipe.dest.zone ||
+        *pipe.dest.zone != zone_id) {
+      continue;
+    }
+
+    m.insert(make_pair(*pipe.dest.zone, pipe));
+  }
+
+  return m;
+}
+
+// Copy the raw source/target pipes into the caller's sets, optionally
+// keeping only pipes whose relevant endpoint matches filter_peer.
+void RGWBucketSyncPolicyHandler::get_pipes(std::set<rgw_sync_bucket_pipe> *_sources, std::set<rgw_sync_bucket_pipe> *_targets,
+                                           std::optional<rgw_sync_bucket_entity> filter_peer) { /* return raw pipes */
+  for (auto& entry : source_pipes.pipe_map) {
+    auto& source_pipe = entry.second;
+    if (!filter_peer ||
+        source_pipe.source.match(*filter_peer)) {
+      _sources->insert(source_pipe);
+    }
+  }
+
+  for (auto& entry : target_pipes.pipe_map) {
+    auto& target_pipe = entry.second;
+    if (!filter_peer ||
+        target_pipe.dest.match(*filter_peer)) {
+      _targets->insert(target_pipe);
+    }
+  }
+}
+
+// True when this bucket's changes need to be exported: either some zone
+// syncs from it, or (legacy path) the zone logs data changes and the
+// bucket's datasync flag is enabled.
+bool RGWBucketSyncPolicyHandler::bucket_exports_data() const
+{
+  if (!bucket) {
+    return false;
+  }
+
+  if (bucket_is_sync_source()) {
+    return true;
+  }
+
+  // NOTE(review): bucket_info is std::optional and dereferenced here
+  // unconditionally — assumes callers reaching the legacy check always
+  // constructed the handler with bucket info; confirm.
+  return (zone_svc->need_to_log_data() &&
+          bucket_info->datasync_flag_enabled());
+}
+
+// True when any zone is configured (or resolved via hints) to sync
+// data into this bucket.
+bool RGWBucketSyncPolicyHandler::bucket_imports_data() const
+{
+  return bucket_is_sync_target();
+}
+
diff --git a/src/rgw/driver/rados/rgw_bucket_sync.h b/src/rgw/driver/rados/rgw_bucket_sync.h
new file mode 100644 (file)
index 0000000..7614377
--- /dev/null
@@ -0,0 +1,412 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_common.h"
+#include "rgw_sync_policy.h"
+
+class RGWSI_Zone;
+class RGWSI_SyncModules;
+class RGWSI_Bucket_Sync;
+
+struct rgw_sync_group_pipe_map;
+struct rgw_sync_bucket_pipes;
+struct rgw_sync_policy_info;
+
+/*
+ * Per-group view of the sync pipes that involve a single zone (and
+ * optionally a single bucket): which peers it pulls from and which
+ * peers pull from it.
+ */
+struct rgw_sync_group_pipe_map {
+  rgw_zone_id zone;                      // the zone this map is built for
+  std::optional<rgw_bucket> bucket;      // set when scoped to one bucket
+
+  rgw_sync_policy_group::Status status{rgw_sync_policy_group::Status::FORBIDDEN};
+
+  using zb_pipe_map_t = std::multimap<rgw_sync_bucket_entity, rgw_sync_bucket_pipe>;
+
+  zb_pipe_map_t sources; /* all the pipes where zone is pulling from */
+  zb_pipe_map_t dests; /* all the pipes that pull from zone */
+
+  // Accumulator owned by the caller of init(); every zone seen is added.
+  std::set<rgw_zone_id> *pall_zones{nullptr};
+  rgw_sync_data_flow_group *default_flow{nullptr}; /* flow to use if policy doesn't define it,
+                                                      used in the case of bucket sync policy, not at the
+                                                      zonegroup level */
+
+  void dump(ceph::Formatter *f) const;
+
+  // Add matching pipes for {source_zone -> dest_zone} to *pipe_map,
+  // subject to the two filter callbacks.
+  template <typename CB1, typename CB2>
+  void try_add_to_pipe_map(const rgw_zone_id& source_zone,
+                           const rgw_zone_id& dest_zone,
+                           const std::vector<rgw_sync_bucket_pipes>& pipes,
+                           zb_pipe_map_t *pipe_map,
+                           CB1 filter_cb,
+                           CB2 call_filter_cb);
+          
+  template <typename CB>
+  void try_add_source(const rgw_zone_id& source_zone,
+                      const rgw_zone_id& dest_zone,
+                      const std::vector<rgw_sync_bucket_pipes>& pipes,
+                      CB filter_cb);
+          
+  template <typename CB>
+  void try_add_dest(const rgw_zone_id& source_zone,
+                  const rgw_zone_id& dest_zone,
+                  const std::vector<rgw_sync_bucket_pipes>& pipes,
+                  CB filter_cb);
+          
+  // Range of entries in m matching {zone, b}.
+  std::pair<zb_pipe_map_t::const_iterator, zb_pipe_map_t::const_iterator> find_pipes(const zb_pipe_map_t& m,
+                                                                                const rgw_zone_id& zone,
+                                                                                std::optional<rgw_bucket> b) const;
+
+  // Build the map from a policy group; filter_cb decides which pipes apply.
+  template <typename CB>
+  void init(const DoutPrefixProvider *dpp, CephContext *cct,
+            const rgw_zone_id& _zone,
+            std::optional<rgw_bucket> _bucket,
+            const rgw_sync_policy_group& group,
+            rgw_sync_data_flow_group *_default_flow,
+            std::set<rgw_zone_id> *_pall_zones,
+            CB filter_cb);
+
+  /*
+   * find all relevant pipes in our zone that match {dest_bucket} <- {source_zone, source_bucket}
+   */
+  std::vector<rgw_sync_bucket_pipe> find_source_pipes(const rgw_zone_id& source_zone,
+                                                 std::optional<rgw_bucket> source_bucket,
+                                                 std::optional<rgw_bucket> dest_bucket) const;
+
+  /*
+   * find all relevant pipes in other zones that pull from a specific
+   * source bucket in our zone {source_bucket} -> {dest_zone, dest_bucket}
+   */
+  std::vector<rgw_sync_bucket_pipe> find_dest_pipes(std::optional<rgw_bucket> source_bucket,
+                                               const rgw_zone_id& dest_zone,
+                                               std::optional<rgw_bucket> dest_bucket) const;
+
+  /*
+   * find all relevant pipes from {source_zone, source_bucket} -> {dest_zone, dest_bucket}
+   */
+  std::vector<rgw_sync_bucket_pipe> find_pipes(const rgw_zone_id& source_zone,
+                                          std::optional<rgw_bucket> source_bucket,
+                                          const rgw_zone_id& dest_zone,
+                                          std::optional<rgw_bucket> dest_bucket) const;
+};
+
+// Compatibility shim: translates the pre-policy ("old") sync configuration
+// into an rgw_sync_policy_info.
+class RGWSyncPolicyCompat {
+public:
+  static void convert_old_sync_config(RGWSI_Zone *zone_svc,
+                                      RGWSI_SyncModules *sync_modules_svc,
+                                      rgw_sync_policy_info *ppolicy);
+};
+
+/*
+ * Computes the effective data-flow for one zone (optionally one bucket)
+ * from the sync policy groups, chaining to a parent manager for
+ * zonegroup-level flows.
+ */
+class RGWBucketSyncFlowManager {
+  friend class RGWBucketSyncPolicyHandler;
+public:
+  // A (source, dest) endpoint pair; ordered so it can key std::map/set.
+  struct endpoints_pair {
+    rgw_sync_bucket_entity source;
+    rgw_sync_bucket_entity dest;
+
+    endpoints_pair() {}
+    endpoints_pair(const rgw_sync_bucket_pipe& pipe) {
+      source = pipe.source;
+      dest = pipe.dest;
+    }
+
+    // Lexicographic order: source first, then dest.
+    bool operator<(const endpoints_pair& e) const {
+      if (source < e.source) {
+        return true;
+      }
+      if (e.source < source) {
+        return false;
+      }
+      return (dest < e.dest);
+    }
+  };
+
+  /*
+   * pipe_rules: deal with a set of pipes that have common endpoints_pair
+   */
+  class pipe_rules {
+    std::list<rgw_sync_bucket_pipe> pipes;
+
+  public:
+    using prefix_map_t = std::multimap<std::string, rgw_sync_bucket_pipe *>;
+
+    // Indexes into `pipes` by tag and by object-key prefix.
+    std::map<std::string, rgw_sync_bucket_pipe *> tag_refs;
+    prefix_map_t prefix_refs;
+
+    void insert(const rgw_sync_bucket_pipe& pipe);
+
+    // Resolve per-object params that don't depend on object tags;
+    // *need_more_info is set when tags are required to decide.
+    bool find_basic_info_without_tags(const rgw_obj_key& key,
+                                      std::optional<rgw_user> *user,
+                                      std::optional<rgw_user> *acl_translation,
+                                      std::optional<std::string> *storage_class,
+                                      rgw_sync_pipe_params::Mode *mode,
+                                      bool *need_more_info) const;
+    bool find_obj_params(const rgw_obj_key& key, 
+                         const RGWObjTags::tag_map_t& tags,
+                         rgw_sync_pipe_params *params) const;
+
+    void scan_prefixes(std::vector<std::string> *prefixes) const;
+
+    prefix_map_t::const_iterator prefix_begin() const {
+      return prefix_refs.begin();
+    }
+    prefix_map_t::const_iterator prefix_search(const std::string& s) const;
+    prefix_map_t::const_iterator prefix_end() const {
+      return prefix_refs.end();
+    }
+  };
+
+  using pipe_rules_ref = std::shared_ptr<pipe_rules>;
+
+  /*
+   * pipe_handler: extends endpoints_rule to point at the corresponding rules handler
+   */
+  struct pipe_handler : public endpoints_pair {
+    pipe_rules_ref rules;
+
+    pipe_handler() {}
+    pipe_handler(pipe_rules_ref& _rules,
+                 const rgw_sync_bucket_pipe& _pipe) : endpoints_pair(_pipe),
+                                                      rules(_rules) {}
+    // True when both endpoints name a concrete bucket/zone.
+    bool specific() const {
+      return source.specific() && dest.specific();
+    }
+    
+    bool find_basic_info_without_tags(const rgw_obj_key& key,
+                                      std::optional<rgw_user> *user,
+                                      std::optional<rgw_user> *acl_translation,
+                                      std::optional<std::string> *storage_class,
+                                      rgw_sync_pipe_params::Mode *mode,
+                                      bool *need_more_info) const {
+      if (!rules) {
+        return false;
+      }
+      return rules->find_basic_info_without_tags(key, user, acl_translation, storage_class, mode, need_more_info);
+    }
+
+    bool find_obj_params(const rgw_obj_key& key,
+                         const RGWObjTags::tag_map_t& tags,
+                         rgw_sync_pipe_params *params) const {
+      if (!rules) {
+        return false;
+      }
+      return rules->find_obj_params(key, tags, params);
+    }
+  };
+
+  // A collection of pipes with their shared rule handlers.
+  struct pipe_set {
+    std::map<endpoints_pair, pipe_rules_ref> rules;
+    std::multimap<std::string, rgw_sync_bucket_pipe> pipe_map;
+
+    std::set<pipe_handler> handlers;
+
+    using iterator = std::set<pipe_handler>::iterator;
+
+    void clear() {
+      rules.clear();
+      pipe_map.clear();
+      handlers.clear();
+    }
+
+    void insert(const rgw_sync_bucket_pipe& pipe);
+
+    // Iteration is over the handlers, not the raw pipe map.
+    iterator begin() const {
+      return handlers.begin();
+    }
+
+    iterator end() const {
+      return handlers.end();
+    }
+
+    void dump(ceph::Formatter *f) const;
+  };
+
+private:
+
+  CephContext *cct;
+
+  rgw_zone_id zone_id;
+  std::optional<rgw_bucket> bucket;
+
+  // Next level up (e.g. zonegroup-level manager); may be null at the root.
+  const RGWBucketSyncFlowManager *parent{nullptr};
+
+  std::map<std::string, rgw_sync_group_pipe_map> flow_groups;
+
+  std::set<rgw_zone_id> all_zones;
+
+  // Whether data may flow {source_zone, source_bucket} -> {dest_zone,
+  // dest_bucket}; check_activated additionally requires the group to be
+  // active.
+  bool allowed_data_flow(const rgw_zone_id& source_zone,
+                         std::optional<rgw_bucket> source_bucket,
+                         const rgw_zone_id& dest_zone,
+                         std::optional<rgw_bucket> dest_bucket,
+                         bool check_activated) const;
+
+  /*
+   * find all the matching flows in a flow map for a specific bucket
+   */
+  void update_flow_maps(const rgw_sync_bucket_pipes& pipe);
+
+  void init(const DoutPrefixProvider *dpp, const rgw_sync_policy_info& sync_policy);
+
+public:
+
+  RGWBucketSyncFlowManager(CephContext *_cct,
+                           const rgw_zone_id& _zone_id,
+                           std::optional<rgw_bucket> _bucket,
+                           const RGWBucketSyncFlowManager *_parent);
+
+  // Emit the effective pipes for effective_bucket, indexed by source and
+  // by dest; only_enabled restricts to enabled groups.
+  void reflect(const DoutPrefixProvider *dpp, std::optional<rgw_bucket> effective_bucket,
+               pipe_set *flow_by_source,
+               pipe_set *flow_by_dest,  
+               bool only_enabled) const;
+
+};
+
+// Debug formatter for an endpoints pair.
+// NOTE(review): prints "dest -> source" (arrow points from the destination
+// back to the zone it pulls from) — confirm this ordering is intentional.
+static inline std::ostream& operator<<(std::ostream& os, const RGWBucketSyncFlowManager::endpoints_pair& e) {
+  os << e.dest << " -> " << e.source;
+  return os;
+}
+
+/*
+ * Resolves the effective sync configuration for a zone or a single bucket:
+ * which zones/buckets this bucket pulls from and pushes to. Handlers form
+ * a chain (zone-level parent -> per-bucket children via alloc_child()).
+ */
+class RGWBucketSyncPolicyHandler {
+  bool legacy_config{false};            // true when derived from pre-policy config
+  const RGWBucketSyncPolicyHandler *parent{nullptr};
+  RGWSI_Zone *zone_svc;
+  RGWSI_Bucket_Sync *bucket_sync_svc;
+  rgw_zone_id zone_id;
+  std::optional<RGWBucketInfo> bucket_info;
+  std::optional<std::map<std::string, bufferlist> > bucket_attrs;
+  std::optional<rgw_bucket> bucket;     // unset for the zone-level handler
+  std::unique_ptr<RGWBucketSyncFlowManager> flow_mgr;
+  rgw_sync_policy_info sync_policy;
+
+  // Cached views, populated by init()/reflect().
+  RGWBucketSyncFlowManager::pipe_set source_pipes;
+  RGWBucketSyncFlowManager::pipe_set target_pipes;
+
+  std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> sources; /* source pipes by source zone id */
+  std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> targets; /* target pipes by target zone id */
+
+  std::set<rgw_zone_id> source_zones;
+  std::set<rgw_zone_id> target_zones;
+
+  // Hints and their resolved pipes (set externally via set_resolved_hints).
+  std::set<rgw_bucket> source_hints;
+  std::set<rgw_bucket> target_hints;
+  std::set<rgw_sync_bucket_pipe> resolved_sources;
+  std::set<rgw_sync_bucket_pipe> resolved_dests;
+
+
+  bool bucket_is_sync_source() const {
+    return !targets.empty() || !resolved_dests.empty();
+  }
+
+  bool bucket_is_sync_target() const {
+    return !sources.empty() || !resolved_sources.empty();
+  }
+
+  // Private child constructors; use alloc_child() instead.
+  RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+                             const RGWBucketInfo& _bucket_info,
+                             std::map<std::string, bufferlist>&& _bucket_attrs);
+
+  RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+                             const rgw_bucket& _bucket,
+                             std::optional<rgw_sync_policy_info> _sync_policy);
+public:
+  RGWBucketSyncPolicyHandler(RGWSI_Zone *_zone_svc,
+                             RGWSI_SyncModules *sync_modules_svc,
+                            RGWSI_Bucket_Sync *bucket_sync_svc,
+                             std::optional<rgw_zone_id> effective_zone = std::nullopt);
+
+  // Per-bucket child handlers; caller owns the returned pointer.
+  RGWBucketSyncPolicyHandler *alloc_child(const RGWBucketInfo& bucket_info,
+                                          std::map<std::string, bufferlist>&& bucket_attrs) const;
+  RGWBucketSyncPolicyHandler *alloc_child(const rgw_bucket& bucket,
+                                          std::optional<rgw_sync_policy_info> sync_policy) const;
+
+  // Fetch hints and build the cached views; must succeed before the
+  // getters below are meaningful.
+  int init(const DoutPrefixProvider *dpp, optional_yield y);
+
+  void reflect(const DoutPrefixProvider *dpp, RGWBucketSyncFlowManager::pipe_set *psource_pipes,
+               RGWBucketSyncFlowManager::pipe_set *ptarget_pipes,
+               std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *psources,
+               std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *ptargets,
+               std::set<rgw_zone_id> *psource_zones,
+               std::set<rgw_zone_id> *ptarget_zones,
+               bool only_enabled) const;
+
+  void set_resolved_hints(std::set<rgw_sync_bucket_pipe>&& _resolved_sources,
+                          std::set<rgw_sync_bucket_pipe>&& _resolved_dests) {
+    resolved_sources = std::move(_resolved_sources);
+    resolved_dests = std::move(_resolved_dests);
+  }
+
+  const std::set<rgw_sync_bucket_pipe>& get_resolved_source_hints() {
+    return resolved_sources;
+  }
+
+  const std::set<rgw_sync_bucket_pipe>& get_resolved_dest_hints() {
+    return resolved_dests;
+  }
+
+  const std::set<rgw_zone_id>& get_source_zones() const {
+    return source_zones;
+  }
+
+  const std::set<rgw_zone_id>& get_target_zones() const {
+    return target_zones;
+  }
+
+  const  std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& get_sources() {
+    return sources;
+  }
+
+  std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_sources() const;
+  std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_dests() const;
+  std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_dests_in_zone(const rgw_zone_id& zone_id) const;
+
+  const std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& get_targets() {
+    return targets;
+  }
+
+  const std::optional<RGWBucketInfo>& get_bucket_info() const {
+    return bucket_info;
+  }
+
+  const std::optional<std::map<std::string, bufferlist> >& get_bucket_attrs() const {
+    return bucket_attrs;
+  }
+
+  void get_pipes(RGWBucketSyncFlowManager::pipe_set **_sources, RGWBucketSyncFlowManager::pipe_set **_targets) { /* return raw pipes (with zone name) */
+    *_sources = &source_pipes;
+    *_targets = &target_pipes;
+  }
+  void get_pipes(std::set<rgw_sync_bucket_pipe> *sources, std::set<rgw_sync_bucket_pipe> *targets,
+                 std::optional<rgw_sync_bucket_entity> filter_peer);
+
+  const std::set<rgw_bucket>& get_source_hints() const {
+    return source_hints;
+  }
+
+  const std::set<rgw_bucket>& get_target_hints() const {
+    return target_hints;
+  }
+
+  bool bucket_exports_data() const;
+  bool bucket_imports_data() const;
+
+  const rgw_sync_policy_info& get_sync_policy() const {
+    return sync_policy;
+  }
+
+  bool is_legacy_config() const {
+    return legacy_config;
+  }
+};
+
diff --git a/src/rgw/driver/rados/rgw_cr_rados.cc b/src/rgw/driver/rados/rgw_cr_rados.cc
new file mode 100644 (file)
index 0000000..0507972
--- /dev/null
@@ -0,0 +1,1138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/compat.h"
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_sync_counters.h"
+#include "rgw_bucket.h"
+#include "rgw_datalog_notify.h"
+#include "rgw_cr_rest.h"
+#include "rgw_rest_conn.h"
+#include "rgw_rados.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_cls.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+#include <boost/asio/yield.hpp>
+#include <boost/container/flat_set.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Queue a request for the worker pool. Takes a reference on req (released
+// after processing, or in stop() if never dequeued). Refuses new work
+// once the processor is shutting down.
+bool RGWAsyncRadosProcessor::RGWWQ::_enqueue(RGWAsyncRadosRequest *req) {
+  if (processor->is_going_down()) {
+    return false;
+  }
+  req->get();
+  processor->m_req_queue.push_back(req);
+  dout(20) << "enqueued request req=" << hex << req << dec << dendl;
+  _dump_queue();
+  return true;
+}
+
+// True when no requests are pending.
+bool RGWAsyncRadosProcessor::RGWWQ::_empty() {
+  return processor->m_req_queue.empty();
+}
+
+// Pop the oldest pending request, or nullptr when the queue is empty.
+// The reference taken in _enqueue() travels with the returned pointer
+// and is dropped by handle_request().
+RGWAsyncRadosRequest *RGWAsyncRadosProcessor::RGWWQ::_dequeue() {
+  if (processor->m_req_queue.empty())
+    return nullptr;  // nullptr (not NULL) for consistency with the rest of the file
+  RGWAsyncRadosRequest *req = processor->m_req_queue.front();
+  processor->m_req_queue.pop_front();
+  dout(20) << "dequeued request req=" << hex << req << dec << dendl;
+  _dump_queue();
+  return req;
+}
+
+// ThreadPool work callback: run the request (the work queue itself serves
+// as the dout prefix provider), then release the throttle slot acquired
+// in queue(). The TPHandle is unused.
+void RGWAsyncRadosProcessor::RGWWQ::_process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) {
+  processor->handle_request(this, req);
+  processor->req_throttle.put(1);
+}
+
+void RGWAsyncRadosProcessor::RGWWQ::_dump_queue() {
+  if (!g_conf()->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+    return;
+  }
+  deque<RGWAsyncRadosRequest *>::iterator iter;
+  if (processor->m_req_queue.empty()) {
+    dout(20) << "RGWWQ: empty" << dendl;
+    return;
+  }
+  dout(20) << "RGWWQ:" << dendl;
+  for (iter = processor->m_req_queue.begin(); iter != processor->m_req_queue.end(); ++iter) {
+    dout(20) << "req: " << hex << *iter << dec << dendl;
+  }
+}
+
+// Build the thread pool and its work queue; the throttle allows twice as
+// many queued requests as worker threads.
+RGWAsyncRadosProcessor::RGWAsyncRadosProcessor(CephContext *_cct, int num_threads)
+  : cct(_cct), m_tp(cct, "RGWAsyncRadosProcessor::m_tp", "rados_async", num_threads),
+    req_throttle(_cct, "rgw_async_rados_ops", num_threads * 2),
+    req_wq(this,
+          ceph::make_timespan(g_conf()->rgw_op_thread_timeout),
+          ceph::make_timespan(g_conf()->rgw_op_thread_suicide_timeout),
+          &m_tp) {
+}
+
+// Start the worker threads.
+void RGWAsyncRadosProcessor::start() {
+  m_tp.start();
+}
+
+// Shut down: stop accepting work, drain and stop the pool, then drop the
+// queue references (taken in _enqueue()) of any requests never processed.
+void RGWAsyncRadosProcessor::stop() {
+  going_down = true;
+  m_tp.drain(&req_wq);
+  m_tp.stop();
+  for (auto* leftover : m_req_queue) {
+    leftover->put();
+  }
+}
+
+// Execute one request on a worker thread, then drop the queue's reference.
+void RGWAsyncRadosProcessor::handle_request(const DoutPrefixProvider *dpp, RGWAsyncRadosRequest *req) {
+  req->send_request(dpp);
+  req->put();
+}
+
+// Submit a request: blocks on the throttle (released in _process()),
+// then hands the request to the work queue.
+void RGWAsyncRadosProcessor::queue(RGWAsyncRadosRequest *req) {
+  req_throttle.get(1);
+  req_wq.queue(req);
+}
+
+// Synchronously read the system object (data + optional attrs) into the
+// request's members; runs on an async worker thread, hence null_yield.
+int RGWAsyncGetSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  map<string, bufferlist> *pattrs = want_attrs ? &attrs : nullptr;
+
+  auto sysobj = svc_sysobj->get_obj(obj);
+  return sysobj.rop()
+               .set_objv_tracker(&objv_tracker)
+               .set_attrs(pattrs)
+              .set_raw_attrs(raw_attrs)
+               .read(dpp, &bl, null_yield);
+}
+
+// Snapshot the caller's version tracker (if any) so the read can run
+// detached from the caller's lifetime.
+RGWAsyncGetSystemObj::RGWAsyncGetSystemObj(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+                       RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                       bool want_attrs, bool raw_attrs)
+  : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc_sysobj(_svc),
+    obj(_obj), want_attrs(want_attrs), raw_attrs(raw_attrs)
+{
+  if (_objv_tracker) {
+    objv_tracker = *_objv_tracker;
+  }
+}
+
+// Kick off an async attrs read; completion is reported via the stack's
+// completion notifier and collected in request_complete().
+int RGWSimpleRadosReadAttrsCR::send_request(const DoutPrefixProvider *dpp)
+{
+  req = new RGWAsyncGetSystemObj(dpp, this, stack->create_completion_notifier(),
+                                svc, objv_tracker, obj, true, raw_attrs);
+  async_rados->queue(req);
+  return 0;
+}
+
+// Copy the results back to the caller's out-params and return the
+// request's final status.
+int RGWSimpleRadosReadAttrsCR::request_complete()
+{
+  if (pattrs) {
+    *pattrs = std::move(req->attrs);
+  }
+  if (objv_tracker) {
+    *objv_tracker = req->objv_tracker;
+  }
+  return req->get_ret_status();
+}
+
+// Synchronously write the buffered data to the system object; exclusive
+// makes the write fail if the object already exists.
+int RGWAsyncPutSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  auto sysobj = svc->get_obj(obj);
+  return sysobj.wop()
+               .set_objv_tracker(&objv_tracker)
+               .set_exclusive(exclusive)
+               .write_data(dpp, bl, null_yield);
+}
+
+// Takes the payload by value and moves it in; snapshots the caller's
+// version tracker so the write can run detached.
+RGWAsyncPutSystemObj::RGWAsyncPutSystemObj(const DoutPrefixProvider *_dpp, 
+                     RGWCoroutine *caller, 
+                     RGWAioCompletionNotifier *cn,
+                     RGWSI_SysObj *_svc,
+                     RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                     bool _exclusive, bufferlist _bl)
+  : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc(_svc),
+    obj(_obj), exclusive(_exclusive), bl(std::move(_bl))
+{
+  if (_objv_tracker) {
+    objv_tracker = *_objv_tracker;
+  }
+}
+
+// Synchronously write only the attributes of the system object.
+int RGWAsyncPutSystemObjAttrs::_send_request(const DoutPrefixProvider *dpp)
+{
+  auto sysobj = svc->get_obj(obj);
+  return sysobj.wop()
+               .set_objv_tracker(&objv_tracker)
+               .set_exclusive(exclusive)
+               .set_attrs(attrs)
+               .write_attrs(dpp, null_yield);
+}
+
+// Takes the attr map by value and moves it in; snapshots the caller's
+// version tracker so the write can run detached.
+RGWAsyncPutSystemObjAttrs::RGWAsyncPutSystemObjAttrs(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+                     RGWSI_SysObj *_svc,
+                     RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                     map<string, bufferlist> _attrs, bool exclusive)
+  : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc(_svc),
+    obj(_obj), attrs(std::move(_attrs)), exclusive(exclusive)
+{
+  if (_objv_tracker) {
+    objv_tracker = *_objv_tracker;
+  }
+}
+
+
+// Consumer coroutine that appends entries to an omap object.
+// NOTE(review): _window_size presumably bounds how many pending entries
+// accumulate before a flush — confirm against the class's operate() logic,
+// which is not visible here.
+RGWOmapAppend::RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
+                             uint64_t _window_size)
+                      : RGWConsumerCR<string>(_store->ctx()), async_rados(_async_rados),
+                        store(_store), obj(_obj), going_down(false), num_pending_entries(0), window_size(_window_size), total_entries(0)
+{
+}
+
+// Take (or renew, via set_may_renew) an exclusive cls lock on the raw
+// object for duration_secs, identified by lock_name/cookie.
+int RGWAsyncLockSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  rgw_rados_ref ref;
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  rados::cls::lock::Lock l(lock_name);
+  utime_t duration(duration_secs, 0);
+  l.set_duration(duration);
+  l.set_cookie(cookie);
+  l.set_may_renew(true);
+
+  return l.lock_exclusive(&ref.pool.ioctx(), ref.obj.oid);
+}
+
+// The _objv_tracker parameter is accepted for interface symmetry but
+// not used by the lock request.
+RGWAsyncLockSystemObj::RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                      RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                       const string& _name, const string& _cookie, uint32_t _duration_secs) : RGWAsyncRadosRequest(caller, cn), store(_store),
+                                                              obj(_obj),
+                                                              lock_name(_name),
+                                                              cookie(_cookie),
+                                                              duration_secs(_duration_secs)
+{
+}
+
+// Release the cls lock identified by lock_name/cookie on the raw object.
+int RGWAsyncUnlockSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  rgw_rados_ref ref;
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  rados::cls::lock::Lock l(lock_name);
+
+  l.set_cookie(cookie);
+
+  return l.unlock(&ref.pool.ioctx(), ref.obj.oid);
+}
+
+// The _objv_tracker parameter is accepted for interface symmetry but
+// not used by the unlock request.
+RGWAsyncUnlockSystemObj::RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                                                 RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                                                 const string& _name, const string& _cookie) : RGWAsyncRadosRequest(caller, cn), store(_store),
+  obj(_obj),
+  lock_name(_name), cookie(_cookie)
+{
+}
+
+RGWRadosSetOmapKeysCR::RGWRadosSetOmapKeysCR(rgw::sal::RadosStore* _store,
+                      const rgw_raw_obj& _obj,
+                      map<string, bufferlist>& _entries) : RGWSimpleCoroutine(_store->ctx()),
+                                                store(_store),
+                                                entries(_entries),
+                                                obj(_obj), cn(NULL)
+{
+  stringstream& s = set_description();
+  s << "set omap keys dest=" << obj << " keys=[" << s.str() << "]";
+  for (auto i = entries.begin(); i != entries.end(); ++i) {
+    if (i != entries.begin()) {
+      s << ", ";
+    }
+    s << i->first;
+  }
+  s << "]";
+}
+
+// Issue the async omap_set; completion is delivered through the stack's
+// completion notifier and read back in request_complete().
+int RGWRadosSetOmapKeysCR::send_request(const DoutPrefixProvider *dpp)
+{
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "sending request";
+
+  librados::ObjectWriteOperation op;
+  op.omap_set(entries);
+
+  cn = stack->create_completion_notifier();
+  return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
+}
+
+// Collect the aio completion's return value.
+int RGWRadosSetOmapKeysCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << r;
+
+  return r;
+}
+
+// Coroutine that lists up to max_entries omap keys after `marker`.
+// The shared Result keeps the object ref alive across the async call.
+RGWRadosGetOmapKeysCR::RGWRadosGetOmapKeysCR(rgw::sal::RadosStore* _store,
+                      const rgw_raw_obj& _obj,
+                      const string& _marker,
+                      int _max_entries,
+                      ResultPtr _result)
+  : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj),
+    marker(_marker), max_entries(_max_entries),
+    result(std::move(_result))
+{
+  ceph_assert(result); // must be allocated
+  set_description() << "get omap keys dest=" << obj << " marker=" << marker;
+}
+
+// Resolve the raw object ref into the result and issue the async
+// omap_get_keys2 read; entries/more are written directly into 'result'.
+int RGWRadosGetOmapKeysCR::send_request(const DoutPrefixProvider *dpp) {
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &result->ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "send request";
+
+  librados::ObjectReadOperation op;
+  op.omap_get_keys2(marker, max_entries, &result->entries, &result->more, nullptr);
+
+  // hand 'result' to the notifier so it stays alive until the aio completes
+  cn = stack->create_completion_notifier(result);
+  return result->ref.pool.ioctx().aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL);
+}
+
+// Surface the librados return code of the omap key listing.
+int RGWRadosGetOmapKeysCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+RGWRadosGetOmapValsCR::RGWRadosGetOmapValsCR(rgw::sal::RadosStore* _store,
+                      const rgw_raw_obj& _obj,
+                      const string& _marker,
+                      int _max_entries,
+                      ResultPtr _result)
+  : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj),
+    marker(_marker), max_entries(_max_entries),
+    result(std::move(_result))
+{
+  ceph_assert(result); // must be allocated
+  set_description() << "get omap keys dest=" << obj << " marker=" << marker;
+}
+
+// Resolve the raw object ref into the result and issue the async
+// omap_get_vals2 read; entries/more are written directly into 'result'.
+int RGWRadosGetOmapValsCR::send_request(const DoutPrefixProvider *dpp) {
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &result->ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "send request";
+
+  librados::ObjectReadOperation op;
+  op.omap_get_vals2(marker, max_entries, &result->entries, &result->more, nullptr);
+
+  // hand 'result' to the notifier so it stays alive until the aio completes
+  cn = stack->create_completion_notifier(result);
+  return result->ref.pool.ioctx().aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL);
+}
+
+// Surface the librados return code of the omap value read.
+int RGWRadosGetOmapValsCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+// Coroutine that removes the given set of omap keys from a raw rados object.
+RGWRadosRemoveOmapKeysCR::RGWRadosRemoveOmapKeysCR(rgw::sal::RadosStore* _store,
+                      const rgw_raw_obj& _obj,
+                      const set<string>& _keys) : RGWSimpleCoroutine(_store->ctx()),
+                                                store(_store),
+                                                keys(_keys),
+                                                obj(_obj), cn(NULL)
+{
+  set_description() << "remove omap keys dest=" << obj << " keys=" << keys;
+}
+
+// Resolve the raw object reference and issue an async OMAP_RM_KEYS for
+// the configured key set.
+int RGWRadosRemoveOmapKeysCR::send_request(const DoutPrefixProvider *dpp) {
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "send request";
+
+  librados::ObjectWriteOperation op;
+  op.omap_rm_keys(keys);
+
+  // the notifier wakes this coroutine when the aio completes
+  cn = stack->create_completion_notifier();
+  return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
+}
+
+// Surface the librados return code of the omap key removal.
+int RGWRadosRemoveOmapKeysCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+// Coroutine that deletes a raw rados object, optionally guarded by an
+// object-version tracker (conditional write).
+RGWRadosRemoveCR::RGWRadosRemoveCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+                                   RGWObjVersionTracker* objv_tracker)
+  : RGWSimpleCoroutine(store->ctx()),
+    store(store), obj(obj), objv_tracker(objv_tracker)
+{
+  set_description() << "remove dest=" << obj;
+}
+
+int RGWRadosRemoveCR::send_request(const DoutPrefixProvider *dpp)
+{
+  auto rados = store->getRados()->get_rados_handle();
+  int r = rados->ioctx_create(obj.pool.name.c_str(), ioctx);
+  if (r < 0) {
+    lderr(cct) << "ERROR: failed to open pool (" << obj.pool.name << ") ret=" << r << dendl;
+    return r;
+  }
+  ioctx.locator_set_key(obj.loc);
+
+  set_status() << "send request";
+
+  librados::ObjectWriteOperation op;
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_write(&op);
+  }
+  op.remove();
+
+  cn = stack->create_completion_notifier();
+  return ioctx.aio_operate(obj.oid, cn->completion(), &op);
+}
+
+// Surface the librados return code of the object removal.
+int RGWRadosRemoveCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+// Remove by explicit io context + oid; takes ownership of the io context.
+RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+                                        librados::IoCtx&& ioctx,
+                                        std::string_view oid,
+                                        RGWObjVersionTracker* objv_tracker)
+  : RGWSimpleCoroutine(store->ctx()), ioctx(std::move(ioctx)),
+    oid(std::string(oid)), objv_tracker(objv_tracker)
+{
+  set_description() << "remove dest=" << oid;
+}
+
+// Remove via an existing RGWSI_RADOS::Obj reference; copies its io context
+// and oid so the coroutine does not depend on the Obj's lifetime.
+RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+                                        RGWSI_RADOS::Obj& obj,
+                                        RGWObjVersionTracker* objv_tracker)
+  : RGWSimpleCoroutine(store->ctx()),
+    ioctx(librados::IoCtx(obj.get_ref().pool.ioctx())),
+    oid(obj.get_ref().obj.oid),
+    objv_tracker(objv_tracker)
+{
+  set_description() << "remove dest=" << oid;
+}
+
+// Remove via an expiring RGWSI_RADOS::Obj; moves its io context and oid
+// out instead of copying them.
+RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+                                        RGWSI_RADOS::Obj&& obj,
+                                        RGWObjVersionTracker* objv_tracker)
+  : RGWSimpleCoroutine(store->ctx()),
+    ioctx(std::move(obj.get_ref().pool.ioctx())),
+    oid(std::move(obj.get_ref().obj.oid)),
+    objv_tracker(objv_tracker)
+{
+  set_description() << "remove dest=" << oid;
+}
+
+// Issue the async remove (with optional version guard) on the saved ioctx.
+int RGWRadosRemoveOidCR::send_request(const DoutPrefixProvider *dpp)
+{
+  librados::ObjectWriteOperation op;
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_write(&op);
+  }
+  op.remove();
+
+  cn = stack->create_completion_notifier();
+  return ioctx.aio_operate(oid, cn->completion(), &op);
+}
+
+// Surface the librados return code of the oid removal.
+int RGWRadosRemoveOidCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+// Coroutine that takes (or renews) an exclusive rados lock on 'obj' with
+// the given lock name/cookie for 'duration' seconds, via the async
+// processor thread pool.
+RGWSimpleRadosLockCR::RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                      const rgw_raw_obj& _obj,
+                      const string& _lock_name,
+                      const string& _cookie,
+                      uint32_t _duration) : RGWSimpleCoroutine(_store->ctx()),
+                                                async_rados(_async_rados),
+                                                store(_store),
+                                                lock_name(_lock_name),
+                                                cookie(_cookie),
+                                                duration(_duration),
+                                                obj(_obj),
+                                                req(NULL)
+{
+  set_description() << "rados lock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie << " duration=" << duration;
+}
+
+// Release the pending async lock request, if one is still outstanding.
+void RGWSimpleRadosLockCR::request_cleanup()
+{
+  if (!req) {
+    return;
+  }
+  req->finish();
+  req = NULL;
+}
+
+// Queue an async exclusive-lock request; the result is reported later
+// through request_complete().
+int RGWSimpleRadosLockCR::send_request(const DoutPrefixProvider *dpp)
+{
+  set_status() << "sending request";
+  req = new RGWAsyncLockSystemObj(this, stack->create_completion_notifier(),
+                                 store, NULL, obj, lock_name, cookie, duration);
+  async_rados->queue(req);
+  return 0;
+}
+
+// Propagate the status recorded by the async lock request.
+int RGWSimpleRadosLockCR::request_complete()
+{
+  const int ret = req->get_ret_status();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+// Coroutine that releases a previously taken rados lock identified by
+// lock name + cookie, via the async processor thread pool.
+RGWSimpleRadosUnlockCR::RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                      const rgw_raw_obj& _obj,
+                      const string& _lock_name,
+                      const string& _cookie) : RGWSimpleCoroutine(_store->ctx()),
+                                                async_rados(_async_rados),
+                                                store(_store),
+                                                lock_name(_lock_name),
+                                                cookie(_cookie),
+                                                obj(_obj),
+                                                req(NULL)
+{
+  set_description() << "rados unlock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie;
+}
+
+// Release the pending async unlock request, if one is still outstanding.
+void RGWSimpleRadosUnlockCR::request_cleanup()
+{
+  if (!req) {
+    return;
+  }
+  req->finish();
+  req = NULL;
+}
+
+// Queue an async unlock request; the result is reported later through
+// request_complete().
+int RGWSimpleRadosUnlockCR::send_request(const DoutPrefixProvider *dpp)
+{
+  set_status() << "sending request";
+
+  req = new RGWAsyncUnlockSystemObj(this, stack->create_completion_notifier(),
+                                 store, NULL, obj, lock_name, cookie);
+  async_rados->queue(req);
+  return 0;
+}
+
+// Propagate the status recorded by the async unlock request.
+int RGWSimpleRadosUnlockCR::request_complete()
+{
+  const int ret = req->get_ret_status();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+// Producer/consumer coroutine: consume queued entries, batch them into
+// 'entries', and flush each full window to the omap object.  Uses the
+// boost asio coroutine reenter() macro, so statement order is significant.
+int RGWOmapAppend::operate(const DoutPrefixProvider *dpp) {
+  reenter(this) {
+    for (;;) {
+      // exit only once shutdown was requested and the queue is drained
+      if (!has_product() && going_down) {
+        set_status() << "going down";
+        break;
+      }
+      set_status() << "waiting for product";
+      yield wait_for_product();
+      yield {
+        string entry;
+        while (consume(&entry)) {
+          set_status() << "adding entry: " << entry;
+          entries[entry] = bufferlist();
+          if (entries.size() >= window_size) {
+            break;
+          }
+        }
+        // flush a full window, or whatever remains during shutdown
+        if (entries.size() >= window_size || going_down) {
+          set_status() << "flushing to omap";
+          call(new RGWRadosSetOmapKeysCR(store, obj, entries));
+          entries.clear();
+        }
+      }
+      if (get_ret_status() < 0) {
+        ldout(cct, 0) << "ERROR: failed to store entries in omap" << dendl;
+        return set_state(RGWCoroutine_Error);
+      }
+    }
+    /* done with coroutine */
+    return set_state(RGWCoroutine_Done);
+  }
+  return 0;
+}
+
+// Hand all locally buffered entries over to the coroutine's product queue
+// and reset the pending counter.
+void RGWOmapAppend::flush_pending() {
+  receive(pending_entries);
+  num_pending_entries = 0;
+}
+
+// Queue one entry for the omap append, flushing once the pending batch
+// reaches the configured window size.  Returns false when the coroutine
+// has already finished and can no longer accept entries.
+bool RGWOmapAppend::append(const string& s) {
+  if (is_done()) {
+    return false;
+  }
+  total_entries++;
+  pending_entries.push_back(s);
+  num_pending_entries++;
+  if (num_pending_entries >= (int)window_size) {
+    flush_pending();
+  }
+  return true;
+}
+
+// Signal shutdown: push any remaining buffered entries, wake the
+// coroutine, and report whether it still has work to complete.
+bool RGWOmapAppend::finish() {
+  going_down = true;
+  flush_pending();
+  set_sleeping(false);
+  return (!is_done());
+}
+
+// Fetch bucket instance info (and attrs): directly by instance id when the
+// bucket has one, otherwise through the bucket-ctl entrypoint lookup.
+int RGWAsyncGetBucketInstanceInfo::_send_request(const DoutPrefixProvider *dpp)
+{
+  int r;
+  if (!bucket.bucket_id.empty()) {
+    r = store->getRados()->get_bucket_instance_info(bucket, bucket_info, nullptr, &attrs, null_yield, dpp);
+  } else {
+    r = store->ctl()->bucket->read_bucket_info(bucket, &bucket_info, null_yield, dpp,
+                                               RGWBucketCtl::BucketInstance::GetParams().set_attrs(&attrs));
+  }
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to get bucket instance info for "
+        << bucket << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Persist the bucket instance info (with attrs/mtime, optionally
+// exclusive-create) through the rados layer.
+int RGWAsyncPutBucketInstanceInfo::_send_request(const DoutPrefixProvider *dpp)
+{
+  auto r = store->getRados()->put_bucket_instance_info(bucket_info, exclusive,
+                                                      mtime, attrs, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to put bucket instance info for "
+                     << bucket_info.bucket << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Coroutine that trims one bucket index log shard between two markers.
+// Both markers are normalized to their per-shard form up front.
+RGWRadosBILogTrimCR::RGWRadosBILogTrimCR(
+  const DoutPrefixProvider *dpp,
+  rgw::sal::RadosStore* store,
+  const RGWBucketInfo& bucket_info,
+  int shard_id,
+  const rgw::bucket_index_layout_generation& generation,
+  const std::string& start_marker,
+  const std::string& end_marker)
+  : RGWSimpleCoroutine(store->ctx()), bucket_info(bucket_info),
+    shard_id(shard_id), generation(generation), bs(store->getRados()),
+    start_marker(BucketIndexShardsManager::get_shard_marker(start_marker)),
+    end_marker(BucketIndexShardsManager::get_shard_marker(end_marker))
+{
+}
+
+// Initialize the bucket shard, encode the cls bi-log trim op with the
+// saved markers, and execute it asynchronously on the shard object.
+int RGWRadosBILogTrimCR::send_request(const DoutPrefixProvider *dpp)
+{
+  int r = bs.init(dpp, bucket_info, generation, shard_id);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: bucket shard init failed ret=" << r << dendl;
+    return r;
+  }
+
+  bufferlist in;
+  cls_rgw_bi_log_trim_op call;
+  // markers are consumed here; send_request is a one-shot operation
+  call.start_marker = std::move(start_marker);
+  call.end_marker = std::move(end_marker);
+  encode(call, in);
+
+  librados::ObjectWriteOperation op;
+  op.exec(RGW_CLASS, RGW_BI_LOG_TRIM, in);
+
+  cn = stack->create_completion_notifier();
+  return bs.bucket_obj.aio_operate(cn->completion(), &op);
+}
+
+// Surface the result of the async bilog trim operation.
+int RGWRadosBILogTrimCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << ret;
+
+  return ret;
+}
+
+// Fetch a single object from a remote zone into the local store.  On a
+// successful transfer (bytes_transferred set) it also fires an
+// ObjectSyncedCreate bucket notification; sync perf counters are updated
+// in both the success and failure paths.
+int RGWAsyncFetchRemoteObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  RGWObjectCtx obj_ctx(store);
+
+  // NOTE(review): buf is formatted but never read afterwards — looks like
+  // dead code left over from an earlier tagging scheme; confirm and remove.
+  char buf[16];
+  snprintf(buf, sizeof(buf), ".%lld", (long long)store->getRados()->instance_id());
+  rgw::sal::Attrs attrs;
+
+  rgw::sal::RadosBucket bucket(store, src_bucket);
+  rgw::sal::RadosObject src_obj(store, key, &bucket);
+  rgw::sal::RadosBucket dest_bucket(store, dest_bucket_info);
+  rgw::sal::RadosObject dest_obj(store, dest_key.value_or(key), &dest_bucket);
+
+  std::string etag;
+
+  std::optional<uint64_t> bytes_transferred;
+  int r = store->getRados()->fetch_remote_obj(obj_ctx,
+                       user_id.value_or(rgw_user()),
+                       NULL, /* req_info */
+                       source_zone,
+                       &dest_obj,
+                       &src_obj,
+                       &dest_bucket, /* dest */
+                       nullptr, /* source */
+                       dest_placement_rule,
+                       nullptr, /* real_time* src_mtime, */
+                       NULL, /* real_time* mtime, */
+                       NULL, /* const real_time* mod_ptr, */
+                       NULL, /* const real_time* unmod_ptr, */
+                       false, /* high precision time */
+                       NULL, /* const char *if_match, */
+                       NULL, /* const char *if_nomatch, */
+                       RGWRados::ATTRSMOD_NONE,
+                       copy_if_newer,
+                       attrs,
+                       RGWObjCategory::Main,
+                       versioned_epoch,
+                       real_time(), /* delete_at */
+                       NULL, /* string *ptag, */
+                       &etag, /* string *petag, */
+                       NULL, /* void (*progress_cb)(off_t, void *), */
+                       NULL, /* void *progress_data*); */
+                       dpp,
+                       filter.get(),
+                       &zones_trace,
+                       &bytes_transferred);
+
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "store->fetch_remote_obj() returned r=" << r << dendl;
+    if (counters) {
+      counters->inc(sync_counters::l_fetch_err, 1);
+    }
+  } else {
+      // r >= 0
+      if (bytes_transferred) {
+        // send notification that object was successfully synced
+        std::string user_id = "rgw sync";
+        std::string req_id = "0";
+
+        // recover the object's tag set (if any) so it can be attached to
+        // the notification
+        RGWObjTags obj_tags;
+        auto iter = attrs.find(RGW_ATTR_TAGS);
+        if (iter != attrs.end()) {
+          try {
+            auto it = iter->second.cbegin();
+            obj_tags.decode(it);
+          } catch (buffer::error &err) {
+            ldpp_dout(dpp, 1) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
+          }
+        }
+
+        // NOTE: we create a mutable copy of bucket.get_tenant as the get_notification function expects a std::string&, not const
+        std::string tenant(dest_bucket.get_tenant());
+
+        std::unique_ptr<rgw::sal::Notification> notify
+                 = store->get_notification(dpp, &dest_obj, nullptr, rgw::notify::ObjectSyncedCreate,
+                  &dest_bucket, user_id,
+                  tenant,
+                  req_id, null_yield);
+
+        auto notify_res = static_cast<rgw::sal::RadosNotification*>(notify.get())->get_reservation();
+        int ret = rgw::notify::publish_reserve(dpp, rgw::notify::ObjectSyncedCreate, notify_res, &obj_tags);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << "ERROR: reserving notification failed, with error: " << ret << dendl;
+          // no need to return, the sync already happened
+        } else {
+          ret = rgw::notify::publish_commit(&dest_obj, dest_obj.get_obj_size(), ceph::real_clock::now(), etag, dest_obj.get_instance(), rgw::notify::ObjectSyncedCreate, notify_res, dpp);
+          if (ret < 0) {
+            ldpp_dout(dpp, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
+          }
+        }
+      }
+
+      // count transferred bytes, or a not-modified fetch (copy_if_newer)
+      if (counters) {
+        if (bytes_transferred) {
+          counters->inc(sync_counters::l_fetch, *bytes_transferred);
+        } else {
+          counters->inc(sync_counters::l_fetch_not_modified);
+        }
+      }
+  }
+  return r;
+}
+
+// Stat an object on a remote zone (mtime/size/attrs/headers/etag) without
+// fetching its data.
+int RGWAsyncStatRemoteObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  RGWObjectCtx obj_ctx(store);
+
+  string user_id;
+  // NOTE(review): buf is formatted but never read afterwards — looks like
+  // dead code copied from the fetch path; confirm and remove.
+  char buf[16];
+  snprintf(buf, sizeof(buf), ".%lld", (long long)store->getRados()->instance_id());
+
+  rgw::sal::RadosBucket bucket(store, src_bucket);
+  rgw::sal::RadosObject src_obj(store, key, &bucket);
+
+  int r = store->getRados()->stat_remote_obj(dpp,
+                       obj_ctx,
+                       rgw_user(user_id),
+                       nullptr, /* req_info */
+                       source_zone,
+                       &src_obj,
+                       nullptr, /* source */
+                       pmtime, /* real_time* src_mtime, */
+                       psize, /* uint64_t * */
+                       nullptr, /* const real_time* mod_ptr, */
+                       nullptr, /* const real_time* unmod_ptr, */
+                       true, /* high precision time */
+                       nullptr, /* const char *if_match, */
+                       nullptr, /* const char *if_nomatch, */
+                       pattrs,
+                       pheaders,
+                       nullptr,
+                       nullptr, /* string *ptag, */
+                       petag); /* string *petag, */
+
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "store->stat_remote_obj() returned r=" << r << dendl;
+  }
+  return r;
+}
+
+
+// Delete an object as part of sync: skip the delete if a newer local write
+// raced us (del_if_older), then issue a delete that carries the remote
+// owner/mtime/versioning metadata and the zones trace.
+int RGWAsyncRemoveObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  ldpp_dout(dpp, 0) << __func__ << "(): deleting obj=" << obj << dendl;
+
+  obj->set_atomic();
+
+  RGWObjState *state;
+
+  int ret = obj->get_obj_state(dpp, &state, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, 20) << __func__ << "(): get_obj_state() obj=" << obj << " returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  /* has there been any racing object write? */
+  if (del_if_older && (state->mtime > timestamp)) {
+    ldpp_dout(dpp, 20) << __func__ << "(): skipping object removal obj=" << obj << " (obj mtime=" << state->mtime << ", request timestamp=" << timestamp << ")" << dendl;
+    return 0;
+  }
+
+  RGWAccessControlPolicy policy;
+
+  /* decode policy */
+  map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_ACL);
+  if (iter != state->attrset.end()) {
+    auto bliter = iter->second.cbegin();
+    try {
+      policy.decode(bliter);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+      return -EIO;
+    }
+  }
+
+  std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = obj->get_delete_op();
+
+  del_op->params.bucket_owner = bucket->get_info().owner;
+  del_op->params.obj_owner = policy.get_owner();
+  if (del_if_older) {
+    // conditional delete: only if the object was not modified since
+    del_op->params.unmod_since = timestamp;
+  }
+  if (versioned) {
+    del_op->params.versioning_status = BUCKET_VERSIONED;
+  }
+  del_op->params.olh_epoch = versioned_epoch;
+  del_op->params.marker_version_id = marker_version_id;
+  // record the remote owner on the delete marker / log entry
+  del_op->params.obj_owner.set_id(rgw_user(owner));
+  del_op->params.obj_owner.set_name(owner_display_name);
+  del_op->params.mtime = timestamp;
+  del_op->params.high_precision_time = true;
+  del_op->params.zones_trace = &zones_trace;
+
+  ret = del_op->delete_obj(dpp, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, 20) << __func__ << "(): delete_obj() obj=" << obj << " returned ret=" << ret << dendl;
+  }
+  return ret;
+}
+
+int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp)
+{
+  if (aborted) {
+    caller->set_sleeping(false);
+    return set_cr_done();
+  }
+  reenter(this) {
+    last_renew_try_time = ceph::coarse_mono_clock::now();
+    while (!going_down) {
+      yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval));
+      current_time = ceph::coarse_mono_clock::now();
+      if (current_time - last_renew_try_time > interval_tolerance) {
+        // renewal should happen between 50%-90% of interval
+        ldout(store->ctx(), 1) << *this << ": WARNING: did not renew lock " << obj << ":" << lock_name << ": within 90\% of interval. " << 
+          (current_time - last_renew_try_time) << " > " << interval_tolerance << dendl;
+      }
+      last_renew_try_time = current_time;
+
+      caller->set_sleeping(false); /* will only be relevant when we return, that's why we can do it early */
+      if (retcode < 0) {
+        set_locked(false);
+        ldout(store->ctx(), 20) << *this << ": couldn't lock " << obj << ":" << lock_name << ": retcode=" << retcode << dendl;
+        return set_state(RGWCoroutine_Error, retcode);
+      }
+      ldout(store->ctx(), 20) << *this << ": successfully locked " << obj << ":" << lock_name << dendl;
+      set_locked(true);
+      yield wait(utime_t(interval / 2, 0));
+    }
+    set_locked(false); /* moot at this point anyway */
+    yield call(new RGWSimpleRadosUnlockCR(async_rados, store, obj, lock_name, cookie));
+    return set_state(RGWCoroutine_Done);
+  }
+  return 0;
+}
+
+RGWRadosTimelogAddCR::RGWRadosTimelogAddCR(const DoutPrefixProvider *_dpp, rgw::sal::RadosStore* _store, const string& _oid,
+                      const cls_log_entry& entry) : RGWSimpleCoroutine(_store->ctx()),
+                                                dpp(_dpp),
+                                                store(_store),
+                                                oid(_oid), cn(NULL)
+{
+  stringstream& s = set_description();
+  s << "timelog add entry oid=" <<  oid << "entry={id=" << entry.id << ", section=" << entry.section << ", name=" << entry.name << "}";
+  entries.push_back(entry);
+}
+
+// Issue the async cls timelog add for the buffered entries.
+int RGWRadosTimelogAddCR::send_request(const DoutPrefixProvider *dpp)
+{
+  set_status() << "sending request";
+
+  cn = stack->create_completion_notifier();
+  return store->svc()->cls->timelog.add(dpp, oid, entries, cn->completion(), true, null_yield);
+}
+
+// Surface the result of the async timelog add.
+int RGWRadosTimelogAddCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+// Coroutine that trims a cls timelog object, bounded either by a time
+// range or by a marker range.
+RGWRadosTimelogTrimCR::RGWRadosTimelogTrimCR(const DoutPrefixProvider *dpp,
+                                             rgw::sal::RadosStore* store,
+                                             const std::string& oid,
+                                             const real_time& start_time,
+                                             const real_time& end_time,
+                                             const std::string& from_marker,
+                                             const std::string& to_marker)
+  : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), oid(oid),
+    start_time(start_time), end_time(end_time),
+    from_marker(from_marker), to_marker(to_marker)
+{
+  set_description() << "timelog trim oid=" <<  oid
+      << " start_time=" << start_time << " end_time=" << end_time
+      << " from_marker=" << from_marker << " to_marker=" << to_marker;
+}
+
+// Issue the async cls timelog trim with the configured bounds.
+int RGWRadosTimelogTrimCR::send_request(const DoutPrefixProvider *dpp)
+{
+  set_status() << "sending request";
+
+  cn = stack->create_completion_notifier();
+  return store->svc()->cls->timelog.trim(dpp, oid, start_time, end_time, from_marker,
+                                      to_marker, cn->completion(),
+                                      null_yield);
+}
+
+// Surface the result of the async timelog trim.
+int RGWRadosTimelogTrimCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+
+// Trim a sync log (timelog) strictly by marker, up to 'to_marker'; on a
+// full trim, request_complete() records progress into *last_trim_marker.
+RGWSyncLogTrimCR::RGWSyncLogTrimCR(const DoutPrefixProvider *dpp,
+                                   rgw::sal::RadosStore* store, const std::string& oid,
+                                   const std::string& to_marker,
+                                   std::string *last_trim_marker)
+  : RGWRadosTimelogTrimCR(dpp, store, oid, real_time{}, real_time{},
+                          std::string{}, to_marker),
+    cct(store->ctx()), last_trim_marker(last_trim_marker)
+{
+}
+
+// Treat -ENODATA ("nothing left to trim") as success, advancing the
+// caller's last_trim_marker unless it would record the max_marker sentinel.
+int RGWSyncLogTrimCR::request_complete()
+{
+  int r = RGWRadosTimelogTrimCR::request_complete();
+  if (r != -ENODATA) {
+    return r;
+  }
+  // nothing left to trim, update last_trim_marker
+  if (*last_trim_marker < to_marker && to_marker != max_marker) {
+    *last_trim_marker = to_marker;
+  }
+  return 0;
+}
+
+
+// Map the bucket object to its raw rados object and stat it synchronously
+// (size/mtime/epoch, plus optional version-tracker refresh).
+int RGWAsyncStatObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  rgw_raw_obj raw_obj;
+  store->getRados()->obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
+  return store->getRados()->raw_obj_stat(dpp, raw_obj, psize, pmtime, pepoch,
+                             nullptr, nullptr, objv_tracker, null_yield);
+}
+
+// Coroutine wrapper around RGWAsyncStatObj; results are written through
+// the caller-supplied psize/pmtime/pepoch/objv_tracker pointers.
+RGWStatObjCR::RGWStatObjCR(const DoutPrefixProvider *dpp,
+                           RGWAsyncRadosProcessor *async_rados, rgw::sal::RadosStore* store,
+                           const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize,
+                           real_time* pmtime, uint64_t *pepoch,
+                           RGWObjVersionTracker *objv_tracker)
+  : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), async_rados(async_rados),
+    bucket_info(_bucket_info), obj(obj), psize(psize), pmtime(pmtime), pepoch(pepoch),
+    objv_tracker(objv_tracker)
+{
+}
+
+// Release the pending async stat request, if one is still outstanding.
+void RGWStatObjCR::request_cleanup()
+{
+  if (!req) {
+    return;
+  }
+  req->finish();
+  req = NULL;
+}
+
+// Queue the async stat request on the processor thread pool.
+int RGWStatObjCR::send_request(const DoutPrefixProvider *dpp)
+{
+  req = new RGWAsyncStatObj(dpp, this, stack->create_completion_notifier(),
+                            store, bucket_info, obj, psize, pmtime, pepoch, objv_tracker);
+  async_rados->queue(req);
+  return 0;
+}
+
+// Propagate the status recorded by the async stat request.
+int RGWStatObjCR::request_complete()
+{
+  const int ret = req->get_ret_status();
+  return ret;
+}
+
+// Coroutine that sends a rados watch/notify message to a raw object and
+// collects the (optional) reply payload.
+RGWRadosNotifyCR::RGWRadosNotifyCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+                                   bufferlist& request, uint64_t timeout_ms,
+                                   bufferlist *response)
+  : RGWSimpleCoroutine(store->ctx()), store(store), obj(obj),
+    request(request), timeout_ms(timeout_ms), response(response)
+{
+  set_description() << "notify dest=" << obj;
+}
+
+// Resolve the raw object reference and issue the async notify with the
+// configured timeout; replies are written into 'response'.
+int RGWRadosNotifyCR::send_request(const DoutPrefixProvider *dpp)
+{
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "sending request";
+
+  cn = stack->create_completion_notifier();
+  return ref.pool.ioctx().aio_notify(ref.obj.oid, cn->completion(), request,
+                              timeout_ms, response);
+}
+
+// Surface the result of the async notify.
+int RGWRadosNotifyCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+
+// Push a data-changes notification to a peer zone over REST: try the v2
+// "notify2" endpoint first, and fall back to the v1 "notify" encoding when
+// the peer answers ERR_METHOD_NOT_ALLOWED (older gateway).  Uses the asio
+// reenter() coroutine macro.
+int RGWDataPostNotifyCR::operate(const DoutPrefixProvider* dpp)
+{
+  reenter(this) {
+    using PostNotify2 = RGWPostRESTResourceCR<bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>, int>;
+    yield {
+      rgw_http_param_pair pairs[] = { { "type", "data" },
+                                      { "notify2", NULL },
+                                      { "source-zone", source_zone },
+                                      { NULL, NULL } };
+      call(new PostNotify2(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, nullptr));
+    }
+    // older peers don't know "notify2"; retry with the v1 wire format
+    if (retcode == -ERR_METHOD_NOT_ALLOWED) {
+      using PostNotify1 = RGWPostRESTResourceCR<rgw_data_notify_v1_encoder, int>;
+      yield {
+        rgw_http_param_pair pairs[] = { { "type", "data" },
+                                        { "notify", NULL },
+                                        { "source-zone", source_zone },
+                                        { NULL, NULL } };
+        auto encoder = rgw_data_notify_v1_encoder{shards};
+        call(new PostNotify1(store->ctx(), conn, &http_manager, "/admin/log", pairs, encoder, nullptr));
+      }
+    }
+    if (retcode < 0) {
+      return set_cr_error(retcode);
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
diff --git a/src/rgw/driver/rados/rgw_cr_rados.h b/src/rgw/driver/rados/rgw_cr_rados.h
new file mode 100644 (file)
index 0000000..03c5303
--- /dev/null
@@ -0,0 +1,1595 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_CR_RADOS_H
+#define CEPH_RGW_CR_RADOS_H
+
+#include <boost/intrusive_ptr.hpp>
+#include "include/ceph_assert.h"
+#include "rgw_coroutine.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "common/WorkQueue.h"
+#include "common/Throttle.h"
+
+#include <atomic>
+#include "common/ceph_time.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_bucket.h"
+
+struct rgw_http_param_pair;
+class RGWRESTConn;
+
+// Base class for work items executed on the RGWAsyncRadosProcessor thread
+// pool.  Refcounted (RefCountedObject); an RGWAioCompletionNotifier wakes
+// the owning coroutine once _send_request() has run.  The mutex guards the
+// race between a worker completing the request and the caller abandoning
+// it via finish().
+class RGWAsyncRadosRequest : public RefCountedObject {
+  RGWCoroutine *caller;
+  RGWAioCompletionNotifier *notifier;
+
+  int retcode;
+
+  ceph::mutex lock = ceph::make_mutex("RGWAsyncRadosRequest::lock");
+
+protected:
+  // Subclasses implement the actual (potentially blocking) work here.
+  virtual int _send_request(const DoutPrefixProvider *dpp) = 0;
+public:
+  RGWAsyncRadosRequest(RGWCoroutine *_caller, RGWAioCompletionNotifier *_cn)
+    : caller(_caller), notifier(_cn), retcode(0) {
+  }
+  ~RGWAsyncRadosRequest() override {
+    if (notifier) {
+      notifier->put();
+    }
+  }
+
+  // Invoked by a pool worker: run the request, then fire the completion
+  // notifier (unless finish() already detached it).
+  void send_request(const DoutPrefixProvider *dpp) {
+    get();
+    retcode = _send_request(dpp);
+    {
+      std::lock_guard l{lock};
+      if (notifier) {
+        notifier->cb(); // drops its own ref
+        notifier = nullptr;
+      }
+    }
+    put();
+  }
+
+  int get_ret_status() { return retcode; }
+
+  // Detach from the caller (used during coroutine cleanup) without ever
+  // firing the completion callback; drops this object's caller-held ref.
+  void finish() {
+    {
+      std::lock_guard l{lock};
+      if (notifier) {
+        // we won't call notifier->cb() to drop its ref, so drop it here
+        notifier->put();
+        notifier = nullptr;
+      }
+    }
+    put();
+  }
+};
+
+
+// Thread pool that drains a queue of RGWAsyncRadosRequests, keeping
+// blocking rados calls off the coroutine threads.  The nested RGWWQ is the
+// ThreadPool work queue adapter over m_req_queue.
+class RGWAsyncRadosProcessor {
+  std::deque<RGWAsyncRadosRequest *> m_req_queue;
+  std::atomic<bool> going_down = { false };
+protected:
+  CephContext *cct;
+  ThreadPool m_tp;
+  Throttle req_throttle;
+
+  struct RGWWQ : public DoutPrefixProvider, public ThreadPool::WorkQueue<RGWAsyncRadosRequest> {
+    RGWAsyncRadosProcessor *processor;
+    RGWWQ(RGWAsyncRadosProcessor *p,
+         ceph::timespan timeout, ceph::timespan suicide_timeout,
+         ThreadPool *tp)
+      : ThreadPool::WorkQueue<RGWAsyncRadosRequest>("RGWWQ", timeout, suicide_timeout, tp), processor(p) {}
+
+    bool _enqueue(RGWAsyncRadosRequest *req) override;
+    // Targeted dequeue of a specific request is not supported.
+    void _dequeue(RGWAsyncRadosRequest *req) override {
+      ceph_abort();
+    }
+    bool _empty() override;
+    RGWAsyncRadosRequest *_dequeue() override;
+    using ThreadPool::WorkQueue<RGWAsyncRadosRequest>::_process;
+    void _process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) override;
+    void _dump_queue();
+    void _clear() override {
+      ceph_assert(processor->m_req_queue.empty());
+    }
+
+  // DoutPrefixProvider interface for log-line prefixes.
+  CephContext *get_cct() const { return processor->cct; }
+  unsigned get_subsys() const { return ceph_subsys_rgw; }
+  std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw async rados processor: ";}
+
+  } req_wq;
+
+public:
+  RGWAsyncRadosProcessor(CephContext *_cct, int num_threads);
+  ~RGWAsyncRadosProcessor() {}
+  void start();
+  void stop();
+  void handle_request(const DoutPrefixProvider *dpp, RGWAsyncRadosRequest *req);
+  void queue(RGWAsyncRadosRequest *req);
+
+  bool is_going_down() {
+    return going_down;
+  }
+
+};
+
+// Coroutine wrapper that runs a write-only operation (parameterized by the
+// params type P) on the async-rados thread pool.  The nested Request's
+// _send_request() is specialized per P elsewhere; no result is returned
+// beyond the status code.
+template <class P>
+class RGWSimpleWriteOnlyAsyncCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+
+  P params;
+  const DoutPrefixProvider *dpp;
+
+  class Request : public RGWAsyncRadosRequest {
+    rgw::sal::RadosStore* store;
+    P params;
+    const DoutPrefixProvider *dpp;
+  protected:
+    int _send_request(const DoutPrefixProvider *dpp) override;
+  public:
+    Request(RGWCoroutine *caller,
+            RGWAioCompletionNotifier *cn,
+            rgw::sal::RadosStore* store,
+            const P& _params,
+            const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn),
+                                store(store),
+                                params(_params),
+                                dpp(dpp) {}
+  } *req{nullptr};
+
+ public:
+  RGWSimpleWriteOnlyAsyncCR(RGWAsyncRadosProcessor *_async_rados,
+                           rgw::sal::RadosStore* _store,
+                           const P& _params,
+                            const DoutPrefixProvider *_dpp) : RGWSimpleCoroutine(_store->ctx()),
+                                                async_rados(_async_rados),
+                                                store(_store),
+                                               params(_params),
+                                                dpp(_dpp) {}
+
+  ~RGWSimpleWriteOnlyAsyncCR() override {
+    request_cleanup();
+  }
+  // Detach the in-flight request so its completion never fires into a
+  // destroyed coroutine.
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new Request(this,
+                      stack->create_completion_notifier(),
+                      store,
+                      params,
+                      dpp);
+
+    async_rados->queue(req);
+    return 0;
+  }
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+
+// Like RGWSimpleWriteOnlyAsyncCR, but the operation also produces a result
+// of type R, shared with the caller through a shared_ptr so it survives the
+// asynchronous hand-off.
+template <class P, class R>
+class RGWSimpleAsyncCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+
+  P params;
+  std::shared_ptr<R> result;
+  const DoutPrefixProvider *dpp;
+
+  class Request : public RGWAsyncRadosRequest {
+    rgw::sal::RadosStore* store;
+    P params;
+    std::shared_ptr<R> result;
+    const DoutPrefixProvider *dpp;
+  protected:
+    int _send_request(const DoutPrefixProvider *dpp) override;
+  public:
+    // NOTE(review): the leading `dpp` parameter is not stored; only `_dpp`
+    // is.  Harmless, but looks like a leftover -- confirm upstream.
+    Request(const DoutPrefixProvider *dpp,
+            RGWCoroutine *caller,
+            RGWAioCompletionNotifier *cn,
+            rgw::sal::RadosStore* _store,
+            const P& _params,
+            std::shared_ptr<R>& _result,
+            const DoutPrefixProvider *_dpp) : RGWAsyncRadosRequest(caller, cn),
+                                           store(_store),
+                                           params(_params),
+                                           result(_result),
+                                           dpp(_dpp) {}
+  } *req{nullptr};
+
+ public:
+  RGWSimpleAsyncCR(RGWAsyncRadosProcessor *_async_rados,
+                   rgw::sal::RadosStore* _store,
+                   const P& _params,
+                   std::shared_ptr<R>& _result,
+                   const DoutPrefixProvider *_dpp) : RGWSimpleCoroutine(_store->ctx()),
+                                                  async_rados(_async_rados),
+                                                  store(_store),
+                                                  params(_params),
+                                                  result(_result),
+                                                  dpp(_dpp) {}
+
+  ~RGWSimpleAsyncCR() override {
+    request_cleanup();
+  }
+  // Detach the in-flight request so its completion never fires into a
+  // destroyed coroutine.
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new Request(dpp,
+                      this,
+                      stack->create_completion_notifier(),
+                      store,
+                      params,
+                      result,
+                      dpp);
+
+    async_rados->queue(req);
+    return 0;
+  }
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+// Runs an arbitrary caller-supplied Action::operate() on the async-rados
+// thread pool.  The templated constructor accepts any shared_ptr whose
+// pointee derives from Action.
+class RGWGenericAsyncCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+
+
+public:
+  class Action {
+  public:
+    virtual ~Action() {}
+    virtual int operate() = 0;
+  };
+
+private:
+  std::shared_ptr<Action> action;
+
+  class Request : public RGWAsyncRadosRequest {
+    std::shared_ptr<Action> action;
+  protected:
+    // A null action is treated as a successful no-op.
+    int _send_request(const DoutPrefixProvider *dpp) override {
+      if (!action) {
+       return 0;
+      }
+      return action->operate();
+    }
+  public:
+    Request(const DoutPrefixProvider *dpp,
+            RGWCoroutine *caller,
+            RGWAioCompletionNotifier *cn,
+            std::shared_ptr<Action>& _action) : RGWAsyncRadosRequest(caller, cn),
+                                           action(_action) {}
+  } *req{nullptr};
+
+ public:
+  RGWGenericAsyncCR(CephContext *_cct,
+                   RGWAsyncRadosProcessor *_async_rados,
+                   std::shared_ptr<Action>& _action) : RGWSimpleCoroutine(_cct),
+                                                  async_rados(_async_rados),
+                                                  action(_action) {}
+  template<typename T>
+  RGWGenericAsyncCR(CephContext *_cct,
+                   RGWAsyncRadosProcessor *_async_rados,
+                   std::shared_ptr<T>& _action) : RGWSimpleCoroutine(_cct),
+                                                  async_rados(_async_rados),
+                                                  action(std::static_pointer_cast<Action>(_action)) {}
+
+  ~RGWGenericAsyncCR() override {
+    request_cleanup();
+  }
+  // Detach the in-flight request so its completion never fires into a
+  // destroyed coroutine.
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new Request(dpp, this,
+                      stack->create_completion_notifier(),
+                      action);
+
+    async_rados->queue(req);
+    return 0;
+  }
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+
+// Async request that reads a system object (data, and optionally attrs)
+// via RGWSI_SysObj.  Results land in the public bl / attrs / objv_tracker
+// members for the calling coroutine to consume.
+class RGWAsyncGetSystemObj : public RGWAsyncRadosRequest {
+  const DoutPrefixProvider *dpp;
+  RGWSI_SysObj* svc_sysobj;
+  rgw_raw_obj obj;
+  const bool want_attrs;
+  const bool raw_attrs;
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncGetSystemObj(const DoutPrefixProvider *dpp, 
+                       RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+                       RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                       bool want_attrs, bool raw_attrs);
+
+  bufferlist bl;
+  std::map<std::string, bufferlist> attrs;
+  RGWObjVersionTracker objv_tracker;
+};
+
+// Async request that writes a system object's data via RGWSI_SysObj; the
+// updated version tracker is exposed for the caller to copy back.
+class RGWAsyncPutSystemObj : public RGWAsyncRadosRequest {
+  const DoutPrefixProvider *dpp;
+  RGWSI_SysObj *svc;
+  rgw_raw_obj obj;
+  bool exclusive;
+  bufferlist bl;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncPutSystemObj(const DoutPrefixProvider *dpp, RGWCoroutine *caller, 
+                       RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+                       RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                       bool _exclusive, bufferlist _bl);
+
+  RGWObjVersionTracker objv_tracker;
+};
+
+// Async request that writes a system object's xattrs via RGWSI_SysObj.
+class RGWAsyncPutSystemObjAttrs : public RGWAsyncRadosRequest {
+  const DoutPrefixProvider *dpp;
+  RGWSI_SysObj *svc;
+  rgw_raw_obj obj;
+  std::map<std::string, bufferlist> attrs;
+  bool exclusive;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncPutSystemObjAttrs(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+                           RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                           std::map<std::string, bufferlist> _attrs, bool exclusive);
+
+  RGWObjVersionTracker objv_tracker;
+};
+
+// Async request that takes a timed advisory lock (name + cookie, expiring
+// after duration_secs) on a raw rados object.
+class RGWAsyncLockSystemObj : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  rgw_raw_obj obj;
+  std::string lock_name;
+  std::string cookie;
+  uint32_t duration_secs;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                        RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                       const std::string& _name, const std::string& _cookie, uint32_t _duration_secs);
+};
+
+// Async request that releases the advisory lock identified by lock name
+// and cookie on a raw rados object.
+class RGWAsyncUnlockSystemObj : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  rgw_raw_obj obj;
+  std::string lock_name;
+  std::string cookie;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                        RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                       const std::string& _name, const std::string& _cookie);
+};
+
+// Coroutine that reads and decodes a system object into *result.  With
+// empty_on_enoent (the default) a missing object yields a default-
+// constructed T instead of an error.  handle_data() is a hook for
+// subclasses to post-process the decoded value.
+template <class T>
+class RGWSimpleRadosReadCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  RGWAsyncRadosProcessor *async_rados;
+  RGWSI_SysObj *svc;
+
+  rgw_raw_obj obj;
+  T *result;
+  /// on ENOENT, call handle_data() with an empty object instead of failing
+  const bool empty_on_enoent;
+  RGWObjVersionTracker *objv_tracker;
+  RGWAsyncGetSystemObj *req{nullptr};
+
+public:
+  RGWSimpleRadosReadCR(const DoutPrefixProvider *_dpp, 
+                      RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc,
+                     const rgw_raw_obj& _obj,
+                     T *_result, bool empty_on_enoent = true,
+                     RGWObjVersionTracker *objv_tracker = nullptr)
+    : RGWSimpleCoroutine(_svc->ctx()), dpp(_dpp), async_rados(_async_rados), svc(_svc),
+      obj(_obj), result(_result),
+      empty_on_enoent(empty_on_enoent), objv_tracker(objv_tracker) {}
+  ~RGWSimpleRadosReadCR() override {
+    request_cleanup();
+  }
+
+  // Detach the in-flight request so its completion never fires into a
+  // destroyed coroutine.
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+
+  virtual int handle_data(T& data) {
+    return 0;
+  }
+};
+
+// Queue the underlying get-system-obj request (data only, no attrs) on the
+// async-rados thread pool.
+template <class T>
+int RGWSimpleRadosReadCR<T>::send_request(const DoutPrefixProvider *dpp)
+{
+  req = new RGWAsyncGetSystemObj(dpp, this, stack->create_completion_notifier(), svc,
+                                objv_tracker, obj, false, false);
+  async_rados->queue(req);
+  return 0;
+}
+
+// Decode the fetched buffer into *result, honoring empty_on_enoent, and
+// finally hand the value to handle_data().  Decode failures map to -EIO.
+template <class T>
+int RGWSimpleRadosReadCR<T>::request_complete()
+{
+  int ret = req->get_ret_status();
+  retcode = ret;
+  if (ret == -ENOENT && empty_on_enoent) {
+    *result = T();
+  } else {
+    if (ret < 0) {
+      return ret;
+    }
+    if (objv_tracker) { // copy the updated version
+      *objv_tracker = req->objv_tracker;
+    }
+    try {
+      auto iter = req->bl.cbegin();
+      if (iter.end()) {
+        // allow successful reads with empty buffers. ReadSyncStatus coroutines
+        // depend on this to be able to read without locking, because the
+        // cls lock from InitSyncStatus will create an empty object if it didn't
+        // exist
+        *result = T();
+      } else {
+        decode(*result, iter);
+      }
+    } catch (buffer::error& err) {
+      return -EIO;
+    }
+  }
+
+  return handle_data(*result);
+}
+
+// Coroutine that reads a system object's xattrs into *pattrs; raw_attrs
+// selects whether attribute names keep their raw (undecorated) form.
+class RGWSimpleRadosReadAttrsCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  RGWAsyncRadosProcessor *async_rados;
+  RGWSI_SysObj *svc;
+
+  rgw_raw_obj obj;
+  std::map<std::string, bufferlist> *pattrs;
+  bool raw_attrs;
+  RGWObjVersionTracker* objv_tracker;
+  RGWAsyncGetSystemObj *req = nullptr;
+
+public:
+  RGWSimpleRadosReadAttrsCR(const DoutPrefixProvider *_dpp, RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc,
+                            const rgw_raw_obj& _obj, std::map<std::string, bufferlist> *_pattrs,
+                            bool _raw_attrs, RGWObjVersionTracker* objv_tracker = nullptr)
+    : RGWSimpleCoroutine(_svc->ctx()),
+      dpp(_dpp),
+      async_rados(_async_rados), svc(_svc),
+      obj(_obj),
+      pattrs(_pattrs),
+      raw_attrs(_raw_attrs),
+      objv_tracker(objv_tracker)
+  {}
+  ~RGWSimpleRadosReadAttrsCR() override {
+    request_cleanup();
+  }
+                                                         
+  // Detach the in-flight request so its completion never fires into a
+  // destroyed coroutine.
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+// Coroutine that encodes _data at construction time and writes it to a
+// system object.  The encoded bufferlist is moved into the request on
+// send, so the coroutine is single-shot.
+template <class T>
+class RGWSimpleRadosWriteCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  RGWAsyncRadosProcessor *async_rados;
+  RGWSI_SysObj *svc;
+  bufferlist bl;
+  rgw_raw_obj obj;
+  RGWObjVersionTracker *objv_tracker;
+  bool exclusive;
+  RGWAsyncPutSystemObj *req{nullptr};
+
+public:
+  RGWSimpleRadosWriteCR(const DoutPrefixProvider *_dpp, 
+                       RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc,
+                       const rgw_raw_obj& _obj, const T& _data,
+                       RGWObjVersionTracker *objv_tracker = nullptr,
+                       bool exclusive = false)
+    : RGWSimpleCoroutine(_svc->ctx()), dpp(_dpp), async_rados(_async_rados),
+      svc(_svc), obj(_obj), objv_tracker(objv_tracker), exclusive(exclusive) {
+    encode(_data, bl);
+  }
+
+  ~RGWSimpleRadosWriteCR() override {
+    request_cleanup();
+  }
+
+  // Detach the in-flight request so its completion never fires into a
+  // destroyed coroutine.
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncPutSystemObj(dpp, this, stack->create_completion_notifier(),
+                                  svc, objv_tracker, obj, exclusive, std::move(bl));
+    async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    if (objv_tracker) { // copy the updated version
+      *objv_tracker = req->objv_tracker;
+    }
+    return req->get_ret_status();
+  }
+};
+
+// Coroutine that writes a set of xattrs to a system object; the attrs map
+// is moved into the request on send, so the coroutine is single-shot.
+class RGWSimpleRadosWriteAttrsCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  RGWAsyncRadosProcessor *async_rados;
+  RGWSI_SysObj *svc;
+  RGWObjVersionTracker *objv_tracker;
+
+  rgw_raw_obj obj;
+  std::map<std::string, bufferlist> attrs;
+  bool exclusive;
+  RGWAsyncPutSystemObjAttrs *req = nullptr;
+
+public:
+  RGWSimpleRadosWriteAttrsCR(const DoutPrefixProvider *_dpp,
+                             RGWAsyncRadosProcessor *_async_rados,
+                             RGWSI_SysObj *_svc, const rgw_raw_obj& _obj,
+                             std::map<std::string, bufferlist> _attrs,
+                             RGWObjVersionTracker *objv_tracker = nullptr,
+                             bool exclusive = false)
+                            : RGWSimpleCoroutine(_svc->ctx()), dpp(_dpp), async_rados(_async_rados),
+      svc(_svc), objv_tracker(objv_tracker), obj(_obj),
+      attrs(std::move(_attrs)), exclusive(exclusive) {
+  }
+  ~RGWSimpleRadosWriteAttrsCR() override {
+    request_cleanup();
+  }
+
+  // Detach the in-flight request so its completion never fires into a
+  // destroyed coroutine.
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncPutSystemObjAttrs(dpp, this, stack->create_completion_notifier(),
+                                  svc, objv_tracker, obj, std::move(attrs),
+                                   exclusive);
+    async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    if (objv_tracker) { // copy the updated version
+      *objv_tracker = req->objv_tracker;
+    }
+    return req->get_ret_status();
+  }
+};
+
+// Coroutine that sets a batch of omap key/value pairs on a raw object.
+class RGWRadosSetOmapKeysCR : public RGWSimpleCoroutine {
+  rgw::sal::RadosStore* store;
+  std::map<std::string, bufferlist> entries;
+
+  rgw_rados_ref ref;
+
+  rgw_raw_obj obj;
+
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+  RGWRadosSetOmapKeysCR(rgw::sal::RadosStore* _store,
+                     const rgw_raw_obj& _obj,
+                     std::map<std::string, bufferlist>& _entries);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+// Coroutine that lists up to max_entries omap keys after marker; output
+// (keys plus a "more" flag) is delivered through the shared Result.
+class RGWRadosGetOmapKeysCR : public RGWSimpleCoroutine {
+ public:
+  struct Result {
+    rgw_rados_ref ref;
+    std::set<std::string> entries;
+    bool more = false;
+  };
+  using ResultPtr = std::shared_ptr<Result>;
+
+  RGWRadosGetOmapKeysCR(rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
+                        const std::string& _marker, int _max_entries,
+                        ResultPtr result);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+
+ private:
+  rgw::sal::RadosStore* store;
+  rgw_raw_obj obj;
+  std::string marker;
+  int max_entries;
+  ResultPtr result;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+};
+
+// Coroutine that lists up to max_entries omap key/value pairs after
+// marker; output is delivered through the shared Result.
+class RGWRadosGetOmapValsCR : public RGWSimpleCoroutine {
+ public:
+  struct Result {
+    rgw_rados_ref ref;
+    std::map<std::string, bufferlist> entries;
+    bool more = false;
+  };
+  using ResultPtr = std::shared_ptr<Result>;
+
+  RGWRadosGetOmapValsCR(rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
+                        const std::string& _marker, int _max_entries,
+                        ResultPtr result);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+
+ private:
+  rgw::sal::RadosStore* store;
+  rgw_raw_obj obj;
+  std::string marker;
+  int max_entries;
+  ResultPtr result;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+};
+
+// Coroutine that removes a set of omap keys from a raw object.
+class RGWRadosRemoveOmapKeysCR : public RGWSimpleCoroutine {
+  rgw::sal::RadosStore* store;
+
+  rgw_rados_ref ref;
+
+  std::set<std::string> keys;
+
+  rgw_raw_obj obj;
+
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+  RGWRadosRemoveOmapKeysCR(rgw::sal::RadosStore* _store,
+                     const rgw_raw_obj& _obj,
+                     const std::set<std::string>& _keys);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+
+  int request_complete() override;
+};
+
+// Coroutine that removes a raw rados object, optionally guarded by an
+// object version tracker.
+class RGWRadosRemoveCR : public RGWSimpleCoroutine {
+  rgw::sal::RadosStore* store;
+  librados::IoCtx ioctx;
+  const rgw_raw_obj obj;
+  RGWObjVersionTracker* objv_tracker;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+  RGWRadosRemoveCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+                   RGWObjVersionTracker* objv_tracker = nullptr);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+// Coroutine that removes an object identified by oid within a supplied
+// IoCtx (or an RGWSI_RADOS::Obj), optionally guarded by a version tracker.
+class RGWRadosRemoveOidCR : public RGWSimpleCoroutine {
+  librados::IoCtx ioctx;
+  const std::string oid;
+  RGWObjVersionTracker* objv_tracker;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+  RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+                     librados::IoCtx&& ioctx, std::string_view oid,
+                     RGWObjVersionTracker* objv_tracker = nullptr);
+
+  RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+                     RGWSI_RADOS::Obj& obj,
+                     RGWObjVersionTracker* objv_tracker = nullptr);
+
+  RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+                     RGWSI_RADOS::Obj&& obj,
+                     RGWObjVersionTracker* objv_tracker = nullptr);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+// Coroutine that acquires a timed advisory lock on a raw object via
+// RGWAsyncLockSystemObj.  gen_random_cookie() produces the 16-char
+// alphanumeric cookie callers pair with the lock name.
+class RGWSimpleRadosLockCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+  std::string lock_name;
+  std::string cookie;
+  uint32_t duration;
+
+  rgw_raw_obj obj;
+
+  RGWAsyncLockSystemObj *req;
+
+public:
+  RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                     const rgw_raw_obj& _obj,
+                      const std::string& _lock_name,
+                     const std::string& _cookie,
+                     uint32_t _duration);
+  ~RGWSimpleRadosLockCR() override {
+    request_cleanup();
+  }
+  void request_cleanup() override;
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+
+  static std::string gen_random_cookie(CephContext* cct) {
+#define COOKIE_LEN 16
+    char buf[COOKIE_LEN + 1];
+    gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
+    return buf;
+  }
+};
+
+// Coroutine that releases an advisory lock (name + cookie) on a raw
+// object via RGWAsyncUnlockSystemObj.
+class RGWSimpleRadosUnlockCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+  std::string lock_name;
+  std::string cookie;
+
+  rgw_raw_obj obj;
+
+  RGWAsyncUnlockSystemObj *req;
+
+public:
+  RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                     const rgw_raw_obj& _obj, 
+                      const std::string& _lock_name,
+                     const std::string& _cookie);
+  ~RGWSimpleRadosUnlockCR() override {
+    request_cleanup();
+  }
+  void request_cleanup() override;
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+#define OMAP_APPEND_MAX_ENTRIES_DEFAULT 100
+
+// Consumer coroutine that batches appended strings and flushes them as
+// omap keys on a single raw object once window_size entries accumulate
+// (or on finish()).
+class RGWOmapAppend : public RGWConsumerCR<std::string> {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+
+  rgw_raw_obj obj;
+
+  bool going_down;
+
+  int num_pending_entries;
+  std::list<std::string> pending_entries;
+
+  std::map<std::string, bufferlist> entries;
+
+  uint64_t window_size;
+  uint64_t total_entries;
+public:
+  RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                const rgw_raw_obj& _obj,
+                uint64_t _window_size = OMAP_APPEND_MAX_ENTRIES_DEFAULT);
+  int operate(const DoutPrefixProvider *dpp) override;
+  void flush_pending();
+  bool append(const std::string& s);
+  bool finish();
+
+  uint64_t get_total_entries() {
+    return total_entries;
+  }
+
+  const rgw_raw_obj& get_obj() {
+    return obj;
+  }
+};
+
+// Manages num_shards RGWOmapAppend coroutines, one per object named
+// "<oid_prefix>.<shard>", spawning each on the supplied parent coroutine.
+// append() routes an entry to its shard; finish() flushes all shards and
+// reports overall success.
+class RGWShardedOmapCRManager {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+  RGWCoroutine *op;
+
+  int num_shards;
+
+  std::vector<RGWOmapAppend *> shards;
+public:
+  RGWShardedOmapCRManager(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, RGWCoroutine *_op, int _num_shards, const rgw_pool& pool, const std::string& oid_prefix)
+                      : async_rados(_async_rados),
+                       store(_store), op(_op), num_shards(_num_shards) {
+    shards.reserve(num_shards);
+    for (int i = 0; i < num_shards; ++i) {
+      // NOTE(review): variable-length array -- a GCC/Clang extension, not
+      // standard C++; fine for this codebase's toolchains.
+      char buf[oid_prefix.size() + 16];
+      snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), i);
+      RGWOmapAppend *shard = new RGWOmapAppend(async_rados, store, rgw_raw_obj(pool, buf));
+      shard->get();
+      shards.push_back(shard);
+      op->spawn(shard, false);
+    }
+  }
+
+  ~RGWShardedOmapCRManager() {
+    for (auto shard : shards) {
+      shard->put();
+    }
+  }
+
+  bool append(const std::string& entry, int shard_id) {
+    return shards[shard_id]->append(entry);
+  }
+  // Flush every shard; success only if all shards finished without error.
+  bool finish() {
+    bool success = true;
+    for (auto& append_op : shards) {
+      success &= (append_op->finish() && (!append_op->is_error()));
+    }
+    return success;
+  }
+
+  uint64_t get_total_entries(int shard_id) {
+    return shards[shard_id]->get_total_entries();
+  }
+};
+
+// Async request that loads a bucket's instance info (and attrs) into the
+// public bucket_info / attrs members.
+class RGWAsyncGetBucketInstanceInfo : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  rgw_bucket bucket;
+  const DoutPrefixProvider *dpp;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncGetBucketInstanceInfo(RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+                                rgw::sal::RadosStore* _store, const rgw_bucket& bucket,
+                                const DoutPrefixProvider *dpp)
+    : RGWAsyncRadosRequest(caller, cn), store(_store), bucket(bucket), dpp(dpp) {}
+
+  RGWBucketInfo bucket_info;
+  std::map<std::string, bufferlist> attrs;
+};
+
+// Async request that stores a bucket's instance info (and optional attrs).
+class RGWAsyncPutBucketInstanceInfo : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  RGWBucketInfo& bucket_info;
+  bool exclusive;
+  real_time mtime;
+  std::map<std::string, ceph::bufferlist>* attrs;
+  const DoutPrefixProvider *dpp;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncPutBucketInstanceInfo(RGWCoroutine* caller,
+                               RGWAioCompletionNotifier* cn,
+                                rgw::sal::RadosStore* store,
+                               RGWBucketInfo& bucket_info,
+                               bool exclusive,
+                               real_time mtime,
+                               std::map<std::string, ceph::bufferlist>* attrs,
+                                const DoutPrefixProvider* dpp)
+    : RGWAsyncRadosRequest(caller, cn), store(store), bucket_info(bucket_info),
+      exclusive(exclusive), mtime(mtime), attrs(attrs), dpp(dpp) {}
+};
+
+// Coroutine wrapper over RGWAsyncGetBucketInstanceInfo; on completion the
+// fetched info/attrs are moved into the caller-provided out-pointers
+// (either may be null to skip it).
+class RGWGetBucketInstanceInfoCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+  rgw_bucket bucket;
+  RGWBucketInfo *bucket_info;
+  std::map<std::string, bufferlist> *pattrs;
+  const DoutPrefixProvider *dpp;
+
+  RGWAsyncGetBucketInstanceInfo *req{nullptr};
+
+public:
+  // rgw_bucket constructor
+  RGWGetBucketInstanceInfoCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                             const rgw_bucket& _bucket, RGWBucketInfo *_bucket_info,
+                             std::map<std::string, bufferlist> *_pattrs, const DoutPrefixProvider *dpp)
+    : RGWSimpleCoroutine(_store->ctx()), async_rados(_async_rados), store(_store),
+      bucket(_bucket), bucket_info(_bucket_info), pattrs(_pattrs), dpp(dpp) {}
+  ~RGWGetBucketInstanceInfoCR() override {
+    request_cleanup();
+  }
+  // Detach the in-flight request so its completion never fires into a
+  // destroyed coroutine.
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncGetBucketInstanceInfo(this, stack->create_completion_notifier(), store, bucket, dpp);
+    async_rados->queue(req);
+    return 0;
+  }
+  int request_complete() override {
+    if (bucket_info) {
+      *bucket_info = std::move(req->bucket_info);
+    }
+    if (pattrs) {
+      *pattrs = std::move(req->attrs);
+    }
+    return req->get_ret_status();
+  }
+};
+
+// Coroutine wrapper over RGWAsyncPutBucketInstanceInfo: stores bucket
+// instance info (and optional attrs) on the async-rados thread pool.
+class RGWPutBucketInstanceInfoCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+  RGWBucketInfo& bucket_info;
+  bool exclusive;
+  real_time mtime;
+  std::map<std::string, ceph::bufferlist>* attrs;
+  const DoutPrefixProvider *dpp;
+
+  RGWAsyncPutBucketInstanceInfo* req = nullptr;
+
+public:
+  // rgw_bucket constructor
+  RGWPutBucketInstanceInfoCR(RGWAsyncRadosProcessor *async_rados,
+                            rgw::sal::RadosStore* store,
+                            RGWBucketInfo& bucket_info,
+                            bool exclusive,
+                            real_time mtime,
+                            std::map<std::string, ceph::bufferlist>* attrs,
+                             const DoutPrefixProvider *dpp)
+    : RGWSimpleCoroutine(store->ctx()), async_rados(async_rados), store(store),
+      bucket_info(bucket_info), exclusive(exclusive),
+      mtime(mtime), attrs(attrs), dpp(dpp) {}
+  ~RGWPutBucketInstanceInfoCR() override {
+    request_cleanup();
+  }
+  // Detach the in-flight request so its completion never fires into a
+  // destroyed coroutine.
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = nullptr;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncPutBucketInstanceInfo(this,
+                                           stack->create_completion_notifier(),
+                                           store, bucket_info, exclusive,
+                                           mtime, attrs, dpp);
+    async_rados->queue(req);
+    return 0;
+  }
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+// Coroutine that trims a bucket index log shard (for the given index
+// layout generation) between start_marker and end_marker.
+class RGWRadosBILogTrimCR : public RGWSimpleCoroutine {
+  const RGWBucketInfo& bucket_info;
+  int shard_id;
+  const rgw::bucket_index_layout_generation generation;
+  RGWRados::BucketShard bs;
+  std::string start_marker;
+  std::string end_marker;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ public:
+  RGWRadosBILogTrimCR(const DoutPrefixProvider *dpp,
+                      rgw::sal::RadosStore* store, const RGWBucketInfo& bucket_info,
+                      int shard_id,
+                     const rgw::bucket_index_layout_generation& generation,
+                     const std::string& start_marker,
+                      const std::string& end_marker);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+// Async-rados request carrying the parameters needed to fetch an object
+// from a remote zone; the actual work happens in _send_request() (defined
+// out of line) on the async processor's thread.
+class RGWAsyncFetchRemoteObj : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  rgw_zone_id source_zone;
+
+  std::optional<rgw_user> user_id;
+
+  rgw_bucket src_bucket;
+  std::optional<rgw_placement_rule> dest_placement_rule;
+  RGWBucketInfo dest_bucket_info;
+
+  rgw_obj_key key;
+  std::optional<rgw_obj_key> dest_key;
+  std::optional<uint64_t> versioned_epoch;
+
+  // NOTE(review): not initialized by the constructor below — confirm it is
+  // set elsewhere before use
+  real_time src_mtime;
+
+  bool copy_if_newer;
+  std::shared_ptr<RGWFetchObjFilter> filter;
+  rgw_zone_set zones_trace;
+  PerfCounters* counters;
+  const DoutPrefixProvider *dpp;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncFetchRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                         const rgw_zone_id& _source_zone,
+                         std::optional<rgw_user>& _user_id,
+                         const rgw_bucket& _src_bucket,
+                        std::optional<rgw_placement_rule> _dest_placement_rule,
+                         const RGWBucketInfo& _dest_bucket_info,
+                         const rgw_obj_key& _key,
+                         const std::optional<rgw_obj_key>& _dest_key,
+                         std::optional<uint64_t> _versioned_epoch,
+                         bool _if_newer,
+                         std::shared_ptr<RGWFetchObjFilter> _filter,
+                         rgw_zone_set *_zones_trace,
+                         PerfCounters* counters, const DoutPrefixProvider *dpp)
+    : RGWAsyncRadosRequest(caller, cn), store(_store),
+      source_zone(_source_zone),
+      user_id(_user_id),
+      src_bucket(_src_bucket),
+      dest_placement_rule(_dest_placement_rule),
+      dest_bucket_info(_dest_bucket_info),
+      key(_key),
+      dest_key(_dest_key),
+      versioned_epoch(_versioned_epoch),
+      copy_if_newer(_if_newer),
+      filter(_filter),
+      counters(counters),
+      dpp(dpp)
+  {
+    // zones_trace stays default-constructed when the caller passes nullptr
+    if (_zones_trace) {
+      zones_trace = *_zones_trace;
+    }
+  }
+};
+
+// Coroutine wrapper around RGWAsyncFetchRemoteObj: queues the async fetch
+// of a remote-zone object and reports its final status.
+class RGWFetchRemoteObjCR : public RGWSimpleCoroutine {
+  CephContext *cct;
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+  rgw_zone_id source_zone;
+
+  std::optional<rgw_user> user_id;
+
+  rgw_bucket src_bucket;
+  std::optional<rgw_placement_rule> dest_placement_rule;
+  RGWBucketInfo dest_bucket_info;
+
+  rgw_obj_key key;
+  std::optional<rgw_obj_key> dest_key;
+  std::optional<uint64_t> versioned_epoch;
+
+  real_time src_mtime;
+
+  bool copy_if_newer;
+
+  std::shared_ptr<RGWFetchObjFilter> filter;
+
+  // pending async request; owned until finish()/request_cleanup()
+  RGWAsyncFetchRemoteObj *req;
+  rgw_zone_set *zones_trace;
+  PerfCounters* counters;
+  const DoutPrefixProvider *dpp;
+
+public:
+  RGWFetchRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                      const rgw_zone_id& _source_zone,
+                      std::optional<rgw_user> _user_id,
+                      const rgw_bucket& _src_bucket,
+                     std::optional<rgw_placement_rule> _dest_placement_rule,
+                      const RGWBucketInfo& _dest_bucket_info,
+                      const rgw_obj_key& _key,
+                      const std::optional<rgw_obj_key>& _dest_key,
+                      std::optional<uint64_t> _versioned_epoch,
+                      bool _if_newer,
+                      std::shared_ptr<RGWFetchObjFilter> _filter,
+                      rgw_zone_set *_zones_trace,
+                      PerfCounters* counters, const DoutPrefixProvider *dpp)
+    : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()),
+      async_rados(_async_rados), store(_store),
+      source_zone(_source_zone),
+      user_id(_user_id),
+      src_bucket(_src_bucket),
+      dest_placement_rule(_dest_placement_rule),
+      dest_bucket_info(_dest_bucket_info),
+      key(_key),
+      dest_key(_dest_key),
+      versioned_epoch(_versioned_epoch),
+      copy_if_newer(_if_newer),
+      filter(_filter),
+      req(NULL),
+      zones_trace(_zones_trace), counters(counters), dpp(dpp) {}
+
+
+  ~RGWFetchRemoteObjCR() override {
+    request_cleanup();
+  }
+
+  // release the pending async request, if any
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  // queue the async fetch on the async-rados processor
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncFetchRemoteObj(this, stack->create_completion_notifier(), store,
+                                    source_zone, user_id, src_bucket, dest_placement_rule, dest_bucket_info,
+                                     key, dest_key, versioned_epoch, copy_if_newer, filter,
+                                     zones_trace, counters, dpp);
+    async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+// Async-rados request that stats an object in a remote zone; the out
+// parameters (pmtime/psize/petag/pattrs/pheaders) are filled in by
+// _send_request(), defined out of line.
+class RGWAsyncStatRemoteObj : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  rgw_zone_id source_zone;
+
+  rgw_bucket src_bucket;
+  rgw_obj_key key;
+
+  ceph::real_time *pmtime;
+  uint64_t *psize;
+  std::string *petag;
+  std::map<std::string, bufferlist> *pattrs;
+  std::map<std::string, std::string> *pheaders;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncStatRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                         const rgw_zone_id& _source_zone,
+                         rgw_bucket& _src_bucket,
+                         const rgw_obj_key& _key,
+                         ceph::real_time *_pmtime,
+                         uint64_t *_psize,
+                         std::string *_petag,
+                         std::map<std::string, bufferlist> *_pattrs,
+                         std::map<std::string, std::string> *_pheaders) : RGWAsyncRadosRequest(caller, cn), store(_store),
+                                                      source_zone(_source_zone),
+                                                      src_bucket(_src_bucket),
+                                                      key(_key),
+                                                      pmtime(_pmtime),
+                                                      psize(_psize),
+                                                      petag(_petag),
+                                                      pattrs(_pattrs),
+                                                      pheaders(_pheaders) {}
+};
+
+// Coroutine wrapper around RGWAsyncStatRemoteObj: queues the async stat of
+// a remote-zone object and reports its final status.
+class RGWStatRemoteObjCR : public RGWSimpleCoroutine {
+  CephContext *cct;
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+  rgw_zone_id source_zone;
+
+  rgw_bucket src_bucket;
+  rgw_obj_key key;
+
+  ceph::real_time *pmtime;
+  uint64_t *psize;
+  std::string *petag;
+  std::map<std::string, bufferlist> *pattrs;
+  std::map<std::string, std::string> *pheaders;
+
+  // pending async request; owned until finish()/request_cleanup()
+  RGWAsyncStatRemoteObj *req;
+
+public:
+  RGWStatRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                      const rgw_zone_id& _source_zone,
+                      rgw_bucket& _src_bucket,
+                      const rgw_obj_key& _key,
+                      ceph::real_time *_pmtime,
+                      uint64_t *_psize,
+                      std::string *_petag,
+                      std::map<std::string, bufferlist> *_pattrs,
+                      std::map<std::string, std::string> *_pheaders) : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()),
+                                       async_rados(_async_rados), store(_store),
+                                       source_zone(_source_zone),
+                                       src_bucket(_src_bucket),
+                                       key(_key),
+                                       pmtime(_pmtime),
+                                       psize(_psize),
+                                       petag(_petag),
+                                       pattrs(_pattrs),
+                                       pheaders(_pheaders),
+                                       req(NULL) {}
+
+
+  ~RGWStatRemoteObjCR() override {
+    request_cleanup();
+  }
+
+  // release the pending async request, if any
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  // queue the async stat on the async-rados processor
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncStatRemoteObj(this, stack->create_completion_notifier(), store, source_zone,
+                                    src_bucket, key, pmtime, psize, petag, pattrs, pheaders);
+    async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+// Async-rados request that removes (or delete-marks) an object, typically
+// as part of multisite sync; the work happens in _send_request(), defined
+// out of line.
+class RGWAsyncRemoveObj : public RGWAsyncRadosRequest {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  rgw_zone_id source_zone;
+
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  std::unique_ptr<rgw::sal::Object> obj;
+
+  std::string owner;
+  std::string owner_display_name;
+  bool versioned;
+  uint64_t versioned_epoch;
+  std::string marker_version_id;
+
+  bool del_if_older;
+  ceph::real_time timestamp;
+  rgw_zone_set zones_trace;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncRemoveObj(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, 
+                         rgw::sal::RadosStore* _store,
+                         const rgw_zone_id& _source_zone,
+                         RGWBucketInfo& _bucket_info,
+                         const rgw_obj_key& _key,
+                         const std::string& _owner,
+                         const std::string& _owner_display_name,
+                         bool _versioned,
+                         uint64_t _versioned_epoch,
+                         bool _delete_marker,
+                         bool _if_older,
+                         real_time& _timestamp,
+                         rgw_zone_set* _zones_trace) : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), store(_store),
+                                                      source_zone(_source_zone),
+                                                      owner(_owner),
+                                                      owner_display_name(_owner_display_name),
+                                                      versioned(_versioned),
+                                                      versioned_epoch(_versioned_epoch),
+                                                      del_if_older(_if_older),
+                                                      timestamp(_timestamp) {
+    // for a delete marker, remember which version instance to mark
+    if (_delete_marker) {
+      marker_version_id = _key.instance;
+    }
+
+    if (_zones_trace) {
+      zones_trace = *_zones_trace;
+    }
+    // NOTE(review): return value of get_bucket() is ignored here; on failure
+    // 'bucket' would stay null and the get_object() below would dereference
+    // it — confirm get_bucket() cannot fail for an in-memory bucket_info
+    store->get_bucket(nullptr, _bucket_info, &bucket);
+    obj = bucket->get_object(_key);
+  }
+};
+
+// Coroutine wrapper around RGWAsyncRemoveObj: queues the async object
+// removal and reports its final status.
+class RGWRemoveObjCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  CephContext *cct;
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+  rgw_zone_id source_zone;
+
+  RGWBucketInfo bucket_info;
+
+  rgw_obj_key key;
+  bool versioned;
+  uint64_t versioned_epoch;
+  bool delete_marker;
+  std::string owner;
+  std::string owner_display_name;
+
+  bool del_if_older;
+  real_time timestamp;
+
+  // pending async request; owned until finish()/request_cleanup()
+  RGWAsyncRemoveObj *req;
+  
+  rgw_zone_set *zones_trace;
+
+public:
+  RGWRemoveObjCR(const DoutPrefixProvider *_dpp, RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                      const rgw_zone_id& _source_zone,
+                      RGWBucketInfo& _bucket_info,
+                      const rgw_obj_key& _key,
+                      bool _versioned,
+                      uint64_t _versioned_epoch,
+                      std::string *_owner,
+                      std::string *_owner_display_name,
+                      bool _delete_marker,
+                      real_time *_timestamp,
+                      rgw_zone_set *_zones_trace) : RGWSimpleCoroutine(_store->ctx()), dpp(_dpp), cct(_store->ctx()),
+                                       async_rados(_async_rados), store(_store),
+                                       source_zone(_source_zone),
+                                       bucket_info(_bucket_info),
+                                       key(_key),
+                                       versioned(_versioned),
+                                       versioned_epoch(_versioned_epoch),
+                                       delete_marker(_delete_marker), req(NULL), zones_trace(_zones_trace) {
+    // a supplied timestamp implies "delete only if older than timestamp"
+    del_if_older = (_timestamp != NULL);
+    if (_timestamp) {
+      timestamp = *_timestamp;
+    }
+
+    if (_owner) {
+      owner = *_owner;
+    }
+
+    if (_owner_display_name) {
+      owner_display_name = *_owner_display_name;
+    }
+  }
+  ~RGWRemoveObjCR() override {
+    request_cleanup();
+  }
+
+  // release the pending async request, if any
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  // queue the async removal on the async-rados processor
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncRemoveObj(dpp, this, stack->create_completion_notifier(), store, source_zone, bucket_info,
+                                key, owner, owner_display_name, versioned, versioned_epoch,
+                                delete_marker, del_if_older, timestamp, zones_trace);
+    async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+// Coroutine that holds a lock on a rados object and keeps renewing it every
+// 'interval' seconds until go_down()/abort() is called. operate() is defined
+// out of line.
+class RGWContinuousLeaseCR : public RGWCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+
+  const rgw_raw_obj obj;
+
+  const std::string lock_name;
+  // per-instance random cookie identifying this lock holder
+  const std::string cookie;
+
+  int interval;
+  bool going_down{ false };
+  bool locked{false};
+  
+  // renew slack: 9/10 of the interval
+  const ceph::timespan interval_tolerance;
+  const ceph::timespan ts_interval;
+
+  RGWCoroutine *caller;
+
+  bool aborted{false};
+  
+  ceph::coarse_mono_time last_renew_try_time;
+  ceph::coarse_mono_time current_time;
+
+public:
+  RGWContinuousLeaseCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                       const rgw_raw_obj& _obj,
+                       const std::string& _lock_name, int _interval, RGWCoroutine *_caller)
+    : RGWCoroutine(_store->ctx()), async_rados(_async_rados), store(_store),
+    obj(_obj), lock_name(_lock_name),
+    cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
+    interval(_interval), interval_tolerance(ceph::make_timespan(9*interval/10)), ts_interval(ceph::make_timespan(interval)),
+      caller(_caller)
+  {}
+
+  virtual ~RGWContinuousLeaseCR() override;
+
+  int operate(const DoutPrefixProvider *dpp) override;
+
+  // report unlocked if the last renew attempt is older than the lock
+  // interval, since the lease may have expired on the server side
+  bool is_locked() const {
+    if (ceph::coarse_mono_clock::now() - last_renew_try_time > ts_interval) {
+      return false;
+    }
+    return locked;
+  }
+
+  void set_locked(bool status) {
+    locked = status;
+  }
+
+  // ask the renewal loop to stop and wake it up so it notices promptly
+  void go_down() {
+    going_down = true;
+    wakeup();
+  }
+
+  void abort() {
+    aborted = true;
+  }
+};
+
+// Coroutine that appends a single cls_log entry to a timelog object.
+// Methods are defined out of line.
+class RGWRadosTimelogAddCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  std::list<cls_log_entry> entries;
+
+  std::string oid;
+
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+  RGWRadosTimelogAddCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* _store, const std::string& _oid,
+                       const cls_log_entry& entry);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+// Coroutine that trims a timelog object by time range and/or marker range.
+// Members are protected so subclasses (e.g. RGWSyncLogTrimCR) can read them.
+class RGWRadosTimelogTrimCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ protected:
+  std::string oid;
+  real_time start_time;
+  real_time end_time;
+  std::string from_marker;
+  std::string to_marker;
+
+ public:
+  RGWRadosTimelogTrimCR(const DoutPrefixProvider *dpp, 
+                        rgw::sal::RadosStore* store, const std::string& oid,
+                        const real_time& start_time, const real_time& end_time,
+                        const std::string& from_marker,
+                        const std::string& to_marker);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+// wrapper to update last_trim_marker on success
+class RGWSyncLogTrimCR : public RGWRadosTimelogTrimCR {
+  CephContext *cct;
+  // caller-owned marker updated when the trim succeeds
+  std::string *last_trim_marker;
+ public:
+  // sentinel meaning "trim everything"
+  static constexpr const char* max_marker = "99999999";
+
+  RGWSyncLogTrimCR(const DoutPrefixProvider *dpp,
+                   rgw::sal::RadosStore* store, const std::string& oid,
+                   const std::string& to_marker, std::string *last_trim_marker);
+  int request_complete() override;
+};
+
+// Async-rados request that stats a local rados object (size/mtime/epoch);
+// _send_request() is defined out of line.
+class RGWAsyncStatObj : public RGWAsyncRadosRequest {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  RGWBucketInfo bucket_info;
+  rgw_obj obj;
+  uint64_t *psize;
+  real_time *pmtime;
+  uint64_t *pepoch;
+  RGWObjVersionTracker *objv_tracker;
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  // NOTE(review): _bucket_info is accepted but never stored — the
+  // bucket_info member stays default-constructed; confirm whether
+  // _send_request() needs it or the parameter is vestigial
+  RGWAsyncStatObj(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* store,
+                  const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr,
+                  real_time *pmtime = nullptr, uint64_t *pepoch = nullptr,
+                  RGWObjVersionTracker *objv_tracker = nullptr)
+         : RGWAsyncRadosRequest(caller, cn), dpp(dpp), store(store), obj(obj), psize(psize),
+         pmtime(pmtime), pepoch(pepoch), objv_tracker(objv_tracker) {}
+};
+
+// Coroutine wrapper around RGWAsyncStatObj. Constructor and the
+// send/complete/cleanup methods are defined out of line.
+class RGWStatObjCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  RGWAsyncRadosProcessor *async_rados;
+  RGWBucketInfo bucket_info;
+  rgw_obj obj;
+  uint64_t *psize;
+  real_time *pmtime;
+  uint64_t *pepoch;
+  RGWObjVersionTracker *objv_tracker;
+  // pending async request; owned until finish()/request_cleanup()
+  RGWAsyncStatObj *req = nullptr;
+ public:
+  RGWStatObjCR(const DoutPrefixProvider *dpp, RGWAsyncRadosProcessor *async_rados, rgw::sal::RadosStore* store,
+         const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr,
+         real_time* pmtime = nullptr, uint64_t *pepoch = nullptr,
+         RGWObjVersionTracker *objv_tracker = nullptr);
+  ~RGWStatObjCR() override {
+    request_cleanup();
+  }
+  void request_cleanup() override;
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+/// coroutine wrapper for IoCtx::aio_notify()
+class RGWRadosNotifyCR : public RGWSimpleCoroutine {
+  rgw::sal::RadosStore* const store;
+  const rgw_raw_obj obj;
+  bufferlist request;
+  const uint64_t timeout_ms;
+  // caller-owned buffer receiving the notify reply (may be null)
+  bufferlist *response;
+  rgw_rados_ref ref;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+  RGWRadosNotifyCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+                   bufferlist& request, uint64_t timeout_ms,
+                   bufferlist *response);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+// Coroutine that POSTs data-changes notifications (per-shard entry sets)
+// to a peer zone over its REST connection. operate() is defined out of line.
+class RGWDataPostNotifyCR : public RGWCoroutine {
+  RGWRados *store;
+  RGWHTTPManager& http_manager;
+  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& shards;
+  const char *source_zone;
+  RGWRESTConn *conn;
+
+public:
+  RGWDataPostNotifyCR(RGWRados *_store, RGWHTTPManager& _http_manager, bc::flat_map<int,
+                    bc::flat_set<rgw_data_notify_entry> >& _shards, const char *_zone, RGWRESTConn *_conn)
+                    : RGWCoroutine(_store->ctx()), store(_store), http_manager(_http_manager),
+                      shards(_shards), source_zone(_zone), conn(_conn) {}
+
+  int operate(const DoutPrefixProvider* dpp) override;
+};
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_cr_tools.cc b/src/rgw/driver/rados/rgw_cr_tools.cc
new file mode 100644 (file)
index 0000000..94665a3
--- /dev/null
@@ -0,0 +1,292 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_cr_tools.h"
+#include "rgw_bucket.h"
+#include "rgw_user.h"
+#include "rgw_op.h"
+#include "rgw_acl_s3.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Executes a user-create request: translates the coroutine's params into an
+// RGWUserAdminOpState, applies default quotas from configuration when
+// requested, and delegates to RGWUserAdminOp_User::create().
+template<>
+int RGWUserCreateCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  CephContext *cct = store->ctx();
+
+  // fallback max-buckets when the caller didn't specify one
+  const int32_t default_max_buckets =
+    cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
+
+  RGWUserAdminOpState op_state(store);
+
+  auto& user = params.user;
+
+  op_state.set_user_id(user);
+  op_state.set_display_name(params.display_name);
+  op_state.set_user_email(params.email);
+  op_state.set_caps(params.caps);
+  op_state.set_access_key(params.access_key);
+  op_state.set_secret_key(params.secret_key);
+
+  // key type defaults to S3 unless "swift" was requested
+  if (!params.key_type.empty()) {
+    int32_t key_type = KEY_TYPE_S3;
+    if (params.key_type == "swift") {
+      key_type = KEY_TYPE_SWIFT;
+    }
+
+    op_state.set_key_type(key_type);
+  }
+
+  op_state.set_max_buckets(params.max_buckets.value_or(default_max_buckets));
+  op_state.set_suspension(params.suspended);
+  op_state.set_system(params.system);
+  op_state.set_exclusive(params.exclusive);
+
+  if (params.generate_key) {
+    op_state.set_generate_key();
+  }
+
+
+  // apply configured default bucket/user quotas (negative conf values
+  // mean "disabled")
+  if (params.apply_quota) {
+    RGWQuota quota;
+
+    if (cct->_conf->rgw_bucket_default_quota_max_objects >= 0) {
+      quota.bucket_quota.max_objects = cct->_conf->rgw_bucket_default_quota_max_objects;
+      quota.bucket_quota.enabled = true;
+    }
+
+    if (cct->_conf->rgw_bucket_default_quota_max_size >= 0) {
+      quota.bucket_quota.max_size = cct->_conf->rgw_bucket_default_quota_max_size;
+      quota.bucket_quota.enabled = true;
+    }
+
+    if (cct->_conf->rgw_user_default_quota_max_objects >= 0) {
+      quota.user_quota.max_objects = cct->_conf->rgw_user_default_quota_max_objects;
+      quota.user_quota.enabled = true;
+    }
+
+    if (cct->_conf->rgw_user_default_quota_max_size >= 0) {
+      quota.user_quota.max_size = cct->_conf->rgw_user_default_quota_max_size;
+      quota.user_quota.enabled = true;
+    }
+
+    if (quota.bucket_quota.enabled) {
+      op_state.set_bucket_quota(quota.bucket_quota);
+    }
+
+    if (quota.user_quota.enabled) {
+      op_state.set_user_quota(quota.user_quota);
+    }
+  }
+
+  RGWNullFlusher flusher;
+  return RGWUserAdminOp_User::create(dpp, store, op_state, flusher, null_yield);
+}
+
+// Looks up user info by uid and stores it into the coroutine's result.
+template<>
+int RGWGetUserInfoCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  return store->ctl()->user->get_info_by_uid(dpp, params.user, result.get(), null_yield);
+}
+
+// Resolves (tenant, bucket_name) to a sal::Bucket in the coroutine's result.
+template<>
+int RGWGetBucketInfoCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  return store->get_bucket(dpp, nullptr, params.tenant, params.bucket_name, &result->bucket, null_yield);
+}
+
+template<>
+int RGWBucketCreateLocalCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  CephContext *cct = store->ctx();
+  auto& zone_svc = store->svc()->zone;
+
+  const auto& user_info = params.user_info.get();
+  const auto& user = user_info->user_id;
+  const auto& bucket_name = params.bucket_name;
+  auto& placement_rule = params.placement_rule;
+
+  if (!placement_rule.empty() &&
+      !zone_svc->get_zone_params().valid_placement(placement_rule)) {
+    ldpp_dout(dpp, 0) << "placement target (" << placement_rule << ")"
+      << " doesn't exist in the placement targets of zonegroup"
+      << " (" << zone_svc->get_zonegroup().api_name << ")" << dendl;
+    return -ERR_INVALID_LOCATION_CONSTRAINT;
+  }
+
+  /* we need to make sure we read bucket info, it's not read before for this
+   * specific request */
+  RGWBucketInfo bucket_info;
+  map<string, bufferlist> bucket_attrs;
+
+  int ret = store->getRados()->get_bucket_info(store->svc(), user.tenant, bucket_name,
+                                 bucket_info, nullptr, null_yield, dpp, &bucket_attrs);
+  if (ret < 0 && ret != -ENOENT)
+    return ret;
+  bool bucket_exists = (ret != -ENOENT);
+
+  RGWAccessControlPolicy old_policy(cct);
+  ACLOwner bucket_owner;
+  bucket_owner.set_id(user);
+  bucket_owner.set_name(user_info->display_name);
+  if (bucket_exists) {
+    ret = rgw_op_get_bucket_policy_from_attr(dpp, cct, store, bucket_info,
+                                             bucket_attrs, &old_policy, null_yield);
+    if (ret >= 0)  {
+      if (old_policy.get_owner().get_id().compare(user) != 0) {
+        return -EEXIST;
+      }
+    }
+  }
+
+  RGWBucketInfo master_info;
+  rgw_bucket *pmaster_bucket = nullptr;
+  uint32_t *pmaster_num_shards = nullptr;
+  real_time creation_time;
+
+  string zonegroup_id = zone_svc->get_zonegroup().get_id();
+
+  if (bucket_exists) {
+    rgw_placement_rule selected_placement_rule;
+    rgw_bucket bucket;
+    bucket.tenant = user.tenant;
+    bucket.name = bucket_name;
+    ret = zone_svc->select_bucket_placement(dpp, *user_info, zonegroup_id,
+                                           placement_rule,
+                                           &selected_placement_rule, nullptr, null_yield);
+    if (selected_placement_rule != bucket_info.placement_rule) {
+      ldpp_dout(dpp, 0) << "bucket already exists on a different placement rule: "
+        << " selected_rule= " << selected_placement_rule
+        << " existing_rule= " << bucket_info.placement_rule << dendl;
+      return -EEXIST;
+    }
+  }
+
+  /* Encode special metadata first as we're using std::map::emplace under
+   * the hood. This method will add the new items only if the map doesn't
+   * contain such keys yet. */
+  RGWAccessControlPolicy_S3 policy(cct);
+  policy.create_canned(bucket_owner, bucket_owner, string()); /* default private policy */
+  bufferlist aclbl;
+  policy.encode(aclbl);
+  map<string, buffer::list> attrs;
+  attrs.emplace(std::move(RGW_ATTR_ACL), std::move(aclbl));
+
+  RGWQuotaInfo quota_info;
+  const RGWQuotaInfo * pquota_info = nullptr;
+
+  rgw_bucket bucket;
+  bucket.tenant = user.tenant;
+  bucket.name = bucket_name;
+
+  RGWBucketInfo info;
+  obj_version ep_objv;
+
+  ret = store->getRados()->create_bucket(*user_info, bucket, zonegroup_id,
+                                placement_rule, bucket_info.swift_ver_location,
+                                pquota_info, attrs,
+                                info, nullptr, &ep_objv, creation_time,
+                               pmaster_bucket, pmaster_num_shards, null_yield, dpp, true);
+
+
+  if (ret && ret != -EEXIST)
+    return ret;
+
+  bool existed = (ret == -EEXIST);
+
+  if (existed) {
+    if (info.owner != user) {
+      ldpp_dout(dpp, 20) << "NOTICE: bucket already exists under a different user (bucket=" << bucket << " user=" << user << " bucket_owner=" << info.owner << dendl;
+      return -EEXIST;
+    }
+    bucket = info.bucket;
+  }
+
+  ret = store->ctl()->bucket->link_bucket(user, bucket, info.creation_time, null_yield, dpp, false);
+  if (ret && !existed && ret != -EEXIST) {
+    /* if it exists (or previously existed), don't remove it! */
+    int r = store->ctl()->bucket->unlink_bucket(user, bucket, null_yield, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "WARNING: failed to unlink bucket: ret=" << r << dendl;
+    }
+  } else if (ret == -EEXIST || (ret == 0 && existed)) {
+    ret = -ERR_BUCKET_EXISTS;
+  }
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: bucket creation (bucket=" << bucket << ") return ret=" << ret << dendl;
+  }
+
+  return ret;
+}
+
+template<>
+int RGWObjectSimplePutCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  RGWDataAccess::ObjectRef obj;
+
+  CephContext *cct = store->ctx();
+
+  int ret = params.bucket->get_object(params.key, &obj);
+  if (ret < 0) {
+    lderr(cct) << "ERROR: failed to get object: " << cpp_strerror(-ret) << dendl;
+    return -ret;
+  }
+
+  if (params.user_data) {
+    obj->set_user_data(*params.user_data);
+  }
+
+  ret = obj->put(params.data, params.attrs, dpp, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: put object returned error: " << cpp_strerror(-ret) << dendl;
+  }
+
+  return 0;
+}
+
+template<>
+int RGWBucketLifecycleConfigCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  CephContext *cct = store->ctx();
+
+  RGWLC *lc = store->getRados()->get_lc();
+  if (!lc) {
+    lderr(cct) << "ERROR: lifecycle object is not initialized!" << dendl;
+    return -EIO;
+  }
+
+  int ret = lc->set_bucket_config(params.bucket,
+                                  params.bucket_attrs,
+                                  &params.config);
+  if (ret < 0) {
+    lderr(cct) << "ERROR: failed to set lifecycle on bucke: " << cpp_strerror(-ret) << dendl;
+    return -ret;
+  }
+
+  return 0;
+}
+
+// Fetches the bucket sync-policy handler for (params.zone, params.bucket)
+// into the coroutine's result.
+template<>
+int RGWBucketGetSyncPolicyHandlerCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  int r = store->ctl()->bucket->get_sync_policy_handler(params.zone,
+                                                        params.bucket,
+                                                        &result->policy_handler,
+                                                        null_yield,
+                                                        dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: " << __func__ << "(): get_sync_policy_handler() returned " << r << dendl;
+    return  r;
+  }
+
+  return 0;
+}
diff --git a/src/rgw/driver/rados/rgw_cr_tools.h b/src/rgw/driver/rados/rgw_cr_tools.h
new file mode 100644 (file)
index 0000000..ebdbfeb
--- /dev/null
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_CR_TOOLS_H
+#define CEPH_RGW_CR_TOOLS_H
+
+#include "rgw_cr_rados.h"
+#include "rgw_tools.h"
+#include "rgw_lc.h"
+
+#include "services/svc_bucket_sync.h"
+
+// Input parameters for the user-create coroutine (RGWUserCreateCR).
+struct rgw_user_create_params {
+  rgw_user user;
+  std::string display_name;
+  std::string email;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type; /* "swift" or "s3" */
+  std::string caps;
+
+  bool generate_key{true};
+  bool suspended{false};
+  std::optional<int32_t> max_buckets;
+  bool system{false};
+  bool exclusive{false};
+  bool apply_quota{true};
+};
+
+using RGWUserCreateCR = RGWSimpleWriteOnlyAsyncCR<rgw_user_create_params>;
+
+// Input parameters for the get-user-info coroutine (result: RGWUserInfo).
+struct rgw_get_user_info_params {
+  rgw_user user;
+};
+
+using RGWGetUserInfoCR = RGWSimpleAsyncCR<rgw_get_user_info_params, RGWUserInfo>;
+
+// Input parameters and result for the get-bucket-info coroutine.
+struct rgw_get_bucket_info_params {
+  std::string tenant;
+  std::string bucket_name;
+};
+
+struct rgw_get_bucket_info_result {
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+};
+
+using RGWGetBucketInfoCR = RGWSimpleAsyncCR<rgw_get_bucket_info_params, rgw_get_bucket_info_result>;
+
+// Input parameters for the local bucket-create coroutine.
+struct rgw_bucket_create_local_params {
+  std::shared_ptr<RGWUserInfo> user_info;
+  std::string bucket_name;
+  rgw_placement_rule placement_rule;
+};
+
+using RGWBucketCreateLocalCR = RGWSimpleWriteOnlyAsyncCR<rgw_bucket_create_local_params>;
+
+// Input parameters for the simple object-put coroutine.
+struct rgw_object_simple_put_params {
+  RGWDataAccess::BucketRef bucket;
+  rgw_obj_key key;
+  bufferlist data;
+  std::map<std::string, bufferlist> attrs;
+  std::optional<std::string> user_data;
+};
+
+using RGWObjectSimplePutCR = RGWSimpleWriteOnlyAsyncCR<rgw_object_simple_put_params>;
+
+
+// Input parameters for the bucket lifecycle-config coroutine.
+struct rgw_bucket_lifecycle_config_params {
+  rgw::sal::Bucket* bucket;
+  rgw::sal::Attrs bucket_attrs;
+  RGWLifecycleConfiguration config;
+};
+
+using RGWBucketLifecycleConfigCR = RGWSimpleWriteOnlyAsyncCR<rgw_bucket_lifecycle_config_params>;
+
+// Input parameters and result for the get-bucket-sync-policy coroutine.
+struct rgw_bucket_get_sync_policy_params {
+  std::optional<rgw_zone_id> zone;
+  std::optional<rgw_bucket> bucket;
+};
+
+struct rgw_bucket_get_sync_policy_result {
+  RGWBucketSyncPolicyHandlerRef policy_handler;
+};
+
+using RGWBucketGetSyncPolicyHandlerCR = RGWSimpleAsyncCR<rgw_bucket_get_sync_policy_params, rgw_bucket_get_sync_policy_result>;
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_d3n_datacache.cc b/src/rgw/driver/rados/rgw_d3n_datacache.cc
new file mode 100644 (file)
index 0000000..ed375e2
--- /dev/null
@@ -0,0 +1,369 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_d3n_datacache.h"
+#include "rgw_rest_client.h"
+#include "rgw_auth_s3.h"
+#include "rgw_op.h"
+#include "rgw_common.h"
+#include "rgw_auth_s3.h"
+#include "rgw_op.h"
+#include "rgw_crypt_sanitize.h"
+#if defined(__linux__)
+#include <features.h>
+#endif
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace efs = std::filesystem;
+#else
+#include <experimental/filesystem>
+namespace efs = std::experimental::filesystem;
+#endif
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+int D3nCacheAioWriteRequest::d3n_prepare_libaio_write_op(bufferlist& bl, unsigned int len, string oid, string cache_location)
+{
+  std::string location = cache_location + oid;
+  int r = 0;
+
+  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): Write To Cache, location=" << location << dendl;
+  cb = new struct aiocb;
+  mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+  memset(cb, 0, sizeof(struct aiocb));
+  r = fd = ::open(location.c_str(), O_WRONLY | O_CREAT | O_TRUNC, mode);
+  if (fd < 0) {
+    ldout(cct, 0) << "ERROR: D3nCacheAioWriteRequest::create_io: open file failed, errno=" << errno << ", location='" << location.c_str() << "'" << dendl;
+    goto done;
+  }
+  if (g_conf()->rgw_d3n_l1_fadvise != POSIX_FADV_NORMAL)
+    posix_fadvise(fd, 0, 0, g_conf()->rgw_d3n_l1_fadvise);
+  cb->aio_fildes = fd;
+
+  data = malloc(len);
+  if (!data) {
+    ldout(cct, 0) << "ERROR: D3nCacheAioWriteRequest::create_io: memory allocation failed" << dendl;
+    goto close_file;
+  }
+  cb->aio_buf = data;
+  memcpy((void*)data, bl.c_str(), len);
+  cb->aio_nbytes = len;
+  goto done;
+
+close_file:
+  ::close(fd);
+done:
+  return r;
+}
+
+// Construct with inert defaults; real configuration happens in init(),
+// which must be called before the cache is used.
+D3nDataCache::D3nDataCache()
+  : cct(nullptr), io_type(_io_type::ASYNC_IO), free_data_cache_size(0), outstanding_write_size(0)
+{
+  lsubdout(g_ceph_context, rgw_datacache, 5) << "D3nDataCache: " << __func__ << "()" << dendl;
+}
+
+// Bind the cache to a CephContext and prepare the on-disk store:
+// read sizing/path config, evict or create the persistent cache
+// directory, select the eviction policy, and (glibc only) tune the
+// libaio thread/request pools.
+void D3nDataCache::init(CephContext *_cct) {
+  cct = _cct;
+  free_data_cache_size = cct->_conf->rgw_d3n_l1_datacache_size;
+  head = nullptr;
+  tail = nullptr;
+  cache_location = cct->_conf->rgw_d3n_l1_datacache_persistent_path;
+  // normalize: oids are appended directly, so the path must end with '/'
+  if(cache_location.back() != '/') {
+      cache_location += "/";
+  }
+  try {
+    if (efs::exists(cache_location)) {
+      // d3n: evict the cache storage directory
+      if (g_conf()->rgw_d3n_l1_evict_cache_on_start) {
+        lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: evicting the persistent storage directory on start" << dendl;
+        for (auto& p : efs::directory_iterator(cache_location)) {
+          efs::remove_all(p.path());
+        }
+      }
+    } else {
+      // create the cache storage directory
+      lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: creating the persistent storage directory on start" << dendl;
+      efs::create_directories(cache_location);
+    }
+  } catch (const efs::filesystem_error& e) {
+    // non-fatal here; cache writes will fail later if the directory is unusable
+    lderr(g_ceph_context) << "D3nDataCache: init: ERROR initializing the cache storage directory '" << cache_location <<
+                              "' : " << e.what() << dendl;
+  }
+
+  auto conf_eviction_policy = cct->_conf.get_val<std::string>("rgw_d3n_l1_eviction_policy");
+  ceph_assert(conf_eviction_policy == "lru" || conf_eviction_policy == "random");
+  if (conf_eviction_policy == "lru")
+    eviction_policy = _eviction_policy::LRU;
+  if (conf_eviction_policy == "random")
+    eviction_policy = _eviction_policy::RANDOM;
+
+#if defined(HAVE_LIBAIO) && defined(__GLIBC__)
+  // libaio setup
+  struct aioinit ainit{0};
+  ainit.aio_threads = cct->_conf.get_val<int64_t>("rgw_d3n_libaio_aio_threads");
+  ainit.aio_num = cct->_conf.get_val<int64_t>("rgw_d3n_libaio_aio_num");
+  ainit.aio_idle_time = 120;
+  aio_init(&ainit);
+#endif
+}
+
+int D3nDataCache::d3n_io_write(bufferlist& bl, unsigned int len, std::string oid)
+{
+  D3nChunkDataInfo* chunk_info = new D3nChunkDataInfo;
+  std::string location = cache_location + oid;
+
+  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl;
+  FILE *cache_file = nullptr;
+  int r = 0;
+  size_t nbytes = 0;
+
+  cache_file = fopen(location.c_str(), "w+");
+  if (cache_file == nullptr) {
+    ldout(cct, 0) << "ERROR: D3nDataCache::fopen file has return error, errno=" << errno << dendl;
+    return -errno;
+  }
+
+  nbytes = fwrite(bl.c_str(), 1, len, cache_file);
+  if (nbytes != len) {
+    ldout(cct, 0) << "ERROR: D3nDataCache::io_write: fwrite has returned error: nbytes!=len, nbytes=" << nbytes << ", len=" << len << dendl;
+    return -EIO;
+  }
+
+  r = fclose(cache_file);
+  if (r != 0) {
+    ldout(cct, 0) << "ERROR: D3nDataCache::fclsoe file has return error, errno=" << errno << dendl;
+    return -errno;
+  }
+
+  { // update cahce_map entries for new chunk in cache
+    const std::lock_guard l(d3n_cache_lock);
+    chunk_info->oid = oid;
+    chunk_info->set_ctx(cct);
+    chunk_info->size = len;
+    d3n_cache_map.insert(pair<string, D3nChunkDataInfo*>(oid, chunk_info));
+  }
+
+  return r;
+}
+
+// libaio SIGEV_THREAD completion trampoline: recover the request object
+// from the sigevent payload and hand it to its owning cache instance.
+void d3n_libaio_write_cb(sigval sigval)
+{
+  lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+  auto* req = static_cast<D3nCacheAioWriteRequest*>(sigval.sival_ptr);
+  req->priv_data->d3n_libaio_write_completion_cb(req);
+}
+
+
+// Completion handler for a libaio cache write: move the oid from the
+// in-flight set into the cache map, account the written bytes, insert the
+// new chunk at the LRU head, and free the request.  Takes d3n_cache_lock
+// and then d3n_eviction_lock, one at a time (never nested).
+void D3nDataCache::d3n_libaio_write_completion_cb(D3nCacheAioWriteRequest* c)
+{
+  D3nChunkDataInfo* chunk_info{nullptr};
+
+  ldout(cct, 5) << "D3nDataCache: " << __func__ << "(): oid=" << c->oid << dendl;
+
+  { // update cache_map entries for new chunk in cache
+    const std::lock_guard l(d3n_cache_lock);
+    d3n_outstanding_write_list.erase(c->oid);
+    chunk_info = new D3nChunkDataInfo;
+    chunk_info->oid = c->oid;
+    chunk_info->set_ctx(cct);
+    chunk_info->size = c->cb->aio_nbytes;
+    d3n_cache_map.insert(pair<string, D3nChunkDataInfo*>(c->oid, chunk_info));
+  }
+
+  { // update free size
+    const std::lock_guard l(d3n_eviction_lock);
+    free_data_cache_size -= c->cb->aio_nbytes;
+    outstanding_write_size -= c->cb->aio_nbytes;
+    lru_insert_head(chunk_info);
+  }
+  // the request owns the fd, staging buffer and aiocb; deleting it
+  // releases all of them
+  delete c;
+  c = nullptr;
+}
+
+// Build and submit a libaio write of 'bl' to the cache file for 'oid'.
+// Completion is delivered on a SIGEV_THREAD via d3n_libaio_write_cb.
+// Returns 0 on success; on failure the request is freed here and a
+// negative code is returned (the caller drops 'oid' from the in-flight
+// set).
+int D3nDataCache::d3n_libaio_create_write_request(bufferlist& bl, unsigned int len, std::string oid)
+{
+  lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "(): Write To Cache, oid=" << oid << ", len=" << len << dendl;
+  struct D3nCacheAioWriteRequest* wr = new struct D3nCacheAioWriteRequest(cct);
+  int r=0;
+  if ((r = wr->d3n_prepare_libaio_write_op(bl, len, oid, cache_location)) < 0) {
+    ldout(cct, 0) << "ERROR: D3nDataCache: " << __func__ << "() prepare libaio write op r=" << r << dendl;
+    goto done;
+  }
+  // deliver completion on a thread, carrying the request as payload
+  wr->cb->aio_sigevent.sigev_notify = SIGEV_THREAD;
+  wr->cb->aio_sigevent.sigev_notify_function = d3n_libaio_write_cb;
+  wr->cb->aio_sigevent.sigev_notify_attributes = nullptr;
+  wr->cb->aio_sigevent.sigev_value.sival_ptr = (void*)wr;
+  wr->oid = oid;
+  wr->priv_data = this;
+
+  if ((r = ::aio_write(wr->cb)) != 0) {
+    ldout(cct, 0) << "ERROR: D3nDataCache: " << __func__ << "() aio_write r=" << r << dendl;
+    goto error;
+  }
+  return 0;
+
+error:
+  delete wr;
+done:
+  return r;
+}
+
+void D3nDataCache::put(bufferlist& bl, unsigned int len, std::string& oid)
+{
+  size_t sr = 0;
+  uint64_t freed_size = 0, _free_data_cache_size = 0, _outstanding_write_size = 0;
+
+  ldout(cct, 10) << "D3nDataCache::" << __func__ << "(): oid=" << oid << ", len=" << len << dendl;
+  {
+    const std::lock_guard l(d3n_cache_lock);
+    std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.find(oid);
+    if (iter != d3n_cache_map.end()) {
+      ldout(cct, 10) << "D3nDataCache::" << __func__ << "(): data already cached, no rewrite" << dendl;
+      return;
+    }
+    auto it = d3n_outstanding_write_list.find(oid);
+    if (it != d3n_outstanding_write_list.end()) {
+      ldout(cct, 10) << "D3nDataCache: NOTE: data put in cache already issued, no rewrite" << dendl;
+      return;
+    }
+    d3n_outstanding_write_list.insert(oid);
+  }
+  {
+    const std::lock_guard l(d3n_eviction_lock);
+    _free_data_cache_size = free_data_cache_size;
+    _outstanding_write_size = outstanding_write_size;
+  }
+  ldout(cct, 20) << "D3nDataCache: Before eviction _free_data_cache_size:" << _free_data_cache_size << ", _outstanding_write_size:" << _outstanding_write_size << ", freed_size:" << freed_size << dendl;
+  while (len > (_free_data_cache_size - _outstanding_write_size + freed_size)) {
+    ldout(cct, 20) << "D3nDataCache: enter eviction" << dendl;
+    if (eviction_policy == _eviction_policy::LRU) {
+      sr = lru_eviction();
+    } else if (eviction_policy == _eviction_policy::RANDOM) {
+      sr = random_eviction();
+    } else {
+      ldout(cct, 0) << "D3nDataCache: Warning: unknown cache eviction policy, defaulting to lru eviction" << dendl;
+      sr = lru_eviction();
+    }
+    if (sr == 0) {
+      ldout(cct, 2) << "D3nDataCache: Warning: eviction was not able to free disk space, not writing to cache" << dendl;
+      d3n_outstanding_write_list.erase(oid);
+      return;
+    }
+    ldout(cct, 20) << "D3nDataCache: completed eviction of " << sr << " bytes" << dendl;
+    freed_size += sr;
+  }
+  int r = 0;
+  r = d3n_libaio_create_write_request(bl, len, oid);
+  if (r < 0) {
+    const std::lock_guard l(d3n_cache_lock);
+    d3n_outstanding_write_list.erase(oid);
+    ldout(cct, 1) << "D3nDataCache: create_aio_write_request fail, r=" << r << dendl;
+    return;
+  }
+
+  const std::lock_guard l(d3n_eviction_lock);
+  free_data_cache_size += freed_size;
+  outstanding_write_size += len;
+}
+
+// Return true iff 'oid' is cached on disk with exactly 'len' bytes.
+// A hit refreshes the entry's LRU position; a stale entry (backing file
+// missing or wrong size) is dropped from the map and LRU list.
+bool D3nDataCache::get(const string& oid, const off_t len)
+{
+  const std::lock_guard l(d3n_cache_lock);
+  bool exist = false;
+  string location = cache_location + oid;
+
+  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl;
+  std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.find(oid);
+  if (!(iter == d3n_cache_map.end())) {
+    // check inside cache whether file exists or not!!!! then make exist true;
+    struct D3nChunkDataInfo* chdo = iter->second;
+    struct stat st;
+    int r = stat(location.c_str(), &st);
+    if ( r != -1 && st.st_size == len) { // file exists and contains required data range length
+      exist = true;
+      /*LRU*/
+      /*get D3nChunkDataInfo*/
+      // refresh LRU position; d3n_eviction_lock is nested inside
+      // d3n_cache_lock here
+      const std::lock_guard l(d3n_eviction_lock);
+      lru_remove(chdo);
+      lru_insert_head(chdo);
+    } else {
+      // stale entry: drop it from the map and the LRU list
+      d3n_cache_map.erase(oid);
+      const std::lock_guard l(d3n_eviction_lock);
+      lru_remove(chdo);
+      delete chdo;
+      exist = false;
+    }
+  }
+  return exist;
+}
+
+size_t D3nDataCache::random_eviction()
+{
+  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl;
+  int n_entries = 0;
+  int random_index = 0;
+  size_t freed_size = 0;
+  D3nChunkDataInfo* del_entry;
+  string del_oid, location;
+  {
+    const std::lock_guard l(d3n_cache_lock);
+    n_entries = d3n_cache_map.size();
+    if (n_entries <= 0) {
+      return -1;
+    }
+    srand (time(NULL));
+    random_index = ceph::util::generate_random_number<int>(0, n_entries-1);
+    std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.begin();
+    std::advance(iter, random_index);
+    del_oid = iter->first;
+    del_entry =  iter->second;
+    ldout(cct, 20) << "D3nDataCache: random_eviction: index:" << random_index << ", free size: " << del_entry->size << dendl;
+    freed_size = del_entry->size;
+    delete del_entry;
+    del_entry = nullptr;
+    d3n_cache_map.erase(del_oid); // oid
+  }
+
+  location = cache_location + del_oid;
+  ::remove(location.c_str());
+  return freed_size;
+}
+
+size_t D3nDataCache::lru_eviction()
+{
+  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl;
+  int n_entries = 0;
+  size_t freed_size = 0;
+  D3nChunkDataInfo* del_entry;
+  string del_oid, location;
+
+  {
+    const std::lock_guard l(d3n_eviction_lock);
+    del_entry = tail;
+    if (del_entry == nullptr) {
+      ldout(cct, 2) << "D3nDataCache: lru_eviction: del_entry=null_ptr" << dendl;
+      return 0;
+    }
+    lru_remove(del_entry);
+  }
+
+  {
+    const std::lock_guard l(d3n_cache_lock);
+    n_entries = d3n_cache_map.size();
+    if (n_entries <= 0) {
+      ldout(cct, 2) << "D3nDataCache: lru_eviction: cache_map.size<=0" << dendl;
+      return -1;
+    }
+    del_oid = del_entry->oid;
+    ldout(cct, 20) << "D3nDataCache: lru_eviction: oid to remove: " << del_oid << dendl;
+    d3n_cache_map.erase(del_oid); // oid
+  }
+  freed_size = del_entry->size;
+  delete del_entry;
+  location = cache_location + del_oid;
+  ::remove(location.c_str());
+  return freed_size;
+}
diff --git a/src/rgw/driver/rados/rgw_d3n_datacache.h b/src/rgw/driver/rados/rgw_d3n_datacache.h
new file mode 100644 (file)
index 0000000..5d3537f
--- /dev/null
@@ -0,0 +1,261 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGWD3NDATACACHE_H
+#define CEPH_RGWD3NDATACACHE_H
+
+#include "rgw_rados.h"
+#include <curl/curl.h>
+
+#include "rgw_common.h"
+
+#include <unistd.h>
+#include <signal.h>
+#include "include/Context.h"
+#include "include/lru.h"
+#include "rgw_d3n_cacherequest.h"
+
+
+/*D3nDataCache*/
+struct D3nDataCache;
+
+
+struct D3nChunkDataInfo : public LRUObject {
+       CephContext *cct;
+       uint64_t size;
+       time_t access_time;
+       std::string address;
+       std::string oid;
+       bool complete;
+       struct D3nChunkDataInfo* lru_prev;
+       struct D3nChunkDataInfo* lru_next;
+
+       D3nChunkDataInfo(): size(0) {}
+
+       void set_ctx(CephContext *_cct) {
+               cct = _cct;
+       }
+
+       void dump(Formatter *f) const;
+       static void generate_test_instances(std::list<D3nChunkDataInfo*>& o);
+};
+
+struct D3nCacheAioWriteRequest {
+       std::string oid;
+       void *data;
+       int fd;
+       struct aiocb *cb;
+       D3nDataCache *priv_data;
+       CephContext *cct;
+
+       D3nCacheAioWriteRequest(CephContext *_cct) : cct(_cct) {}
+       int d3n_prepare_libaio_write_op(bufferlist& bl, unsigned int len, std::string oid, std::string cache_location);
+
+  ~D3nCacheAioWriteRequest() {
+    ::close(fd);
+               cb->aio_buf = nullptr;
+               free(data);
+               data = nullptr;
+               delete(cb);
+  }
+};
+
+struct D3nDataCache {
+
+private:
+  std::unordered_map<std::string, D3nChunkDataInfo*> d3n_cache_map;
+  std::set<std::string> d3n_outstanding_write_list;
+  std::mutex d3n_cache_lock;
+  std::mutex d3n_eviction_lock;
+
+  CephContext *cct;
+  enum class _io_type {
+    SYNC_IO = 1,
+    ASYNC_IO = 2,
+    SEND_FILE = 3
+  } io_type;
+  enum class _eviction_policy {
+    LRU=0, RANDOM=1
+  } eviction_policy;
+
+  struct sigaction action;
+  uint64_t free_data_cache_size = 0;
+  uint64_t outstanding_write_size = 0;
+  struct D3nChunkDataInfo* head;
+  struct D3nChunkDataInfo* tail;
+
+private:
+  void add_io();
+
+public:
+  D3nDataCache();
+  ~D3nDataCache() {
+    while (lru_eviction() > 0);
+  }
+
+  std::string cache_location;
+
+  bool get(const std::string& oid, const off_t len);
+  void put(bufferlist& bl, unsigned int len, std::string& obj_key);
+  int d3n_io_write(bufferlist& bl, unsigned int len, std::string oid);
+  int d3n_libaio_create_write_request(bufferlist& bl, unsigned int len, std::string oid);
+  void d3n_libaio_write_completion_cb(D3nCacheAioWriteRequest* c);
+  size_t random_eviction();
+  size_t lru_eviction();
+
+  void init(CephContext *_cct);
+
+  void lru_insert_head(struct D3nChunkDataInfo* o) {
+    lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+    o->lru_next = head;
+    o->lru_prev = nullptr;
+    if (head) {
+      head->lru_prev = o;
+    } else {
+      tail = o;
+    }
+    head = o;
+  }
+
+  void lru_insert_tail(struct D3nChunkDataInfo* o) {
+    lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+    o->lru_next = nullptr;
+    o->lru_prev = tail;
+    if (tail) {
+      tail->lru_next = o;
+    } else {
+      head = o;
+    }
+    tail = o;
+  }
+
+  void lru_remove(struct D3nChunkDataInfo* o) {
+    lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+    if (o->lru_next)
+      o->lru_next->lru_prev = o->lru_prev;
+    else
+      tail = o->lru_prev;
+    if (o->lru_prev)
+      o->lru_prev->lru_next = o->lru_next;
+    else
+      head = o->lru_next;
+    o->lru_next = o->lru_prev = nullptr;
+  }
+};
+
+
+template <class T>
+class D3nRGWDataCache : public T {
+
+public:
+  D3nRGWDataCache() {}
+
+  int init_rados() override {
+    int ret;
+    ret = T::init_rados();
+    if (ret < 0)
+      return ret;
+
+    return 0;
+  }
+
+  int get_obj_iterate_cb(const DoutPrefixProvider *dpp, const rgw_raw_obj& read_obj, off_t obj_ofs,
+                         off_t read_ofs, off_t len, bool is_head_obj,
+                         RGWObjState *astate, void *arg) override;
+};
+
+// Per-chunk read callback used while iterating an object's rados chunks.
+// Head chunks: run the atomic test, serve any prefetched head data, then
+// issue a librados read.  Tail chunks: serve from the local D3N cache
+// when possible, otherwise read from rados (bypassing the cache write for
+// compressed/encrypted/partial reads).
+template<typename T>
+int D3nRGWDataCache<T>::get_obj_iterate_cb(const DoutPrefixProvider *dpp, const rgw_raw_obj& read_obj, off_t obj_ofs,
+                                 off_t read_ofs, off_t len, bool is_head_obj,
+                                 RGWObjState *astate, void *arg) {
+  lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache::" << __func__ << "(): is head object : " << is_head_obj << dendl;
+  librados::ObjectReadOperation op;
+  struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
+  std::string oid, key;
+
+  if (is_head_obj) {
+    // only when reading from the head object do we need to do the atomic test
+    int r = T::append_atomic_test(dpp, astate, op);
+    if (r < 0)
+      return r;
+
+    // serve whatever part of the request overlaps the data already held
+    // in the head object's state, then fall through for the remainder
+    if (astate &&
+        obj_ofs < astate->data.length()) {
+      unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
+
+      r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
+      if (r < 0)
+        return r;
+
+      len -= chunk_len;
+      d->offset += chunk_len;
+      read_ofs += chunk_len;
+      obj_ofs += chunk_len;
+      if (!len)
+        return 0;
+    }
+
+    auto obj = d->rgwrados->svc.rados->obj(read_obj);
+    r = obj.open(dpp);
+    if (r < 0) {
+      lsubdout(g_ceph_context, rgw, 4) << "failed to open rados context for " << read_obj << dendl;
+      return r;
+    }
+
+    ldpp_dout(dpp, 20) << "D3nDataCache::" << __func__ << "(): oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
+    op.read(read_ofs, len, nullptr, nullptr);
+
+    const uint64_t cost = len;
+    const uint64_t id = obj_ofs; // use logical object offset for sorting replies
+
+    auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+    return d->flush(std::move(completed));
+  } else {
+    ldpp_dout(dpp, 20) << "D3nDataCache::" << __func__ << "(): oid=" << read_obj.oid << ", is_head_obj=" << is_head_obj << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << ", len=" << len << dendl;
+    int r;
+
+    op.read(read_ofs, len, nullptr, nullptr);
+
+    const uint64_t cost = len;
+    const uint64_t id = obj_ofs; // use logical object offset for sorting replies
+    oid = read_obj.oid;
+
+    auto obj = d->rgwrados->svc.rados->obj(read_obj);
+    r = obj.open(dpp);
+    if (r < 0) {
+      lsubdout(g_ceph_context, rgw, 0) << "D3nDataCache: Error: failed to open rados context for " << read_obj << ", r=" << r << dendl;
+      return r;
+    }
+
+    // NOTE(review): unlike the head branch, astate is dereferenced here
+    // without a null check — confirm callers always pass a valid astate
+    // for tail chunks
+    const bool is_compressed = (astate->attrset.find(RGW_ATTR_COMPRESSION) != astate->attrset.end());
+    const bool is_encrypted = (astate->attrset.find(RGW_ATTR_CRYPT_MODE) != astate->attrset.end());
+    // only whole, uncompressed, unencrypted chunks read from offset 0 are
+    // eligible for caching
+    if (read_ofs != 0 || astate->size != astate->accounted_size || is_compressed || is_encrypted) {
+      d->d3n_bypass_cache_write = true;
+      lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: " << __func__ << "(): Note - bypassing datacache: oid=" << read_obj.oid << ", read_ofs!=0 = " << read_ofs << ", size=" << astate->size << " != accounted_size=" << astate->accounted_size << ", is_compressed=" << is_compressed << ", is_encrypted=" << is_encrypted  << dendl;
+      auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+      r = d->flush(std::move(completed));
+      return r;
+    }
+
+    if (d->rgwrados->d3n_data_cache->get(oid, len)) {
+      // Read From Cache
+      ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): READ FROM CACHE: oid=" << read_obj.oid << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << ", len=" << len << dendl;
+      auto completed = d->aio->get(obj, rgw::Aio::d3n_cache_op(dpp, d->yield, read_ofs, len, d->rgwrados->d3n_data_cache->cache_location), cost, id);
+      r = d->flush(std::move(completed));
+      if (r < 0) {
+        lsubdout(g_ceph_context, rgw, 0) << "D3nDataCache: " << __func__ << "(): Error: failed to drain/flush, r= " << r << dendl;
+      }
+      return r;
+    } else {
+      // Write To Cache
+      ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): WRITE TO CACHE: oid=" << read_obj.oid << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << " len=" << len << dendl;
+      auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+      return d->flush(std::move(completed));
+    }
+  }
+  // unreachable: both branches above return
+  lsubdout(g_ceph_context, rgw, 1) << "D3nDataCache: " << __func__ << "(): Warning: Check head object cache handling flow, oid=" << read_obj.oid << dendl;
+
+  return 0;
+}
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc
new file mode 100644 (file)
index 0000000..47573b7
--- /dev/null
@@ -0,0 +1,6460 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/ceph_json.h"
+#include "common/RefCountedObj.h"
+#include "common/WorkQueue.h"
+#include "common/Throttle.h"
+#include "common/errno.h"
+
+#include "rgw_common.h"
+#include "rgw_zone.h"
+#include "rgw_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_cr_tools.h"
+#include "rgw_http_client.h"
+#include "rgw_bucket.h"
+#include "rgw_bucket_sync.h"
+#include "rgw_bucket_sync_cache.h"
+#include "rgw_datalog.h"
+#include "rgw_metadata.h"
+#include "rgw_sync_counters.h"
+#include "rgw_sync_error_repo.h"
+#include "rgw_sync_module.h"
+#include "rgw_sal.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sync_modules.h"
+#include "rgw_bucket.h"
+
+#include "include/common_fwd.h"
+#include "include/random.h"
+
+#include <boost/asio/yield.hpp>
+#include <string_view>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "data sync: ")
+
+using namespace std;
+
+static const string datalog_sync_status_oid_prefix = "datalog.sync-status";
+static const string datalog_sync_status_shard_prefix = "datalog.sync-status.shard";
+static const string datalog_sync_full_sync_index_prefix = "data.full-sync.index";
+static const string bucket_full_status_oid_prefix = "bucket.full-sync-status";
+static const string bucket_status_oid_prefix = "bucket.sync-status";
+static const string object_status_oid_prefix = "bucket.sync-status";
+
+// note: the remote endpoint reports the shard count under "num_objects"
+void rgw_datalog_info::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("num_objects", num_shards, obj);
+}
+
+void rgw_datalog_entry::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("key", key, obj);
+  // the wire format carries a utime_t; convert to real_time for storage
+  utime_t ut;
+  JSONDecoder::decode_json("timestamp", ut, obj);
+  timestamp = ut.to_real_time();
+}
+
+// Decode one datalog shard listing: pagination marker, truncation flag
+// and the entries themselves.
+void rgw_datalog_shard_data::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("marker", marker, obj);
+  JSONDecoder::decode_json("truncated", truncated, obj);
+  JSONDecoder::decode_json("entries", entries, obj);
+}
+
+// print a bucket shard with [gen]
+std::string to_string(const rgw_bucket_shard& bs, std::optional<uint64_t> gen)
+{
+  constexpr auto digits10 = std::numeric_limits<uint64_t>::digits10;
+  constexpr auto reserve = 2 + digits10; // room for "[<gen>]"
+  auto str = bs.get_key('/', ':', ':', reserve);
+  str += '[';
+  str += std::to_string(gen.value_or(0));
+  str += ']';
+  return str;
+}
+
+class RGWReadDataSyncStatusMarkersCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *env;
+  const int num_shards;
+  int shard_id{0};;
+
+  map<uint32_t, rgw_data_sync_marker>& markers;
+
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to read data sync status: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  RGWReadDataSyncStatusMarkersCR(RGWDataSyncCtx *sc, int num_shards,
+                                 map<uint32_t, rgw_data_sync_marker>& markers)
+    : RGWShardCollectCR(sc->cct, MAX_CONCURRENT_SHARDS),
+      sc(sc), env(sc->env), num_shards(num_shards), markers(markers)
+  {}
+  bool spawn_next() override;
+};
+
+// Spawn a read of the next shard's sync-status marker object; returns
+// false once all shards have been spawned.
+bool RGWReadDataSyncStatusMarkersCR::spawn_next()
+{
+  if (shard_id >= num_shards) {
+    return false;
+  }
+  using CR = RGWSimpleRadosReadCR<rgw_data_sync_marker>;
+  spawn(new CR(env->dpp, env->async_rados, env->svc->sysobj,
+               rgw_raw_obj(env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id)),
+               &markers[shard_id]),
+        false);
+  shard_id++;
+  return true;
+}
+
+// Collect coroutine: for each shard, lists the omap keys of the shard's
+// ".retry" error-repo object (up to max_entries per shard); results land
+// in 'omapkeys', indexed by shard.  At most MAX_CONCURRENT_SHARDS
+// listings are in flight at once.
+class RGWReadDataSyncRecoveringShardsCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *env;
+
+  uint64_t max_entries;
+  int num_shards;
+  int shard_id{0}; // next shard to spawn
+
+  string marker;
+  std::vector<RGWRadosGetOmapKeysCR::ResultPtr>& omapkeys;
+
+  // a missing retry object (ENOENT) just means no recovering entries
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to list recovering data sync: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  RGWReadDataSyncRecoveringShardsCR(RGWDataSyncCtx *sc, uint64_t _max_entries, int _num_shards,
+                                    std::vector<RGWRadosGetOmapKeysCR::ResultPtr>& omapkeys)
+    : RGWShardCollectCR(sc->cct, MAX_CONCURRENT_SHARDS), sc(sc), env(sc->env),
+      max_entries(_max_entries), num_shards(_num_shards), omapkeys(omapkeys)
+  {}
+  bool spawn_next() override;
+};
+
+// Spawn the omap-keys listing for the next shard's ".retry" object;
+// returns false once all shards have been spawned.
+bool RGWReadDataSyncRecoveringShardsCR::spawn_next()
+{
+  if (shard_id >= num_shards)
+    return false;
+  string error_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id) + ".retry";
+  auto& shard_keys = omapkeys[shard_id];
+  shard_keys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
+  spawn(new RGWRadosGetOmapKeysCR(env->driver, rgw_raw_obj(env->svc->zone->get_zone_params().log_pool, error_oid),
+                                  marker, max_entries, shard_keys), false);
+
+  ++shard_id;
+  return true;
+}
+
+// Reads the full data sync status for a source zone: the top-level
+// rgw_data_sync_info followed by every shard's marker.
+class RGWReadDataSyncStatusCoroutine : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw_data_sync_status *sync_status; // out: filled by operate()
+
+public:
+  RGWReadDataSyncStatusCoroutine(RGWDataSyncCtx *_sc,
+                                 rgw_data_sync_status *_status)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(sc->env), sync_status(_status)
+  {}
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Two-step coroutine body: first read the sync-info object, then fan out
+// to read all per-shard markers via RGWReadDataSyncStatusMarkersCR.
+int RGWReadDataSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    // read sync info
+    using ReadInfoCR = RGWSimpleRadosReadCR<rgw_data_sync_info>;
+    yield {
+      bool empty_on_enoent = false; // fail on ENOENT
+      call(new ReadInfoCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                          rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone)),
+                          &sync_status->sync_info, empty_on_enoent));
+    }
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "failed to read sync status info with "
+          << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+    // read shard markers
+    using ReadMarkersCR = RGWReadDataSyncStatusMarkersCR;
+    yield call(new ReadMarkersCR(sc, sync_status->sync_info.num_shards,
+                                 sync_status->sync_markers));
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "failed to read sync status markers with "
+          << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// Fetches a single remote datalog shard's info via
+// GET /admin/log?type=data&id=<shard>&info.  Two-phase coroutine:
+// issue the async REST read, then collect the parsed result.
+class RGWReadRemoteDataLogShardInfoCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  RGWRESTReadResource *http_op; // refcounted; released in the destructor
+
+  int shard_id;
+  RGWDataChangesLogInfo *shard_info; // out: filled on success
+
+public:
+  RGWReadRemoteDataLogShardInfoCR(RGWDataSyncCtx *_sc,
+                                  int _shard_id, RGWDataChangesLogInfo *_shard_info) : RGWCoroutine(_sc->cct),
+                                                      sc(_sc),
+                                                      sync_env(_sc->env),
+                                                      http_op(NULL),
+                                                      shard_id(_shard_id),
+                                                      shard_info(_shard_info) {
+  }
+
+  ~RGWReadRemoteDataLogShardInfoCR() override {
+    if (http_op) {
+      http_op->put();
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield {
+       char buf[16];
+       snprintf(buf, sizeof(buf), "%d", shard_id);
+        rgw_http_param_pair pairs[] = { { "type" , "data" },
+                                       { "id", buf },
+                                       { "info" , NULL },
+                                       { NULL, NULL } };
+
+        string p = "/admin/log/";
+
+        http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
+
+        init_new_io(http_op);
+
+        int ret = http_op->aio_read(dpp);
+        if (ret < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+          return set_cr_error(ret);
+        }
+
+        return io_block(0);
+      }
+      yield {
+        // resumed once the http op completes; wait() parses the response
+        int ret = http_op->wait(shard_info, null_yield);
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        return set_cr_done();
+      }
+    }
+    return 0;
+  }
+};
+
+// Response payload of GET /admin/log?type=data&id=<shard>: one page of
+// datalog entries plus pagination state.
+struct read_remote_data_log_response {
+  string marker;
+  bool truncated;
+  vector<rgw_data_change_log_entry> entries;
+
+  read_remote_data_log_response() : truncated(false) {}
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("marker", marker, obj);
+    JSONDecoder::decode_json("truncated", truncated, obj);
+    JSONDecoder::decode_json("entries", entries, obj);
+  }
+};
+
+// Coroutine that lists one shard of a remote zone's datalog via
+// "GET /admin/log/?type=data&id=<shard>&marker=...&extra-info=true" and hands
+// the entries, next marker and truncated flag back through output pointers.
+class RGWReadRemoteDataLogShardCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  RGWRESTReadResource *http_op = nullptr;  // owned; released via put() in dtor
+
+  int shard_id;
+  const std::string& marker;               // caller-owned; must outlive this CR
+  string *pnext_marker;                    // out: marker to resume from
+  vector<rgw_data_change_log_entry> *entries;  // out: listed entries
+  bool *truncated;                         // out: more entries remain
+
+  read_remote_data_log_response response;
+  std::optional<TOPNSPC::common::PerfGuard> timer;  // measures poll latency
+
+public:
+  RGWReadRemoteDataLogShardCR(RGWDataSyncCtx *_sc, int _shard_id,
+                              const std::string& marker, string *pnext_marker,
+                              vector<rgw_data_change_log_entry> *_entries,
+                              bool *_truncated)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      shard_id(_shard_id), marker(marker), pnext_marker(pnext_marker),
+      entries(_entries), truncated(_truncated) {
+  }
+  ~RGWReadRemoteDataLogShardCR() override {
+    // drop our reference to the REST op if wait() was never reached
+    if (http_op) {
+      http_op->put();
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      // phase 1: issue the async REST read and block until completion
+      yield {
+       char buf[16];
+       snprintf(buf, sizeof(buf), "%d", shard_id);
+        rgw_http_param_pair pairs[] = { { "type" , "data" },
+                                       { "id", buf },
+                                       { "marker", marker.c_str() },
+                                       { "extra-info", "true" },
+                                       { NULL, NULL } };
+
+        string p = "/admin/log/";
+
+        http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
+
+        init_new_io(http_op);
+
+        // time the round trip while the request is in flight
+        if (sync_env->counters) {
+          timer.emplace(sync_env->counters, sync_counters::l_poll);
+        }
+        int ret = http_op->aio_read(dpp);
+        if (ret < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+          if (sync_env->counters) {
+            sync_env->counters->inc(sync_counters::l_poll_err);
+          }
+          return set_cr_error(ret);
+        }
+
+        return io_block(0);
+      }
+      // phase 2: collect and decode the response, publish the outputs
+      yield {
+        timer.reset();
+        int ret = http_op->wait(&response, null_yield);
+        if (ret < 0) {
+          // ENOENT (missing shard) is not counted as a poll error
+          if (sync_env->counters && ret != -ENOENT) {
+            sync_env->counters->inc(sync_counters::l_poll_err);
+          }
+          return set_cr_error(ret);
+        }
+        entries->clear();
+        entries->swap(response.entries);  // avoid copying the entry vector
+        *pnext_marker = response.marker;
+        *truncated = response.truncated;
+        return set_cr_done();
+      }
+    }
+    return 0;
+  }
+};
+
+// Fans out one RGWReadRemoteDataLogShardInfoCR per datalog shard (up to
+// READ_DATALOG_MAX_CONCURRENT in flight) and gathers the results into a map.
+class RGWReadRemoteDataLogInfoCR : public RGWShardCollectCR {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  int num_shards;
+  map<int, RGWDataChangesLogInfo> *datalog_info;  // out: shard id -> log info
+
+  int shard_id;  // next shard index to spawn
+#define READ_DATALOG_MAX_CONCURRENT 10
+
+  // Per-child completion hook: a missing shard is tolerated, other failures
+  // are logged and propagated.
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to fetch remote datalog info: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+public:
+  RGWReadRemoteDataLogInfoCR(RGWDataSyncCtx *_sc,
+                     int _num_shards,
+                     map<int, RGWDataChangesLogInfo> *_datalog_info) : RGWShardCollectCR(_sc->cct, READ_DATALOG_MAX_CONCURRENT),
+                                                                 sc(_sc), sync_env(_sc->env), num_shards(_num_shards),
+                                                                 datalog_info(_datalog_info), shard_id(0) {}
+  bool spawn_next() override;
+};
+
+// Spawn an info-reader for the next shard; returns false once all shards
+// have been dispatched.
+bool RGWReadRemoteDataLogInfoCR::spawn_next() {
+  if (shard_id < num_shards) {
+    auto& info_slot = (*datalog_info)[shard_id];
+    spawn(new RGWReadRemoteDataLogShardInfoCR(sc, shard_id, &info_slot), false);
+    ++shard_id;
+    return true;
+  }
+  return false;
+}
+
+// Simple (send/complete) coroutine that lists up to max_entries from one
+// remote datalog shard starting at an optional marker.
+class RGWListRemoteDataLogShardCR : public RGWSimpleCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  RGWRESTReadResource *http_op;  // created in send_request(), put() on completion
+
+  int shard_id;
+  string marker;
+  uint32_t max_entries;
+  rgw_datalog_shard_data *result;  // out: decoded shard listing
+
+public:
+  RGWListRemoteDataLogShardCR(RGWDataSyncCtx *sc, int _shard_id,
+                              const string& _marker, uint32_t _max_entries,
+                              rgw_datalog_shard_data *_result)
+    : RGWSimpleCoroutine(sc->cct), sc(sc), sync_env(sc->env), http_op(NULL),
+      shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
+
+  // Issue the async REST read; request_complete() is only invoked on success.
+  int send_request(const DoutPrefixProvider *dpp) override {
+    RGWRESTConn *conn = sc->conn;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%d", shard_id);
+
+    char max_entries_buf[32];
+    snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
+
+    // an empty key makes the pair a no-op so "marker" is omitted when unset
+    const char *marker_key = (marker.empty() ? "" : "marker");
+
+    rgw_http_param_pair pairs[] = { { "type", "data" },
+      { "id", buf },
+      { "max-entries", max_entries_buf },
+      { marker_key, marker.c_str() },
+      { NULL, NULL } };
+
+    string p = "/admin/log/";
+
+    http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+    init_new_io(http_op);
+
+    int ret = http_op->aio_read(dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+      log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+      http_op->put();  // release here since request_complete() won't run
+      return ret;
+    }
+
+    return 0;
+  }
+
+  // Wait for the response, decode into *result, then release the op.
+  int request_complete() override {
+    int ret = http_op->wait(result, null_yield);
+    http_op->put();
+    if (ret < 0 && ret != -ENOENT) {  // a missing shard is not an error
+      ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl;
+      return ret;
+    }
+    return 0;
+  }
+};
+
+// Fans out one RGWListRemoteDataLogShardCR per (shard, marker) pair, up to
+// READ_DATALOG_MAX_CONCURRENT at a time, collecting results per shard.
+class RGWListRemoteDataLogCR : public RGWShardCollectCR {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  map<int, string> shards;  // shard id -> resume marker (swapped in from caller)
+  int max_entries_per_shard;
+  map<int, rgw_datalog_shard_data> *result;  // out: shard id -> listing
+
+  map<int, string>::iterator iter;  // next shard to spawn
+#define READ_DATALOG_MAX_CONCURRENT 10
+
+  // Per-child completion hook: tolerate missing shards, log real failures.
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to list remote datalog: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+public:
+  // NOTE: the constructor empties _shards (contents are swapped, not copied).
+  RGWListRemoteDataLogCR(RGWDataSyncCtx *_sc,
+                     map<int, string>& _shards,
+                     int _max_entries_per_shard,
+                     map<int, rgw_datalog_shard_data> *_result) : RGWShardCollectCR(_sc->cct, READ_DATALOG_MAX_CONCURRENT),
+                                                                 sc(_sc), sync_env(_sc->env), max_entries_per_shard(_max_entries_per_shard),
+                                                                 result(_result) {
+    shards.swap(_shards);
+    iter = shards.begin();
+  }
+  bool spawn_next() override;
+};
+
+// Spawn a listing coroutine for the next shard/marker pair, if any remain.
+bool RGWListRemoteDataLogCR::spawn_next() {
+  if (iter != shards.end()) {
+    const int shard = iter->first;
+    spawn(new RGWListRemoteDataLogShardCR(sc, shard, iter->second, max_entries_per_shard, &(*result)[shard]), false);
+    ++iter;
+    return true;
+  }
+  return false;
+}
+
+// Initializes the persistent data-sync status for a source zone: takes a
+// lock, writes the top-level sync info object, snapshots the current marker
+// of every remote datalog shard into per-shard status objects, advances the
+// state to StateBuildingFullSyncMaps, then releases the lock.
+class RGWInitDataSyncStatusCoroutine : public RGWCoroutine {
+  static constexpr uint32_t lock_duration = 30;  // seconds; lease on status obj
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw::sal::RadosStore* driver; // RGWDataSyncEnv also has a pointer to driver
+  const rgw_pool& pool;         // zone log pool holding the status objects
+  const uint32_t num_shards;
+
+  string sync_status_oid;
+
+  string lock_name;
+  string cookie;                // random cookie identifying our lock holder
+  rgw_data_sync_status *status; // out: populated sync status
+  map<int, RGWDataChangesLogInfo> shards_info;  // remote per-shard positions
+
+  RGWSyncTraceNodeRef tn;
+public:
+  RGWInitDataSyncStatusCoroutine(RGWDataSyncCtx *_sc, uint32_t num_shards,
+                                 uint64_t instance_id,
+                                 RGWSyncTraceNodeRef& _tn_parent,
+                                 rgw_data_sync_status *status)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), driver(sync_env->driver),
+      pool(sync_env->svc->zone->get_zone_params().log_pool),
+      num_shards(num_shards), status(status),
+      tn(sync_env->sync_tracer->add_node(_tn_parent, "init_data_sync_status")) {
+    lock_name = "sync_lock";
+
+    status->sync_info.instance_id = instance_id;
+
+#define COOKIE_LEN 16
+    char buf[COOKIE_LEN + 1];
+
+    gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
+    cookie = buf;
+
+    sync_status_oid = RGWDataSyncStatusManager::sync_status_oid(sc->source_zone);
+
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    int ret;
+    reenter(this) {
+      using LockCR = RGWSimpleRadosLockCR;
+      // take the lock before (re)creating the status object
+      yield call(new LockCR(sync_env->async_rados, driver,
+                            rgw_raw_obj{pool, sync_status_oid},
+                            lock_name, cookie, lock_duration));
+      if (retcode < 0) {
+        tn->log(0, SSTR("ERROR: failed to take a lock on " << sync_status_oid));
+        return set_cr_error(retcode);
+      }
+      using WriteInfoCR = RGWSimpleRadosWriteCR<rgw_data_sync_info>;
+      // writing recreates the object, which drops the lock we just took
+      yield call(new WriteInfoCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                                 rgw_raw_obj{pool, sync_status_oid},
+                                 status->sync_info));
+      if (retcode < 0) {
+        tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode));
+        return set_cr_error(retcode);
+      }
+
+      /* take lock again, we just recreated the object */
+      yield call(new LockCR(sync_env->async_rados, driver,
+                            rgw_raw_obj{pool, sync_status_oid},
+                            lock_name, cookie, lock_duration));
+      if (retcode < 0) {
+        tn->log(0, SSTR("ERROR: failed to take a lock on " << sync_status_oid));
+        return set_cr_error(retcode);
+      }
+
+      tn->log(10, "took lease");
+
+      /* fetch current position in logs */
+      yield {
+        RGWRESTConn *conn = sync_env->svc->zone->get_zone_conn(sc->source_zone);
+        if (!conn) {
+          tn->log(0, SSTR("ERROR: connection to zone " << sc->source_zone << " does not exist!"));
+          return set_cr_error(-EIO);
+        }
+        // spawn one info reader per shard; collected below
+        for (uint32_t i = 0; i < num_shards; i++) {
+          spawn(new RGWReadRemoteDataLogShardInfoCR(sc, i, &shards_info[i]), true);
+        }
+      }
+      while (collect(&ret, NULL)) {
+        if (ret < 0) {
+          tn->log(0, SSTR("ERROR: failed to read remote data log shards"));
+          return set_state(RGWCoroutine_Error);
+        }
+        yield;
+      }
+      // persist each shard's starting marker/timestamp as its sync marker
+      yield {
+        for (uint32_t i = 0; i < num_shards; i++) {
+          RGWDataChangesLogInfo& info = shards_info[i];
+          auto& marker = status->sync_markers[i];
+          marker.next_step_marker = info.marker;
+          marker.timestamp = info.last_update;
+          const auto& oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, i);
+          using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_data_sync_marker>;
+          spawn(new WriteMarkerCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                                  rgw_raw_obj{pool, oid}, marker), true);
+        }
+      }
+      while (collect(&ret, NULL)) {
+        if (ret < 0) {
+          tn->log(0, SSTR("ERROR: failed to write data sync status markers"));
+          return set_state(RGWCoroutine_Error);
+        }
+        yield;
+      }
+
+      // advance the sync state machine and persist it
+      status->sync_info.state = rgw_data_sync_info::StateBuildingFullSyncMaps;
+      yield call(new WriteInfoCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                                 rgw_raw_obj{pool, sync_status_oid},
+                                 status->sync_info));
+      if (retcode < 0) {
+        tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode));
+        return set_cr_error(retcode);
+      }
+      yield call(new RGWSimpleRadosUnlockCR(sync_env->async_rados, driver,
+                                            rgw_raw_obj{pool, sync_status_oid},
+                                            lock_name, cookie));
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Construct the remote-datalog manager; the coroutine manager base shares the
+// driver's completion registry, and the HTTP manager is bound to this
+// manager's completion queue.  Actual wiring to a source zone happens in init().
+RGWRemoteDataLog::RGWRemoteDataLog(const DoutPrefixProvider *dpp,
+                                   rgw::sal::RadosStore* driver,
+                                   RGWAsyncRadosProcessor *async_rados)
+  : RGWCoroutinesManager(driver->ctx(), driver->getRados()->get_cr_registry()),
+      dpp(dpp), driver(driver),
+      cct(driver->ctx()), cr_registry(driver->getRados()->get_cr_registry()),
+      async_rados(async_rados),
+      http_manager(driver->ctx(), completion_mgr),
+      data_sync_cr(NULL),
+      initialized(false)
+{
+}
+
+int RGWRemoteDataLog::read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info)
+{
+  rgw_http_param_pair pairs[] = { { "type", "data" },
+                                  { NULL, NULL } };
+
+  int ret = sc.conn->get_json_resource(dpp, "/admin/log", pairs, null_yield, *log_info);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to fetch datalog info" << dendl;
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20) << "remote datalog, num_shards=" << log_info->num_shards << dendl;
+
+  return 0;
+}
+
+// Read per-shard info (current marker/timestamp) for every shard of the
+// remote datalog: query the shard count first, then fan out one info request
+// per shard via RGWReadRemoteDataLogInfoCR.
+int RGWRemoteDataLog::read_source_log_shards_info(const DoutPrefixProvider *dpp, map<int, RGWDataChangesLogInfo> *shards_info)
+{
+  rgw_datalog_info log_info;
+  if (const int r = read_log_info(dpp, &log_info); r < 0) {
+    return r;
+  }
+  return run(dpp, new RGWReadRemoteDataLogInfoCR(&sc, log_info.num_shards, shards_info));
+}
+
+// List the next entry (max-entries=1) after each given shard marker.
+int RGWRemoteDataLog::read_source_log_shards_next(const DoutPrefixProvider *dpp, map<int, string> shard_markers, map<int, rgw_datalog_shard_data> *result)
+{
+  auto list_cr = new RGWListRemoteDataLogCR(&sc, shard_markers, 1, result);
+  return run(dpp, list_cr);
+}
+
+// Bind this manager to a source zone/connection.  The env/ctx are refreshed
+// on every call; the HTTP manager and trace node are set up only once
+// (guarded by `initialized`).
+int RGWRemoteDataLog::init(const rgw_zone_id& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger,
+                           RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& _sync_module,
+                           PerfCounters* counters)
+{
+  sync_env.init(dpp, cct, driver, driver->svc(), async_rados, &http_manager, _error_logger,
+                _sync_tracer, _sync_module, counters);
+  sc.init(&sync_env, _conn, _source_zone);
+
+  if (initialized) {
+    return 0;
+  }
+
+  // one-time setup: start the HTTP manager thread
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+
+  tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "data");
+
+  initialized = true;
+
+  return 0;
+}
+
+// Shut down: stop the coroutine manager (and any running sync coroutines).
+void RGWRemoteDataLog::finish()
+{
+  stop();
+}
+
+// Read the persisted data sync status.  Runs in a private coroutine manager
+// with local copies of the env/ctx pointed at a temporary HTTP manager, so it
+// can execute concurrently with run_sync().
+int RGWRemoteDataLog::read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status)
+{
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(cct, cr_registry);
+  RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+
+  RGWDataSyncCtx sc_local = sc;
+  sc_local.env = &sync_env_local;
+
+  ret = crs.run(dpp, new RGWReadDataSyncStatusCoroutine(&sc_local, sync_status));
+  http_manager.stop();
+  return ret;
+}
+
+int RGWRemoteDataLog::read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, set<int>& recovering_shards)
+{
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(cct, cr_registry);
+  RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+
+  RGWDataSyncCtx sc_local = sc;
+  sc_local.env = &sync_env_local;
+
+  std::vector<RGWRadosGetOmapKeysCR::ResultPtr> omapkeys;
+  omapkeys.resize(num_shards);
+  uint64_t max_entries{1};
+
+  ret = crs.run(dpp, new RGWReadDataSyncRecoveringShardsCR(&sc_local, max_entries, num_shards, omapkeys));
+  http_manager.stop();
+
+  if (ret == 0) {
+    for (int i = 0; i < num_shards; i++) {
+      if (omapkeys[i]->entries.size() != 0) {
+        recovering_shards.insert(i);
+      }
+    }
+  }
+
+  return ret;
+}
+
+// (Re)initialize the persisted sync status for this source zone with a fresh
+// random instance id, running RGWInitDataSyncStatusCoroutine in a private
+// coroutine manager.
+int RGWRemoteDataLog::init_sync_status(const DoutPrefixProvider *dpp, int num_shards)
+{
+  rgw_data_sync_status sync_status;
+  sync_status.sync_info.num_shards = num_shards;
+
+  RGWCoroutinesManager crs(cct, cr_registry);
+  RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+  // random instance id distinguishes this initialization from earlier ones
+  auto instance_id = ceph::util::generate_random_number<uint64_t>();
+  RGWDataSyncCtx sc_local = sc;
+  sc_local.env = &sync_env_local;
+  ret = crs.run(dpp, new RGWInitDataSyncStatusCoroutine(&sc_local, num_shards, instance_id, tn, &sync_status));
+  http_manager.stop();
+  return ret;
+}
+
+// Build the rados object name holding the full-sync index of one datalog
+// shard: "<prefix>.<source_zone_id>.<shard_id>".
+static string full_data_sync_index_shard_oid(const rgw_zone_id& source_zone, int shard_id)
+{
+  // string concatenation instead of snprintf() into a hand-sized stack
+  // buffer, whose size arithmetic silently breaks if the format changes
+  return datalog_sync_full_sync_index_prefix + "." + source_zone.id + "." +
+      std::to_string(shard_id);
+}
+
+// JSON response body of a metadata listing ("/admin/metadata/..."): one page
+// of keys plus pagination state.
+struct read_metadata_list {
+  string marker;
+  bool truncated = false;
+  list<string> keys;
+  int count = 0;
+
+  read_metadata_list() = default;
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("marker", marker, obj);
+    JSONDecoder::decode_json("truncated", truncated, obj);
+    JSONDecoder::decode_json("keys", keys, obj);
+    JSONDecoder::decode_json("count", count, obj);
+  }
+};
+
+// One bucket.instance metadata entry as returned by the metadata REST API:
+// key, version, mtime and the decoded bucket-instance object.
+struct bucket_instance_meta_info {
+  string key;
+  obj_version ver;
+  utime_t mtime;
+  RGWBucketInstanceMetadataObject data;
+
+  bucket_instance_meta_info() = default;
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("key", key, obj);
+    JSONDecoder::decode_json("ver", ver, obj);
+    JSONDecoder::decode_json("mtime", mtime, obj);
+    JSONDecoder::decode_json("data", data, obj);
+  }
+};
+
+// Coroutine that fetches a remote bucket's index-log marker info via
+// "GET /admin/log/?type=bucket-index&bucket-instance=<key>&info".
+class RGWReadRemoteBucketIndexLogInfoCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  const string instance_key;  // "<bucket>:<instance>" key of the bucket
+
+  rgw_bucket_index_marker_info *info;  // out: decoded marker info
+
+public:
+  RGWReadRemoteBucketIndexLogInfoCR(RGWDataSyncCtx *_sc,
+                                   const rgw_bucket& bucket,
+                                   rgw_bucket_index_marker_info *_info)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      instance_key(bucket.get_key()), info(_info) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield {
+        rgw_http_param_pair pairs[] = { { "type" , "bucket-index" },
+                                       { "bucket-instance", instance_key.c_str() },
+                                       { "info" , NULL },
+                                       { NULL, NULL } };
+
+        string p = "/admin/log/";
+        // the nested CR handles send/wait/decode for us
+        call(new RGWReadRESTResourceCR<rgw_bucket_index_marker_info>(sync_env->cct, sc->conn, sync_env->http_manager, p, pairs, info));
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+
+class RGWListBucketIndexesCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env = sc->env;
+
+  rgw::sal::RadosStore* driver = sync_env->driver;
+
+  rgw_data_sync_status *sync_status;
+
+  int req_ret = 0;
+  int ret = 0;
+
+  list<string>::iterator iter;
+
+  unique_ptr<RGWShardedOmapCRManager> entries_index;
+  string oid_prefix =
+    datalog_sync_full_sync_index_prefix + "." + sc->source_zone.id;
+
+  string path = "/admin/metadata/bucket.instance";
+  bucket_instance_meta_info meta_info;
+  string key;
+
+  bool failed = false;
+  bool truncated = false;
+  read_metadata_list result;
+
+public:
+  RGWListBucketIndexesCR(RGWDataSyncCtx* sc,
+                         rgw_data_sync_status* sync_status)
+    : RGWCoroutine(sc->cct), sc(sc), sync_status(sync_status) {}
+  ~RGWListBucketIndexesCR() override { }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      entries_index = std::make_unique<RGWShardedOmapCRManager>(
+       sync_env->async_rados, driver, this,
+       cct->_conf->rgw_data_log_num_shards,
+       sync_env->svc->zone->get_zone_params().log_pool,
+       oid_prefix);
+      yield; // yield so OmapAppendCRs can start
+
+      do {
+        yield {
+          string entrypoint = "/admin/metadata/bucket.instance"s;
+
+          rgw_http_param_pair pairs[] = {{"max-entries", "1000"},
+                                         {"marker", result.marker.c_str()},
+                                         {NULL, NULL}};
+
+          call(new RGWReadRESTResourceCR<read_metadata_list>(
+                sync_env->cct, sc->conn, sync_env->http_manager,
+                entrypoint, pairs, &result));
+       }
+       if (retcode < 0) {
+         ldpp_dout(dpp, 0)
+           << "ERROR: failed to fetch metadata for section bucket.instance"
+           << dendl;
+          return set_cr_error(retcode);
+        }
+
+        for (iter = result.keys.begin(); iter != result.keys.end(); ++iter) {
+          ldpp_dout(dpp, 20) << "list metadata: section=bucket.instance key="
+                            << *iter << dendl;
+          key = *iter;
+
+          yield {
+            rgw_http_param_pair pairs[] = {{"key", key.c_str()},
+                                           {NULL, NULL}};
+
+            call(new RGWReadRESTResourceCR<bucket_instance_meta_info>(
+                  sync_env->cct, sc->conn, sync_env->http_manager, path, pairs,
+                  &meta_info));
+          }
+         if (retcode < 0) {
+           ldpp_dout(dpp, 0) << "ERROR: failed to fetch metadata for key: "
+                             << key << dendl;
+           return set_cr_error(retcode);
+         }
+         // Now that bucket full sync is bucket-wide instead of
+         // per-shard, we only need to register a single shard of
+         // each bucket to guarantee that sync will see everything
+         // that happened before data full sync starts. This also
+         // means we don't have to care about the bucket's current
+         // shard count.
+         yield entries_index->append(
+           fmt::format("{}:{}", key, 0),
+           sync_env->svc->datalog_rados->get_log_shard_id(
+             meta_info.data.get_bucket_info().bucket, 0));
+       }
+       truncated = result.truncated;
+      } while (truncated);
+
+      yield {
+        if (!entries_index->finish()) {
+          failed = true;
+        }
+      }
+      if (!failed) {
+        for (auto iter = sync_status->sync_markers.begin();
+            iter != sync_status->sync_markers.end();
+            ++iter) {
+          int shard_id = (int)iter->first;
+          rgw_data_sync_marker& marker = iter->second;
+          marker.total_entries = entries_index->get_total_entries(shard_id);
+          spawn(new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(
+                 dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                 rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool,
+                             RGWDataSyncStatusManager::shard_obj_name(
+                               sc->source_zone, shard_id)),
+                 marker),
+               true);
+       }
+      } else {
+        yield call(sync_env->error_logger->log_error_cr(
+                    dpp, sc->conn->get_remote_id(), "data.init", "",
+                    EIO, string("failed to build bucket instances map")));
+      }
+      while (collect(&ret, NULL)) {
+       if (ret < 0) {
+          yield call(sync_env->error_logger->log_error_cr(
+                      dpp, sc->conn->get_remote_id(), "data.init", "",
+                      -ret, string("failed to driver sync status: ") +
+                      cpp_strerror(-ret)));
+         req_ret = ret;
+       }
+       yield;
+      }
+      drain_all();
+      if (req_ret < 0) {
+        yield return set_cr_error(req_ret);
+      }
+       yield return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+#define DATA_SYNC_UPDATE_MARKER_WINDOW 1
+
+// Tracks the high-water sync marker of one data sync shard and persists it
+// (batched every DATA_SYNC_UPDATE_MARKER_WINDOW completions) to the shard's
+// status object in the log pool.
+class RGWDataSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  string marker_oid;              // status object to write the marker into
+  rgw_data_sync_marker sync_marker;
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWDataSyncShardMarkerTrack(RGWDataSyncCtx *_sc,
+                         const string& _marker_oid,
+                         const rgw_data_sync_marker& _marker,
+                         RGWSyncTraceNodeRef& _tn) : RGWSyncShardMarkerTrack(DATA_SYNC_UPDATE_MARKER_WINDOW),
+                                                                sc(_sc), sync_env(_sc->env),
+                                                                marker_oid(_marker_oid),
+                                                                sync_marker(_marker),
+                                                                tn(_tn) {}
+
+  // Persist the advanced marker; called by the base class when the window
+  // allows an update.
+  RGWCoroutine* store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+    sync_marker.marker = new_marker;
+    sync_marker.pos = index_pos;
+    sync_marker.timestamp = timestamp;
+
+    tn->log(20, SSTR("updating marker marker_oid=" << marker_oid << " marker=" << new_marker));
+
+    return new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(sync_env->dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                                                           rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, marker_oid),
+                                                           sync_marker);
+  }
+
+  // Concurrent marker writes may race; last caller wins.
+  RGWOrderCallCR *allocate_order_control_cr() override {
+    return new RGWLastCallerWinsCR(sync_env->cct);
+  }
+};
+
+// ostream wrappers to print buckets without copying strings
+struct bucket_str {
+  const rgw_bucket& b;
+  explicit bucket_str(const rgw_bucket& b) : b(b) {}
+};
+// Format as "[tenant/]name[:bucket_id]" without copying any strings.
+std::ostream& operator<<(std::ostream& out, const bucket_str& rhs) {
+  const auto& bucket = rhs.b;
+  if (!bucket.tenant.empty()) {
+    out << bucket.tenant << '/';
+  }
+  out << bucket.name;
+  if (!bucket.bucket_id.empty()) {
+    out << ':' << bucket.bucket_id;
+  }
+  return out;
+}
+
+struct bucket_str_noinstance {
+  const rgw_bucket& b;
+  explicit bucket_str_noinstance(const rgw_bucket& b) : b(b) {}
+};
+// Format as "[tenant/]name", omitting the bucket instance id.
+std::ostream& operator<<(std::ostream& out, const bucket_str_noinstance& rhs) {
+  const auto& bucket = rhs.b;
+  if (!bucket.tenant.empty()) {
+    out << bucket.tenant << '/';
+  }
+  return out << bucket.name;
+}
+
+struct bucket_shard_str {
+  const rgw_bucket_shard& bs;
+  explicit bucket_shard_str(const rgw_bucket_shard& bs) : bs(bs) {}
+};
+// Format as "<bucket>[:<shard_id>]"; a negative shard id means "no shard"
+// and is omitted.
+std::ostream& operator<<(std::ostream& out, const bucket_shard_str& rhs) {
+  const auto& shard = rhs.bs;
+  out << bucket_str{shard.bucket};
+  if (shard.shard_id >= 0) {
+    out << ':' << shard.shard_id;
+  }
+  return out;
+}
+
+// A bucket's info together with its xattrs, as loaded from the bucket index.
+struct all_bucket_info {
+  RGWBucketInfo bucket_info;
+  map<string, bufferlist> attrs;
+};
+
+// One endpoint (source or destination) of a sync pipe: a zone plus, when
+// known, the full bucket info.  Bucket info may be filled in lazily via
+// update_empty_bucket_info() once it has been fetched.
+struct rgw_sync_pipe_info_entity
+{
+private:
+  RGWBucketInfo bucket_info;
+  map<string, bufferlist> bucket_attrs;
+  bool _has_bucket_info{false};
+
+public:
+  rgw_zone_id zone;
+
+  rgw_sync_pipe_info_entity() {}
+  rgw_sync_pipe_info_entity(const rgw_sync_bucket_entity& e,
+                            std::optional<all_bucket_info>& binfo) {
+    if (e.zone) {
+      zone = *e.zone;
+    }
+    if (!e.bucket) {
+      return;
+    }
+    // keep only the bucket key if the provided info is absent or for a
+    // different bucket; full info can be backfilled later
+    if (!binfo ||
+        binfo->bucket_info.bucket != *e.bucket) {
+      bucket_info.bucket = *e.bucket;
+    } else {
+      set_bucket_info(*binfo);
+    }
+  }
+
+  // Backfill bucket info/attrs from a lookup table, if we only have the key.
+  void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
+    if (_has_bucket_info) {
+      return;
+    }
+    if (bucket_info.bucket.name.empty()) {
+      return;
+    }
+
+    auto iter = buckets_info.find(bucket_info.bucket);
+    if (iter == buckets_info.end()) {
+      return;
+    }
+
+    set_bucket_info(iter->second);
+  }
+
+  bool has_bucket_info() const {
+    return _has_bucket_info;
+  }
+
+  void set_bucket_info(const all_bucket_info& all_info) {
+    bucket_info = all_info.bucket_info;
+    bucket_attrs = all_info.attrs;
+    _has_bucket_info = true;
+  }
+
+  const RGWBucketInfo& get_bucket_info() const {
+    return bucket_info;
+  }
+
+  const rgw_bucket& get_bucket() const {
+    return bucket_info.bucket;
+  }
+
+  // NOTE(review): the zone comparison is inverted relative to the bucket
+  // comparison, yielding descending order by zone and ascending by bucket.
+  // It is still a valid strict weak ordering (only relative order in sets
+  // is affected) — confirm the direction is intentional.
+  bool operator<(const rgw_sync_pipe_info_entity& e) const {
+    if (zone < e.zone) {
+      return false;
+    }
+    if (zone > e.zone) {
+      return true;
+    }
+    return (bucket_info.bucket < e.bucket_info.bucket);
+  }
+};
+
+// Print an entity as "<zone>:<bucket key>".
+std::ostream& operator<<(std::ostream& out, const rgw_sync_pipe_info_entity& e) {
+  const auto& bucket = e.get_bucket_info().bucket;
+  return out << e.zone << ":" << bucket.get_key();
+}
+
+// A resolved sync pipe: the flow-manager handler plus its source and target
+// endpoints (with bucket info attached when available).
+struct rgw_sync_pipe_handler_info {
+  RGWBucketSyncFlowManager::pipe_handler handler;
+  rgw_sync_pipe_info_entity source;
+  rgw_sync_pipe_info_entity target;
+
+  rgw_sync_pipe_handler_info() {}
+  rgw_sync_pipe_handler_info(const RGWBucketSyncFlowManager::pipe_handler& _handler,
+                     std::optional<all_bucket_info> source_bucket_info,
+                     std::optional<all_bucket_info> target_bucket_info) : handler(_handler),
+                                                                          source(handler.source, source_bucket_info),
+                                                                          target(handler.dest, target_bucket_info) {
+  }
+
+  // Lexicographic order: by source endpoint, then by target endpoint.
+  bool operator<(const rgw_sync_pipe_handler_info& p) const {
+    if (source < p.source) {
+      return true;
+    }
+    if (p.source < source) {
+      return false;
+    }
+    return (target < p.target);
+  }
+
+  // Backfill bucket info on both endpoints from a lookup table.
+  void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
+    source.update_empty_bucket_info(buckets_info);
+    target.update_empty_bucket_info(buckets_info);
+  }
+};
+
+// Print a pipe as "<source>><target>" (source endpoint, '>', target endpoint).
+std::ostream& operator<<(std::ostream& out, const rgw_sync_pipe_handler_info& p) {
+  return out << p.source << ">" << p.target;
+}
+
+// An ordered set of resolved sync pipes, with a thin container-like API.
+struct rgw_sync_pipe_info_set {
+  std::set<rgw_sync_pipe_handler_info> handlers;
+
+  using iterator = std::set<rgw_sync_pipe_handler_info>::iterator;
+
+  void clear() {
+    handlers.clear();
+  }
+
+  void insert(const RGWBucketSyncFlowManager::pipe_handler& handler,
+              std::optional<all_bucket_info>& source_bucket_info,
+              std::optional<all_bucket_info>& target_bucket_info) {
+    rgw_sync_pipe_handler_info p(handler, source_bucket_info, target_bucket_info);
+    handlers.insert(p);
+  }
+
+  iterator begin() {
+    return handlers.begin();
+  }
+
+  iterator end() {
+    return handlers.end();
+  }
+
+  size_t size() const {
+    return handlers.size();
+  }
+
+  bool empty() const {
+    return handlers.empty();
+  }
+
+  // Backfill bucket info across all pipes.  Set elements are immutable, so
+  // each handler is deliberately copied (by-value loop variable), updated,
+  // and inserted into a replacement set.
+  void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
+    if (buckets_info.empty()) {
+      return;
+    }
+
+    std::set<rgw_sync_pipe_handler_info> p;
+
+    for (auto pipe : handlers) {
+      pipe.update_empty_bucket_info(buckets_info);
+      p.insert(pipe);
+    }
+
+    handlers = std::move(p);
+  }
+};
+
+// Coroutine that runs bucket sync for every source pipe feeding a given
+// bucket shard.  Declaration only: the constructor and operate() are defined
+// elsewhere in this file.
+class RGWRunBucketSourcesSyncCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;  // keeps the sync lease alive
+
+  rgw_sync_pipe_info_set pipes;             // resolved source->target pipes
+  rgw_sync_pipe_info_set::iterator siter;
+
+  rgw_bucket_sync_pair_info sync_pair;
+
+  RGWSyncTraceNodeRef tn;
+  ceph::real_time* progress;                // out: overall progress (min of shards)
+  std::vector<ceph::real_time> shard_progress;
+  std::vector<ceph::real_time>::iterator cur_shard_progress;
+
+  RGWRESTConn *conn{nullptr};
+  rgw_zone_id last_zone;                    // cached to avoid re-resolving conn
+
+  std::optional<uint64_t> gen;              // bucket log generation to sync
+  rgw_bucket_index_marker_info marker_info;
+  BucketIndexShardsManager marker_mgr;
+
+public:
+  RGWRunBucketSourcesSyncCR(RGWDataSyncCtx *_sc,
+                            boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+                            const rgw_bucket_shard& source_bs,
+                            const RGWSyncTraceNodeRef& _tn_parent,
+                           std::optional<uint64_t> gen,
+                            ceph::real_time* progress);
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Sync a single datalog entry (one bucket shard at one generation).
+// Concurrent obligations for the same shard are coalesced through the
+// shared rgw::bucket_sync::Handle: only one instance runs the sync loop,
+// while later arrivals either overwrite the pending obligation (newer
+// timestamp) or cancel themselves. Failures are recorded in the error
+// repo for retry; completed retries are removed from it.
+class RGWDataSyncSingleEntryCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw::bucket_sync::Handle state; // cached bucket-shard state
+  rgw_data_sync_obligation obligation; // input obligation
+  std::optional<rgw_data_sync_obligation> complete; // obligation to complete
+  uint32_t obligation_counter = 0; // snapshot of state->counter, detects new obligations
+  RGWDataSyncShardMarkerTrack *marker_tracker; // may be null (full-sync path passes nullptr)
+  rgw_raw_obj error_repo;
+  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+  RGWSyncTraceNodeRef tn;
+
+  ceph::real_time progress;
+  int sync_status = 0;
+public:
+  RGWDataSyncSingleEntryCR(RGWDataSyncCtx *_sc, rgw::bucket_sync::Handle state,
+                           rgw_data_sync_obligation _obligation,
+                           RGWDataSyncShardMarkerTrack *_marker_tracker,
+                           const rgw_raw_obj& error_repo,
+                           boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+                           const RGWSyncTraceNodeRef& _tn_parent)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      state(std::move(state)), obligation(std::move(_obligation)),
+      marker_tracker(_marker_tracker), error_repo(error_repo),
+      lease_cr(std::move(lease_cr)) {
+    set_description() << "data sync single entry (source_zone=" << sc->source_zone << ") " << obligation;
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", to_string(obligation.bs, obligation.gen));
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      if (state->obligation) {
+        // this is already syncing in another DataSyncSingleEntryCR
+        if (state->obligation->timestamp < obligation.timestamp) {
+          // cancel existing obligation and overwrite it
+          tn->log(10, SSTR("canceling existing obligation " << *state->obligation));
+          complete = std::move(*state->obligation);
+          *state->obligation = std::move(obligation);
+          state->counter++;
+        } else {
+          // cancel new obligation
+          tn->log(10, SSTR("canceling new obligation " << obligation));
+          complete = std::move(obligation);
+        }
+      } else {
+        // start syncing a new obligation
+        state->obligation = obligation;
+        obligation_counter = state->counter;
+        state->counter++;
+
+        // loop until the latest obligation is satisfied, because other callers
+        // may update the obligation while we're syncing
+        while ((state->obligation->timestamp == ceph::real_time() ||
+                state->progress_timestamp < state->obligation->timestamp) &&
+               obligation_counter != state->counter) {
+          obligation_counter = state->counter;
+          progress = ceph::real_time{};
+
+          ldout(cct, 4) << "starting sync on " << bucket_shard_str{state->key.first}
+              << ' ' << *state->obligation << " progress timestamp " << state->progress_timestamp
+              << " progress " << progress << dendl;
+          yield call(new RGWRunBucketSourcesSyncCR(sc, lease_cr,
+                                                   state->key.first, tn,
+                                                   state->obligation->gen,
+                                                  &progress));
+          if (retcode < 0) {
+            break;
+          }
+          state->progress_timestamp = std::max(progress, state->progress_timestamp);
+        }
+        // any new obligations will process themselves
+        complete = std::move(*state->obligation);
+        state->obligation.reset();
+
+        tn->log(10, SSTR("sync finished on " << bucket_shard_str{state->key.first}
+                         << " progress=" << progress << ' ' << complete << " r=" << retcode));
+      }
+      sync_status = retcode;
+
+      if (sync_status == -ENOENT) {
+        // this was added when 'tenant/' was added to datalog entries, because
+        // preexisting tenant buckets could never sync and would stay in the
+        // error_repo forever
+        tn->log(0, SSTR("WARNING: skipping data log entry for missing bucket " << complete->bs));
+        sync_status = 0;
+      }
+
+      if (sync_status < 0) {
+        // write actual sync failures for 'radosgw-admin sync error list'
+        if (sync_status != -EBUSY && sync_status != -EAGAIN) {
+          yield call(sync_env->error_logger->log_error_cr(dpp, sc->conn->get_remote_id(), "data",
+                                                          to_string(complete->bs, complete->gen),
+                                                          -sync_status, string("failed to sync bucket instance: ") + cpp_strerror(-sync_status)));
+          if (retcode < 0) {
+            tn->log(0, SSTR("ERROR: failed to log sync failure: retcode=" << retcode));
+          }
+        }
+        if (complete->timestamp != ceph::real_time{}) {
+          tn->log(10, SSTR("writing " << *complete << " to error repo for retry"));
+          yield call(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
+                                              rgw::error_repo::encode_key(complete->bs, complete->gen),
+                                              complete->timestamp));
+          if (retcode < 0) {
+            tn->log(0, SSTR("ERROR: failed to log sync failure in error repo: retcode=" << retcode));
+          }
+        }
+      } else if (complete->retry) {
+        // this entry came from the error repo and succeeded; drop it
+        yield call(rgw::error_repo::remove_cr(sync_env->driver->svc()->rados, error_repo,
+                                              rgw::error_repo::encode_key(complete->bs, complete->gen),
+                                              complete->timestamp));
+        if (retcode < 0) {
+          tn->log(0, SSTR("ERROR: failed to remove omap key from error repo ("
+             << error_repo << " retcode=" << retcode));
+        }
+      }
+      /* FIXME: what do do in case of error */
+      if (marker_tracker && !complete->marker.empty()) {
+        /* update marker */
+        yield call(marker_tracker->finish(complete->marker));
+      }
+      // surface the most recent retcode (e.g. from the marker update)
+      // if no earlier error was recorded
+      if (sync_status == 0) {
+        sync_status = retcode;
+      }
+      if (sync_status < 0) {
+        return set_cr_error(sync_status);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+rgw_raw_obj datalog_oid_for_error_repo(RGWDataSyncCtx *sc, rgw::sal::RadosStore* driver,
+                                      rgw_pool& pool, rgw_bucket_shard& bs) {
+  int datalog_shard = driver->svc()->datalog_rados->choose_oid(bs);
+  string oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, datalog_shard);
+  return rgw_raw_obj(pool, oid + ".retry");
+  }
+
+// Expand a legacy error-repo entry that carries no generation (a "full
+// sync this bucket" obligation) into per-shard, per-generation retry
+// entries across every generation reported by the remote, then remove
+// the original obligation from the error repo.
+class RGWDataIncrementalSyncFullObligationCR: public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw_bucket_shard source_bs;
+  rgw_raw_obj error_repo;
+  std::string error_marker; // omap key of the original obligation
+  ceph::real_time timestamp;
+  RGWSyncTraceNodeRef tn;
+  rgw_bucket_index_marker_info remote_info;
+  rgw_pool pool;
+  uint32_t sid;
+  rgw_bucket_shard bs;
+  std::vector<store_gen_shards>::const_iterator each;
+
+public:
+  RGWDataIncrementalSyncFullObligationCR(RGWDataSyncCtx *_sc, rgw_bucket_shard& _source_bs,
+                                         const rgw_raw_obj& error_repo, const std::string& _error_marker,
+                                         ceph::real_time& _timestamp, RGWSyncTraceNodeRef& _tn)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), source_bs(_source_bs),
+      error_repo(error_repo), error_marker(_error_marker), timestamp(_timestamp),
+      tn(sync_env->sync_tracer->add_node(_tn, "error_repo", SSTR(bucket_shard_str(source_bs))))
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, source_bs.bucket, &remote_info));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+
+      each = remote_info.generations.cbegin();
+      for (; each != remote_info.generations.cend(); each++) {
+        for (sid = 0; sid < each->num_shards; sid++) {
+          bs.bucket = source_bs.bucket;
+          bs.shard_id = sid;
+          // NOTE(review): the repo oid is derived from source_bs while the
+          // omap key below encodes bs, and `pool` is never assigned in this
+          // class (default-constructed) -- confirm both against the
+          // intended error-repo layout.
+          error_repo = datalog_oid_for_error_repo(sc, sync_env->driver, pool, source_bs);
+          tn->log(10, SSTR("writing shard_id " << sid << " of gen " << each->gen << " to error repo for retry"));
+          yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
+                            rgw::error_repo::encode_key(bs, each->gen),
+                            timestamp), cct->_conf->rgw_data_sync_spawn_window,
+                            [&](uint64_t stack_id, int ret) {
+                              if (ret < 0) {
+                                retcode = ret;
+                              }
+                              return 0;
+                            });
+        }
+      }
+      drain_all_cb([&](uint64_t stack_id, int ret) {
+                   if (ret < 0) {
+                     tn->log(10, SSTR("writing to error repo returned error: " << ret));
+                   }
+                   return ret;
+                 });
+
+      // once everything succeeds, remove the full sync obligation from the error repo
+      yield call(rgw::error_repo::remove_cr(sync_env->driver->svc()->rados, error_repo,
+                                            error_marker, timestamp));
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+RGWCoroutine* data_sync_single_entry(RGWDataSyncCtx *sc, const rgw_bucket_shard& src,
+                                std::optional<uint64_t> gen,
+                                const std::string marker,
+                                ceph::real_time timestamp,
+                                boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+                                boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache,
+                                RGWDataSyncShardMarkerTrack* marker_tracker,
+                                rgw_raw_obj error_repo,
+                                RGWSyncTraceNodeRef& tn,
+                                bool retry) {
+  auto state = bucket_shard_cache->get(src, gen);
+  auto obligation = rgw_data_sync_obligation{src, gen, marker, timestamp, retry};
+  return new RGWDataSyncSingleEntryCR(sc, std::move(state), std::move(obligation),
+                                      &*marker_tracker, error_repo,
+                                      lease_cr.get(), tn);
+}
+
+static ceph::real_time timestamp_for_bucket_shard(rgw::sal::RadosStore* driver,
+                                                const rgw_data_sync_status& sync_status,
+                                                const rgw_bucket_shard& bs) {
+  int datalog_shard = driver->svc()->datalog_rados->choose_oid(bs);
+  auto status = sync_status.sync_markers.find(datalog_shard);
+  if (status == sync_status.sync_markers.end()) {
+    return ceph::real_clock::zero();
+  }
+  return status->second.timestamp;
+}
+
+// Full-sync a single bucket: read the remote bucket index log info, then
+// sync every shard of every generation. The first shard of the oldest
+// generation is synced inline (waited on) before the rest are spawned in
+// parallel; any failure routes the affected shard into the error repo of
+// its corresponding datalog shard for later retry.
+class RGWDataFullSyncSingleEntryCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw_pool pool;
+  rgw_bucket_shard source_bs;
+  const std::string key; // full-sync index omap key for this bucket
+  rgw_data_sync_status sync_status;
+  rgw_raw_obj error_repo;
+  ceph::real_time timestamp;
+  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+  boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache;
+  RGWDataSyncShardMarkerTrack* marker_tracker;
+  RGWSyncTraceNodeRef tn;
+  rgw_bucket_index_marker_info remote_info;
+  uint32_t sid;
+  std::vector<store_gen_shards>::iterator each;
+  uint64_t i{0};
+  RGWCoroutine* shard_cr = nullptr;
+  bool first_shard = true;
+  bool error_inject; // fault injection via rgw_sync_data_full_inject_err_probability
+
+public:
+  RGWDataFullSyncSingleEntryCR(RGWDataSyncCtx *_sc, const rgw_pool& _pool, const rgw_bucket_shard& _source_bs,
+                      const std::string& _key, const rgw_data_sync_status& sync_status, const rgw_raw_obj& _error_repo,
+                      ceph::real_time _timestamp, boost::intrusive_ptr<const RGWContinuousLeaseCR> _lease_cr,
+                      boost::intrusive_ptr<rgw::bucket_sync::Cache> _bucket_shard_cache,
+                      RGWDataSyncShardMarkerTrack* _marker_tracker,
+                      RGWSyncTraceNodeRef& _tn)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), pool(_pool), source_bs(_source_bs), key(_key),
+      error_repo(_error_repo), timestamp(_timestamp), lease_cr(std::move(_lease_cr)),
+      bucket_shard_cache(_bucket_shard_cache), marker_tracker(_marker_tracker), tn(_tn) {
+        error_inject = (sync_env->cct->_conf->rgw_sync_data_full_inject_err_probability > 0);
+      }
+
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      if (error_inject &&
+          rand() % 10000 < cct->_conf->rgw_sync_data_full_inject_err_probability * 10000.0) {
+        tn->log(0, SSTR("injecting read bilog info error on key=" << key));
+        retcode = -ENOENT;
+      } else {
+        tn->log(0, SSTR("read bilog info key=" << key));
+        yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, source_bs.bucket, &remote_info));
+      }
+
+      if (retcode < 0) {
+        tn->log(10, SSTR("full sync: failed to read remote bucket info. Writing "
+                        << source_bs.shard_id << " to error repo for retry"));
+        yield call(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
+                                            rgw::error_repo::encode_key(source_bs, std::nullopt),
+                                            timestamp));
+        if (retcode < 0) {
+          tn->log(0, SSTR("ERROR: failed to log " << source_bs.shard_id << " in error repo: retcode=" << retcode));
+        }
+        yield call(marker_tracker->finish(key));
+        return set_cr_error(retcode);
+      }
+
+      //wait to sync the first shard of the oldest generation and then sync all other shards.
+      //if any of the operations fail at any time, write them into error repo for later retry.
+
+      each = remote_info.generations.begin();
+      for (; each != remote_info.generations.end(); each++) {
+        for (sid = 0; sid < each->num_shards; sid++) {
+          source_bs.shard_id = sid;
+          // use the error repo and sync status timestamp from the datalog shard corresponding to source_bs
+          error_repo = datalog_oid_for_error_repo(sc, sync_env->driver, pool, source_bs);
+          timestamp = timestamp_for_bucket_shard(sync_env->driver, sync_status, source_bs);
+          // retcode is sticky: once any earlier shard failed, the
+          // remaining shards go straight to the error repo for retry
+          if (retcode < 0) {
+            tn->log(10, SSTR("Write " << source_bs.shard_id << " to error repo for retry"));
+            yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
+                rgw::error_repo::encode_key(source_bs, each->gen),
+                timestamp), cct->_conf->rgw_data_sync_spawn_window, std::nullopt);
+          } else {
+          shard_cr = data_sync_single_entry(sc, source_bs, each->gen, key, timestamp,
+                      lease_cr, bucket_shard_cache, nullptr, error_repo, tn, false);
+          tn->log(10, SSTR("full sync: syncing shard_id " << sid << " of gen " << each->gen));
+          if (first_shard) {
+            yield call(shard_cr);
+            first_shard = false;
+          } else {
+            yield_spawn_window(shard_cr, cct->_conf->rgw_data_sync_spawn_window,
+                              [&](uint64_t stack_id, int ret) {
+                                if (ret < 0) {
+                                  retcode = ret;
+                                }
+                                return retcode;
+                                });
+            }
+          }
+        }
+        drain_all_cb([&](uint64_t stack_id, int ret) {
+                if (ret < 0) {
+                  retcode = ret;
+                }
+                return retcode;
+              });
+      }
+
+      // mark the full-sync index entry done regardless of outcome;
+      // failed shards were queued in the error repo above
+      yield call(marker_tracker->finish(key));
+
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// State and helpers shared by the full-sync and incremental-sync
+// per-datalog-shard coroutines. The reference members alias state owned
+// by the spawning coroutine and must outlive this one.
+class RGWDataBaseSyncShardCR : public RGWCoroutine {
+protected:
+  RGWDataSyncCtx *const sc;
+  const rgw_pool& pool;
+  const uint32_t shard_id;
+  rgw_data_sync_marker& sync_marker; // caller-owned; updated in place as we progress
+  RGWSyncTraceNodeRef tn;
+  const string& status_oid;
+  const rgw_raw_obj& error_repo;
+  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+  const rgw_data_sync_status& sync_status;
+  boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache;
+
+  std::optional<RGWDataSyncShardMarkerTrack> marker_tracker;
+  RGWRadosGetOmapValsCR::ResultPtr omapvals;
+  rgw_bucket_shard source_bs;
+
+  // Decode a datalog entry key into bucket + shard id; returns <0 on a
+  // malformed key.
+  int parse_bucket_key(const std::string& key, rgw_bucket_shard& bs) const {
+    return rgw_bucket_parse_bucket_key(sc->env->cct, key,
+                                       &bs.bucket, &bs.shard_id);
+  }
+
+  RGWDataBaseSyncShardCR(
+    RGWDataSyncCtx *const _sc, const rgw_pool& pool, const uint32_t shard_id,
+    rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
+    const string& status_oid, const rgw_raw_obj& error_repo,
+    boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+    const rgw_data_sync_status& sync_status,
+    const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache)
+    : RGWCoroutine(_sc->cct), sc(_sc), pool(pool), shard_id(shard_id),
+      sync_marker(sync_marker), tn(tn), status_oid(status_oid),
+      error_repo(error_repo), lease_cr(std::move(lease_cr)),
+      sync_status(sync_status), bucket_shard_cache(bucket_shard_cache) {}
+};
+
+// Full-sync pass over one datalog shard: page through the omap entries
+// of the shard's full-sync index object (OMAP_GET_MAX_ENTRIES at a
+// time), spawn a RGWDataFullSyncSingleEntryCR per bucket entry, then
+// persist the transition of the shard marker to incremental sync and
+// remove the index object.
+class RGWDataFullSyncShardCR : public RGWDataBaseSyncShardCR {
+  static constexpr auto OMAP_GET_MAX_ENTRIES = 100;
+
+  string oid; // full-sync index object for this shard
+  uint64_t total_entries = 0;
+  ceph::real_time entry_timestamp;
+  std::map<std::string, bufferlist> entries;
+  std::map<std::string, bufferlist>::iterator iter;
+  string error_marker;
+
+public:
+
+  RGWDataFullSyncShardCR(
+    RGWDataSyncCtx *const sc, const rgw_pool& pool, const uint32_t shard_id,
+    rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
+    const string& status_oid, const rgw_raw_obj& error_repo,
+    boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+    const rgw_data_sync_status& sync_status,
+    const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache)
+    : RGWDataBaseSyncShardCR(sc, pool, shard_id, sync_marker, tn,
+                            status_oid, error_repo, std::move(lease_cr),
+                            sync_status, bucket_shard_cache) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      tn->log(10, "start full sync");
+      oid = full_data_sync_index_shard_oid(sc->source_zone, shard_id);
+      marker_tracker.emplace(sc, status_oid, sync_marker, tn);
+      total_entries = sync_marker.pos;
+      entry_timestamp = sync_marker.timestamp; // time when full sync started
+      do {
+        // abort promptly if another gateway took over the lease
+        if (!lease_cr->is_locked()) {
+          drain_all();
+          tn->log(1, "lease is lost, abort");
+          return set_cr_error(-ECANCELED);
+        }
+        omapvals = std::make_shared<RGWRadosGetOmapValsCR::Result>();
+        yield call(new RGWRadosGetOmapValsCR(sc->env->driver,
+                                            rgw_raw_obj(pool, oid),
+                                             sync_marker.marker,
+                                            OMAP_GET_MAX_ENTRIES, omapvals));
+        if (retcode < 0) {
+          drain_all();
+          return set_cr_error(retcode);
+        }
+        entries = std::move(omapvals->entries);
+        if (entries.size() > 0) {
+          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+        }
+        tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync"));
+        iter = entries.begin();
+        for (; iter != entries.end(); ++iter) {
+          retcode = parse_bucket_key(iter->first, source_bs);
+          if (retcode < 0) {
+            // unparseable key: advance past it so it doesn't block the shard
+            tn->log(1, SSTR("failed to parse bucket shard: " << iter->first));
+            marker_tracker->try_update_high_marker(iter->first, 0,
+                                                  entry_timestamp);
+            continue;
+          }
+          tn->log(20, SSTR("full sync: " << iter->first));
+          total_entries++;
+          if (!marker_tracker->start(iter->first, total_entries,
+                                    entry_timestamp)) {
+            tn->log(0, SSTR("ERROR: cannot start syncing " << iter->first
+                           << ". Duplicate entry?"));
+          } else {
+            tn->log(10, SSTR("timestamp for " << iter->first << " is :" << entry_timestamp));
+            yield_spawn_window(new RGWDataFullSyncSingleEntryCR(
+                                sc, pool, source_bs, iter->first, sync_status,
+                                error_repo, entry_timestamp, lease_cr,
+                                bucket_shard_cache, &*marker_tracker, tn),
+                              cct->_conf->rgw_data_sync_spawn_window,
+                              std::nullopt);
+          }
+         sync_marker.marker = iter->first;
+        }
+      } while (omapvals->more);
+      omapvals.reset();
+
+      drain_all();
+
+      tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+
+      /* update marker to reflect we're done with full sync */
+      sync_marker.state = rgw_data_sync_marker::IncrementalSync;
+      sync_marker.marker = sync_marker.next_step_marker;
+      sync_marker.next_step_marker.clear();
+      yield call(new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(
+             sc->env->dpp,sc->env->async_rados, sc->env->svc->sysobj,
+             rgw_raw_obj(pool, status_oid), sync_marker));
+      if (retcode < 0) {
+        tn->log(0, SSTR("ERROR: failed to set sync marker: retcode=" << retcode));
+        return set_cr_error(retcode);
+      }
+
+      // clean up full sync index, ignoring errors
+      yield call(new RGWRadosRemoveCR(sc->env->driver, {pool, oid}));
+
+      // transition to incremental sync
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Incremental-sync loop for one datalog shard. Each pass merges
+// out-of-band change notifications, retries entries from the error repo
+// (with a retry_backoff_secs backoff between full scans), and tails the
+// remote datalog shard, spawning a single-entry sync per parsed bucket
+// shard. Loops forever; exits only on lease loss or a read error.
+class RGWDataIncSyncShardCR : public RGWDataBaseSyncShardCR {
+  static constexpr int max_error_entries = 10;
+  static constexpr uint32_t retry_backoff_secs = 60;
+
+  ceph::mutex& inc_lock; // guards modified_shards (shared with notifier)
+  bc::flat_set<rgw_data_notify_entry>& modified_shards;
+
+  bc::flat_set<rgw_data_notify_entry> current_modified;
+  decltype(current_modified)::iterator modified_iter;
+
+  ceph::coarse_real_time error_retry_time;
+  string error_marker;
+  std::map<std::string, bufferlist> error_entries;
+  decltype(error_entries)::iterator iter;
+  ceph::real_time entry_timestamp;
+  std::optional<uint64_t> gen;
+
+  string next_marker;
+  vector<rgw_data_change_log_entry> log_entries;
+  decltype(log_entries)::iterator log_iter;
+  bool truncated = false;
+
+  // Idle wait between polls: the configured poll interval, shortened if
+  // the error-repo retry deadline falls sooner.
+  utime_t get_idle_interval() const {
+    ceph::timespan interval = std::chrono::seconds(cct->_conf->rgw_data_sync_poll_interval);
+    if (!ceph::coarse_real_clock::is_zero(error_retry_time)) {
+      auto now = ceph::coarse_real_clock::now();
+      if (error_retry_time > now) {
+        auto d = error_retry_time - now;
+        if (interval > d) {
+          interval = d;
+        }
+      }
+    }
+    // convert timespan -> time_point -> utime_t
+    return utime_t(ceph::coarse_real_clock::zero() + interval);
+  }
+
+
+public:
+
+  RGWDataIncSyncShardCR(
+    RGWDataSyncCtx *const sc, const rgw_pool& pool, const uint32_t shard_id,
+    rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
+    const string& status_oid, const rgw_raw_obj& error_repo,
+    boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+    const rgw_data_sync_status& sync_status,
+    const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache,
+    ceph::mutex& inc_lock,
+    bc::flat_set<rgw_data_notify_entry>& modified_shards)
+    : RGWDataBaseSyncShardCR(sc, pool, shard_id, sync_marker, tn,
+                            status_oid, error_repo, std::move(lease_cr),
+                            sync_status, bucket_shard_cache),
+      inc_lock(inc_lock), modified_shards(modified_shards) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      tn->log(10, "start incremental sync");
+      marker_tracker.emplace(sc, status_oid, sync_marker, tn);
+      do {
+        if (!lease_cr->is_locked()) {
+          drain_all();
+          tn->log(1, "lease is lost, abort");
+          return set_cr_error(-ECANCELED);
+        }
+       {
+         // take a private snapshot of the notified shards under the lock
+         current_modified.clear();
+         std::unique_lock il(inc_lock);
+         current_modified.swap(modified_shards);
+         il.unlock();
+       }
+
+        if (current_modified.size() > 0) {
+          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+        }
+        /* process out of band updates */
+        for (modified_iter = current_modified.begin();
+            modified_iter != current_modified.end();
+            ++modified_iter) {
+          retcode = parse_bucket_key(modified_iter->key, source_bs);
+          if (retcode < 0) {
+            tn->log(1, SSTR("failed to parse bucket shard: "
+                           << modified_iter->key));
+           continue;
+          }
+          tn->log(20, SSTR("received async update notification: "
+                          << modified_iter->key));
+          spawn(data_sync_single_entry(sc, source_bs, modified_iter->gen, {},
+                                      ceph::real_time{}, lease_cr,
+                                      bucket_shard_cache, &*marker_tracker,
+                                      error_repo, tn, false), false);
+       }
+
+        if (error_retry_time <= ceph::coarse_real_clock::now()) {
+          /* process bucket shards that previously failed */
+          omapvals = std::make_shared<RGWRadosGetOmapValsCR::Result>();
+          yield call(new RGWRadosGetOmapValsCR(sc->env->driver, error_repo,
+                                               error_marker, max_error_entries,
+                                              omapvals));
+          error_entries = std::move(omapvals->entries);
+          tn->log(20, SSTR("read error repo, got " << error_entries.size()
+                          << " entries"));
+          iter = error_entries.begin();
+          for (; iter != error_entries.end(); ++iter) {
+            error_marker = iter->first;
+            entry_timestamp = rgw::error_repo::decode_value(iter->second);
+            retcode = rgw::error_repo::decode_key(iter->first, source_bs, gen);
+            if (retcode == -EINVAL) {
+              // backward compatibility for string keys that don't encode a gen
+              retcode = parse_bucket_key(error_marker, source_bs);
+            }
+            if (retcode < 0) {
+              // entry is unparseable by either scheme; drop it from the repo
+              tn->log(1, SSTR("failed to parse bucket shard: " << error_marker));
+              spawn(rgw::error_repo::remove_cr(sc->env->driver->svc()->rados,
+                                              error_repo, error_marker,
+                                              entry_timestamp),
+                   false);
+              continue;
+            }
+            tn->log(10, SSTR("gen is " << gen));
+            if (!gen) {
+              // write all full sync obligations for the bucket to error repo
+              spawn(new RGWDataIncrementalSyncFullObligationCR(sc, source_bs,
+                     error_repo, error_marker, entry_timestamp, tn), false);
+            } else {
+              tn->log(20, SSTR("handle error entry key="
+                              << to_string(source_bs, gen)
+                              << " timestamp=" << entry_timestamp));
+              spawn(data_sync_single_entry(sc, source_bs, gen, "",
+                                          entry_timestamp, lease_cr,
+                                          bucket_shard_cache, &*marker_tracker,
+                                          error_repo, tn, true), false);
+            }
+          }
+          if (!omapvals->more) {
+            // finished a full scan of the error repo; back off before rescanning
+            error_retry_time = ceph::coarse_real_clock::now() +
+             make_timespan(retry_backoff_secs);
+            error_marker.clear();
+          }
+        }
+        omapvals.reset();
+
+        tn->log(20, SSTR("shard_id=" << shard_id << " sync_marker="
+                        << sync_marker.marker));
+        yield call(new RGWReadRemoteDataLogShardCR(sc, shard_id,
+                                                  sync_marker.marker,
+                                                   &next_marker, &log_entries,
+                                                  &truncated));
+        if (retcode < 0 && retcode != -ENOENT) {
+          tn->log(0, SSTR("ERROR: failed to read remote data log info: ret="
+                         << retcode));
+          drain_all();
+          return set_cr_error(retcode);
+        }
+
+        if (log_entries.size() > 0) {
+          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+        }
+
+        for (log_iter = log_entries.begin();
+            log_iter != log_entries.end();
+            ++log_iter) {
+          tn->log(20, SSTR("shard_id=" << shard_id << " log_entry: "
+                          << log_iter->log_id << ":" << log_iter->log_timestamp
+                          << ":" << log_iter->entry.key));
+          retcode = parse_bucket_key(log_iter->entry.key, source_bs);
+          if (retcode < 0) {
+            tn->log(1, SSTR("failed to parse bucket shard: "
+                           << log_iter->entry.key));
+            marker_tracker->try_update_high_marker(log_iter->log_id, 0,
+                                                  log_iter->log_timestamp);
+            continue;
+          }
+          if (!marker_tracker->start(log_iter->log_id, 0,
+                                    log_iter->log_timestamp)) {
+            tn->log(0, SSTR("ERROR: cannot start syncing " << log_iter->log_id
+                           << ". Duplicate entry?"));
+          } else {
+            tn->log(1, SSTR("incremental sync on " << log_iter->entry.key
+                           << "shard: " << shard_id << "on gen "
+                           << log_iter->entry.gen));
+            yield_spawn_window(
+             data_sync_single_entry(sc, source_bs,log_iter->entry.gen,
+                                    log_iter->log_id, log_iter->log_timestamp,
+                                    lease_cr,bucket_shard_cache,
+                                    &*marker_tracker, error_repo, tn, false),
+             cct->_conf->rgw_data_sync_spawn_window, std::nullopt);
+          }
+        }
+
+        tn->log(20, SSTR("shard_id=" << shard_id <<
+                        " sync_marker="<< sync_marker.marker
+                        << " next_marker=" << next_marker
+                        << " truncated=" << truncated));
+        if (!next_marker.empty()) {
+          sync_marker.marker = next_marker;
+        } else if (!log_entries.empty()) {
+          sync_marker.marker = log_entries.back().log_id;
+        }
+        if (!truncated) {
+          // we reached the end, wait a while before checking for more
+          tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+         yield wait(get_idle_interval());
+       }
+      } while (true);
+    }
+    return 0;
+  }
+};
+
/* Coroutine that syncs a single data-log shard from one source zone.
 * It takes a continuous lease on the shard's status object, then loops
 * dispatching to the full-sync or incremental-sync sub-coroutine based on
 * the persisted marker state, until a sub-coroutine fails. */
class RGWDataSyncShardCR : public RGWCoroutine {
  RGWDataSyncCtx *const sc;
  const rgw_pool pool;
  const uint32_t shard_id;
  rgw_data_sync_marker& sync_marker;  // caller-owned; persists shard position/state
  rgw_data_sync_status sync_status;
  const RGWSyncTraceNodeRef tn;
  bool *reset_backoff;  // set to true once the lease is acquired

  // guards modified_shards against concurrent append_modified_shards()
  ceph::mutex inc_lock = ceph::make_mutex("RGWDataSyncShardCR::inc_lock");
  ceph::condition_variable inc_cond;

  RGWDataSyncEnv *const sync_env{ sc->env };

  const string status_oid{ RGWDataSyncStatusManager::shard_obj_name(
      sc->source_zone, shard_id) };
  // entries that fail to sync are parked in this ".retry" object
  const rgw_raw_obj error_repo{ pool, status_oid + ".retry" };

  // target number of entries to cache before recycling idle ones
  static constexpr size_t target_cache_size = 256;
  boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache {
    rgw::bucket_sync::Cache::create(target_cache_size) };

  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;

  // bucket shards reported as modified via remote change notifications
  bc::flat_set<rgw_data_notify_entry> modified_shards;

public:
  RGWDataSyncShardCR(RGWDataSyncCtx* const _sc, const rgw_pool& pool,
                     const uint32_t shard_id, rgw_data_sync_marker& marker,
                     const rgw_data_sync_status& sync_status,
                     RGWSyncTraceNodeRef& tn, bool *reset_backoff)
    : RGWCoroutine(_sc->cct), sc(_sc), pool(pool), shard_id(shard_id),
      sync_marker(marker), sync_status(sync_status), tn(tn),
      reset_backoff(reset_backoff) {
    set_description() << "data sync shard source_zone=" << sc->source_zone
                     << " shard_id=" << shard_id;
  }

  ~RGWDataSyncShardCR() override {
    // make sure the lease renewal coroutine stops if we go away first
    if (lease_cr) {
      lease_cr->abort();
    }
  }

  // Called from the notification path: merge newly-modified bucket shards
  // for the incremental pass to pick up.
  void append_modified_shards(bc::flat_set<rgw_data_notify_entry>& entries) {
    std::lock_guard l{inc_lock};
    modified_shards.insert(entries.begin(), entries.end());
  }

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      yield init_lease_cr();
      // sleep until the lease is held; abort early if acquisition failed
      while (!lease_cr->is_locked()) {
        if (lease_cr->is_done()) {
          tn->log(5, "failed to take lease");
          set_status("lease lock failed, early abort");
          drain_all();
          return set_cr_error(lease_cr->get_ret_status());
        }
        set_sleeping(true);
        yield;
      }
      *reset_backoff = true;
      tn->log(10, "took lease");

      // dispatch on persisted state; the sub-coroutines update
      // sync_marker.state, so full sync flows into incremental here
      while (true) {
       if (sync_marker.state == rgw_data_sync_marker::FullSync) {
         yield call(new RGWDataFullSyncShardCR(sc, pool, shard_id,
                                               sync_marker, tn,
                                               status_oid, error_repo,
                                               lease_cr, sync_status,
                                               bucket_shard_cache));
         if (retcode < 0) {
           if (retcode != -EBUSY) {
             tn->log(10, SSTR("full sync failed (retcode=" << retcode << ")"));
           }
           lease_cr->go_down();
           drain_all();
           return set_cr_error(retcode);
         }
       } else if (sync_marker.state == rgw_data_sync_marker::IncrementalSync) {
         yield call(new RGWDataIncSyncShardCR(sc, pool, shard_id,
                                              sync_marker, tn,
                                              status_oid, error_repo,
                                              lease_cr, sync_status,
                                              bucket_shard_cache,
                                              inc_lock, modified_shards));
         if (retcode < 0) {
           if (retcode != -EBUSY) {
             tn->log(10, SSTR("incremental sync failed (retcode=" << retcode
                              << ")"));
           }
           lease_cr->go_down();
           drain_all();
           return set_cr_error(retcode);
         }
       } else {
         // unknown marker state: treat as corruption
         lease_cr->go_down();
         drain_all();
         return set_cr_error(-EIO);
       }
      }
    }
    return 0;
  }

  // (Re)spawn the continuous lease coroutine over the shard status object;
  // any previous lease is aborted first.
  void init_lease_cr() {
    set_status("acquiring sync lock");
    uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
    string lock_name = "sync_lock";
    if (lease_cr) {
      lease_cr->abort();
    }
    auto driver = sync_env->driver;
    lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, driver,
                                            rgw_raw_obj(pool, status_oid),
                                            lock_name, lock_duration, this));
    lease_stack.reset(spawn(lease_cr.get(), false));
  }
};
+
+class RGWDataSyncShardControlCR : public RGWBackoffControlCR {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  rgw_pool pool;
+
+  uint32_t shard_id;
+  rgw_data_sync_marker sync_marker;
+  rgw_data_sync_status sync_status;
+
+  RGWSyncTraceNodeRef tn;
+public:
+  RGWDataSyncShardControlCR(RGWDataSyncCtx *_sc, const rgw_pool& _pool,
+                    uint32_t _shard_id, rgw_data_sync_marker& _marker, const rgw_data_sync_status& sync_status,
+                     RGWSyncTraceNodeRef& _tn_parent) : RGWBackoffControlCR(_sc->cct, false),
+                                                      sc(_sc), sync_env(_sc->env),
+                                                     pool(_pool),
+                                                     shard_id(_shard_id),
+                                                     sync_marker(_marker) {
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "shard", std::to_string(shard_id));
+  }
+
+  RGWCoroutine *alloc_cr() override {
+    return new RGWDataSyncShardCR(sc, pool, shard_id, sync_marker, sync_status, tn, backoff_ptr());
+  }
+
+  RGWCoroutine *alloc_finisher_cr() override {
+    return new RGWSimpleRadosReadCR<rgw_data_sync_marker>(sync_env->dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                                                          rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id)),
+                                                          &sync_marker);
+  }
+
+  void append_modified_shards(bc::flat_set<rgw_data_notify_entry>& keys) {
+    std::lock_guard l{cr_lock()};
+
+    RGWDataSyncShardCR *cr = static_cast<RGWDataSyncShardCR *>(get_cr());
+    if (!cr) {
+      return;
+    }
+
+    cr->append_modified_shards(keys);
+  }
+};
+
/* Top-level data sync coroutine for one source zone: reads (or initializes)
 * the data sync status, builds full-sync maps if needed, then spawns one
 * RGWDataSyncShardControlCR per data-log shard. */
class RGWDataSyncCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  uint32_t num_shards;

  rgw_data_sync_status sync_status;

  // guards shard_crs; wakeup() is called from notification threads
  ceph::mutex shard_crs_lock =
    ceph::make_mutex("RGWDataSyncCR::shard_crs_lock");
  map<int, RGWDataSyncShardControlCR *> shard_crs;

  bool *reset_backoff;

  RGWSyncTraceNodeRef tn;

  RGWDataSyncModule *data_sync_module{nullptr};
public:
  RGWDataSyncCR(RGWDataSyncCtx *_sc, uint32_t _num_shards, RGWSyncTraceNodeRef& _tn, bool *_reset_backoff) : RGWCoroutine(_sc->cct),
                                                      sc(_sc), sync_env(_sc->env),
                                                      num_shards(_num_shards),
                                                      reset_backoff(_reset_backoff), tn(_tn) {

  }

  ~RGWDataSyncCR() override {
    // drop the extra ref taken when each shard control CR was spawned
    for (auto iter : shard_crs) {
      iter.second->put();
    }
  }

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {

      /* read sync status */
      yield call(new RGWReadDataSyncStatusCoroutine(sc, &sync_status));

      data_sync_module = sync_env->sync_module->get_data_handler();

      // retcode here is still the result of the status read above;
      // -ENOENT just means sync was never initialized
      if (retcode < 0 && retcode != -ENOENT) {
        tn->log(0, SSTR("ERROR: failed to fetch sync status, retcode=" << retcode));
        return set_cr_error(retcode);
      }

      /* state: init status */
      if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateInit) {
        tn->log(20, SSTR("init"));
        sync_status.sync_info.num_shards = num_shards;
        uint64_t instance_id;
        instance_id = ceph::util::generate_random_number<uint64_t>();
        yield call(new RGWInitDataSyncStatusCoroutine(sc, num_shards, instance_id, tn, &sync_status));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to init sync, retcode=" << retcode));
          return set_cr_error(retcode);
        }
        // sets state = StateBuildingFullSyncMaps

        *reset_backoff = true;
      }

      data_sync_module->init(sc, sync_status.sync_info.instance_id);

      if  ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateBuildingFullSyncMaps) {
        tn->log(10, SSTR("building full sync maps"));
        /* call sync module init here */
        sync_status.sync_info.num_shards = num_shards;
        yield call(data_sync_module->init_sync(dpp, sc));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: sync module init_sync() failed, retcode=" << retcode));
          return set_cr_error(retcode);
        }
        /* state: building full sync maps */
        yield call(new RGWListBucketIndexesCR(sc, &sync_status));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to build full sync maps, retcode=" << retcode));
          return set_cr_error(retcode);
        }
        sync_status.sync_info.state = rgw_data_sync_info::StateSync;

        /* update new state */
        yield call(set_sync_info_cr());
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to write sync status, retcode=" << retcode));
          return set_cr_error(retcode);
        }

        *reset_backoff = true;
      }

      yield call(data_sync_module->start_sync(dpp, sc));
      if (retcode < 0) {
        tn->log(0, SSTR("ERROR: failed to start sync, retcode=" << retcode));
        return set_cr_error(retcode);
      }

      // spawn one detached control CR per shard; each keeps its own backoff
      yield {
        if  ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateSync) {
          tn->log(10, SSTR("spawning " << num_shards << " shards sync"));
          for (map<uint32_t, rgw_data_sync_marker>::iterator iter = sync_status.sync_markers.begin();
               iter != sync_status.sync_markers.end(); ++iter) {
            RGWDataSyncShardControlCR *cr = new RGWDataSyncShardControlCR(sc, sync_env->svc->zone->get_zone_params().log_pool,
                                                                          iter->first, iter->second, sync_status, tn);
            cr->get();  // extra ref held via shard_crs; released in dtor
            shard_crs_lock.lock();
            shard_crs[iter->first] = cr;
            shard_crs_lock.unlock();
            spawn(cr, true);
          }
        }
      }

      return set_cr_done();
    }
    return 0;
  }

  // Persist sync_status.sync_info to the zone's log pool status object.
  RGWCoroutine *set_sync_info_cr() {
    return new RGWSimpleRadosWriteCR<rgw_data_sync_info>(sync_env->dpp, sync_env->async_rados, sync_env->svc->sysobj,
                                                         rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone)),
                                                         sync_status.sync_info);
  }

  // Route a change notification to the matching shard's control CR.
  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
    std::lock_guard l{shard_crs_lock};
    map<int, RGWDataSyncShardControlCR *>::iterator iter = shard_crs.find(shard_id);
    if (iter == shard_crs.end()) {
      return;
    }
    iter->second->append_modified_shards(entries);
    iter->second->wakeup();
  }
};
+
/* Data handler for the default sync module: replicates object writes,
 * removals and delete markers from the source zone as-is. Definitions of
 * the overrides follow later in this file. */
class RGWDefaultDataSyncModule : public RGWDataSyncModule {
public:
  RGWDefaultDataSyncModule() {}

  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace) override;
  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
};
+
/* Sync module instance for plain zone-to-zone replication; owns the
 * default data handler. */
class RGWDefaultSyncModuleInstance : public RGWSyncModuleInstance {
  RGWDefaultDataSyncModule data_handler;
public:
  RGWDefaultSyncModuleInstance() {}
  RGWDataSyncModule *get_data_handler() override {
    return &data_handler;
  }
  // the default module stores regular objects, so direct user writes to
  // the target zone are permitted
  bool supports_user_writes() override {
    return true;
  }
};
+
+int RGWDefaultSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance)
+{
+  instance->reset(new RGWDefaultSyncModuleInstance());
+  return 0;
+}
+
/* Loads a user's info, identity and ACL asynchronously (via init_cr()),
 * then answers bucket/object permission questions for that user through
 * the nested Bucket helper. The _info block is shared between the handler
 * and the async Init action so results survive the coroutine. */
class RGWUserPermHandler {
  friend struct Init;
  friend class Bucket;

  RGWDataSyncEnv *sync_env;
  rgw_user uid;

  struct _info {
    RGWUserInfo user_info;
    rgw::IAM::Environment env;
    std::unique_ptr<rgw::auth::Identity> identity;
    RGWAccessControlPolicy user_acl;
  };

  std::shared_ptr<_info> info;

  struct Init;

  std::shared_ptr<Init> init_action;

  // Async action executed by RGWGenericAsyncCR: fetch user info, build an
  // identity, and decode the user ACL from the user's attributes.
  struct Init : public RGWGenericAsyncCR::Action {
    RGWDataSyncEnv *sync_env;

    rgw_user uid;
    std::shared_ptr<RGWUserPermHandler::_info> info;

    int ret{0};

    Init(RGWUserPermHandler *handler) : sync_env(handler->sync_env),
                                        uid(handler->uid),
                                        info(handler->info) {}
    int operate() override {
      auto user_ctl = sync_env->driver->getRados()->ctl.user;

      ret = user_ctl->get_info_by_uid(sync_env->dpp, uid, &info->user_info, null_yield);
      if (ret < 0) {
        return ret;
      }

      info->identity = rgw::auth::transform_old_authinfo(sync_env->cct,
                                                         uid,
                                                         RGW_PERM_FULL_CONTROL,
                                                         false, /* system_request? */
                                                         TYPE_RGW);

      map<string, bufferlist> uattrs;

      // missing attrs or missing ACL attr (-ENOENT) falls back to a
      // default ACL for the user
      ret = user_ctl->get_attrs_by_uid(sync_env->dpp, uid, &uattrs, null_yield);
      if (ret == 0) {
        ret = RGWUserPermHandler::policy_from_attrs(sync_env->cct, uattrs, &info->user_acl);
      }
      if (ret == -ENOENT) {
        info->user_acl.create_default(uid, info->user_info.display_name);
      }

      // NOTE(review): errors other than -ENOENT from the attr/ACL path
      // (e.g. -EIO from a corrupt ACL) are swallowed here and 0 is
      // returned — presumably deliberate best-effort, but worth confirming
      return 0;
    }
  };

public:
  RGWUserPermHandler(RGWDataSyncEnv *_sync_env,
                     const rgw_user& _uid) : sync_env(_sync_env),
                                             uid(_uid) {}

  // Kick off the async user-info load; must complete (via call()) before
  // init_bucket()/Bucket helpers are used.
  RGWCoroutine *init_cr() {
    info = make_shared<_info>();
    init_action = make_shared<Init>(this);

    return new RGWGenericAsyncCR(sync_env->cct,
                                 sync_env->async_rados,
                                 init_action);
  }

  // Per-bucket permission context built on top of the loaded user info.
  class Bucket {
    RGWDataSyncEnv *sync_env;
    std::shared_ptr<_info> info;
    RGWAccessControlPolicy bucket_acl;
    std::optional<perm_state> ps;
  public:
    Bucket() {}

    int init(RGWUserPermHandler *handler,
             const RGWBucketInfo& bucket_info,
             const map<string, bufferlist>& bucket_attrs);

    bool verify_bucket_permission(int perm);
    bool verify_object_permission(const map<string, bufferlist>& obj_attrs,
                                  int perm);
  };

  // Decode an RGWAccessControlPolicy from an attribute map.
  // Returns -ENOENT if no ACL attribute exists, -EIO on decode failure.
  static int policy_from_attrs(CephContext *cct,
                               const map<string, bufferlist>& attrs,
                               RGWAccessControlPolicy *acl) {
    acl->set_ctx(cct);

    auto aiter = attrs.find(RGW_ATTR_ACL);
    if (aiter == attrs.end()) {
      return -ENOENT;
    }
    auto iter = aiter->second.begin();
    try {
      acl->decode(iter);
    } catch (buffer::error& err) {
      ldout(cct, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
      return -EIO;
    }

    return 0;
  }

  int init_bucket(const RGWBucketInfo& bucket_info,
                  const map<string, bufferlist>& bucket_attrs,
                  Bucket *bs) {
    return bs->init(this, bucket_info, bucket_attrs);
  }
};
+
+int RGWUserPermHandler::Bucket::init(RGWUserPermHandler *handler,
+                                     const RGWBucketInfo& bucket_info,
+                                     const map<string, bufferlist>& bucket_attrs)
+{
+  sync_env = handler->sync_env;
+  info = handler->info;
+
+  int r = RGWUserPermHandler::policy_from_attrs(sync_env->cct, bucket_attrs, &bucket_acl);
+  if (r < 0) {
+    return r;
+  }
+
+  ps.emplace(sync_env->cct,
+             info->env,
+             info->identity.get(),
+             bucket_info,
+             info->identity->get_perm_mask(),
+             false, /* defer to bucket acls */
+             nullptr, /* referer */
+             false); /* request_payer */
+
+  return 0;
+}
+
+bool RGWUserPermHandler::Bucket::verify_bucket_permission(int perm)
+{
+  return verify_bucket_permission_no_policy(sync_env->dpp,
+                                            &(*ps),
+                                            &info->user_acl,
+                                            &bucket_acl,
+                                            perm);
+}
+
+bool RGWUserPermHandler::Bucket::verify_object_permission(const map<string, bufferlist>& obj_attrs,
+                                                          int perm)
+{
+  RGWAccessControlPolicy obj_acl;
+
+  int r = policy_from_attrs(sync_env->cct, obj_attrs, &obj_acl);
+  if (r < 0) {
+    return r;
+  }
+
+  return verify_bucket_permission_no_policy(sync_env->dpp,
+                                            &(*ps),
+                                            &bucket_acl,
+                                            &obj_acl,
+                                            perm);
+}
+
/* Fetch filter applied while pulling a remote object through a sync pipe:
 * enforces the pipe's ACL-translation, user-mode permission and storage
 * class parameters. Sets *need_retry when pipe params raced with a
 * concurrent object rewrite so the caller can redo the fetch. */
class RGWFetchObjFilter_Sync : public RGWFetchObjFilter_Default {
  rgw_bucket_sync_pipe sync_pipe;

  // source-bucket permission context for MODE_USER pipes
  std::shared_ptr<RGWUserPermHandler::Bucket> bucket_perms;
  // dest params resolved before the fetch; re-checked in filter() to
  // detect races with configuration/object changes
  std::optional<rgw_sync_pipe_dest_params> verify_dest_params;

  std::optional<ceph::real_time> mtime;
  std::optional<string> etag;
  std::optional<uint64_t> obj_size;

  std::unique_ptr<rgw::auth::Identity> identity;

  std::shared_ptr<bool> need_retry;

public:
  RGWFetchObjFilter_Sync(rgw_bucket_sync_pipe& _sync_pipe,
                         std::shared_ptr<RGWUserPermHandler::Bucket>& _bucket_perms,
                         std::optional<rgw_sync_pipe_dest_params>&& _verify_dest_params,
                         std::shared_ptr<bool>& _need_retry) : sync_pipe(_sync_pipe),
                                         bucket_perms(_bucket_perms),
                                         verify_dest_params(std::move(_verify_dest_params)),
                                         need_retry(_need_retry) {
    *need_retry = false;
  }

  int filter(CephContext *cct,
             const rgw_obj_key& source_key,
             const RGWBucketInfo& dest_bucket_info,
             std::optional<rgw_placement_rule> dest_placement_rule,
             const map<string, bufferlist>& obj_attrs,
             std::optional<rgw_user> *poverride_owner,
             const rgw_placement_rule **prule) override;
};
+
// Resolve the effective pipe params for this object (using its tags),
// detect param races, apply ACL-translation/permission/storage-class
// policy, then delegate to the default filter.
int RGWFetchObjFilter_Sync::filter(CephContext *cct,
                                   const rgw_obj_key& source_key,
                                   const RGWBucketInfo& dest_bucket_info,
                                   std::optional<rgw_placement_rule> dest_placement_rule,
                                   const map<string, bufferlist>& obj_attrs,
                                   std::optional<rgw_user> *poverride_owner,
                                   const rgw_placement_rule **prule)
{
  int abort_err = -ERR_PRECONDITION_FAILED;

  rgw_sync_pipe_params params;

  RGWObjTags obj_tags;

  // decode object tags (if present); pipe rules may match on them.
  // a decode failure is logged and treated as "no tags"
  auto iter = obj_attrs.find(RGW_ATTR_TAGS);
  if (iter != obj_attrs.end()) {
    try {
      auto it = iter->second.cbegin();
      obj_tags.decode(it);
    } catch (buffer::error &err) {
      ldout(cct, 0) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
    }
  }

  if (!sync_pipe.info.handler.find_obj_params(source_key,
                                              obj_tags.get_tags(),
                                              &params)) {
    return abort_err;
  }

  if (verify_dest_params &&
      !(*verify_dest_params == params.dest)) {
    /* raced! original dest params were different, will need to retry */
    ldout(cct, 0) << "WARNING: " << __func__ << ": pipe dest params are different than original params, must have raced with object rewrite, retrying" << dendl;
    *need_retry = true;
    return -ECANCELED;
  }

  std::optional<std::map<string, bufferlist> > new_attrs;

  if (params.dest.acl_translation) {
    rgw_user& acl_translation_owner = params.dest.acl_translation->owner;
    if (!acl_translation_owner.empty()) {
      // in user mode only the dest bucket owner may be the translated owner
      if (params.mode == rgw_sync_pipe_params::MODE_USER &&
          acl_translation_owner != dest_bucket_info.owner) {
        ldout(cct, 0) << "ERROR: " << __func__ << ": acl translation was requested, but user (" << acl_translation_owner
          << ") is not dest bucket owner (" << dest_bucket_info.owner << ")" << dendl;
        return -EPERM;
      }
      *poverride_owner = acl_translation_owner;
    }
  }
  if (params.mode == rgw_sync_pipe_params::MODE_USER) {
    if (!bucket_perms->verify_object_permission(obj_attrs, RGW_PERM_READ)) {
      ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to fetch object" << dendl;
      return -EPERM;
    }
  }

  // NOTE(review): dest_rule is not declared in this function or in the
  // visible RGWFetchObjFilter_Sync members — presumably a member of the
  // RGWFetchObjFilter_Default base (it must outlive this call since
  // *prule points at it); confirm against the base class declaration
  if (!dest_placement_rule &&
      params.dest.storage_class) {
    dest_rule.storage_class = *params.dest.storage_class;
    dest_rule.inherit_from(dest_bucket_info.placement_rule);
    dest_placement_rule = dest_rule;
    *prule = &dest_rule;
  }

  return RGWFetchObjFilter_Default::filter(cct,
                                           source_key,
                                           dest_bucket_info,
                                           dest_placement_rule,
                                           obj_attrs,
                                           poverride_owner,
                                           prule);
}
+
+class RGWObjFetchCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw_bucket_sync_pipe& sync_pipe;
+  rgw_obj_key& key;
+  std::optional<rgw_obj_key> dest_key;
+  std::optional<uint64_t> versioned_epoch;
+  rgw_zone_set *zones_trace;
+
+  bool need_more_info{false};
+  bool check_change{false};
+
+  ceph::real_time src_mtime;
+  uint64_t src_size;
+  string src_etag;
+  map<string, bufferlist> src_attrs;
+  map<string, string> src_headers;
+
+  std::optional<rgw_user> param_user;
+  rgw_sync_pipe_params::Mode param_mode;
+
+  std::optional<RGWUserPermHandler> user_perms;
+  std::shared_ptr<RGWUserPermHandler::Bucket> source_bucket_perms;
+  RGWUserPermHandler::Bucket dest_bucket_perms;
+
+  std::optional<rgw_sync_pipe_dest_params> dest_params;
+
+  int try_num{0};
+  std::shared_ptr<bool> need_retry;
+public:
+  RGWObjFetchCR(RGWDataSyncCtx *_sc,
+                rgw_bucket_sync_pipe& _sync_pipe,
+                rgw_obj_key& _key,
+                std::optional<rgw_obj_key> _dest_key,
+                std::optional<uint64_t> _versioned_epoch,
+                rgw_zone_set *_zones_trace) : RGWCoroutine(_sc->cct),
+                                              sc(_sc), sync_env(_sc->env),
+                                              sync_pipe(_sync_pipe),
+                                              key(_key),
+                                              dest_key(_dest_key),
+                                              versioned_epoch(_versioned_epoch),
+                                              zones_trace(_zones_trace) {
+  }
+
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+
+#define MAX_RACE_RETRIES_OBJ_FETCH 10
+      for (try_num = 0; try_num < MAX_RACE_RETRIES_OBJ_FETCH; ++try_num) {
+
+        {
+          std::optional<rgw_user> param_acl_translation;
+          std::optional<string> param_storage_class;
+
+          if (!sync_pipe.info.handler.find_basic_info_without_tags(key,
+                                                                   &param_user,
+                                                                   &param_acl_translation,
+                                                                   &param_storage_class,
+                                                                   &param_mode,
+                                                                   &need_more_info)) {
+            if (!need_more_info) {
+              return set_cr_error(-ERR_PRECONDITION_FAILED);
+            }
+          }
+        }
+
+        if (need_more_info) {
+          ldout(cct, 20) << "Could not determine exact policy rule for obj=" << key << ", will read source object attributes" << dendl;
+          /*
+           * we need to fetch info about source object, so that we can determine
+           * the correct policy configuration. This can happen if there are multiple
+           * policy rules, and some depend on the object tagging */
+          yield call(new RGWStatRemoteObjCR(sync_env->async_rados,
+                                            sync_env->driver,
+                                            sc->source_zone,
+                                            sync_pipe.info.source_bs.bucket,
+                                            key,
+                                            &src_mtime,
+                                            &src_size,
+                                            &src_etag,
+                                            &src_attrs,
+                                            &src_headers));
+          if (retcode < 0) {
+            return set_cr_error(retcode);
+          }
+
+          RGWObjTags obj_tags;
+
+          auto iter = src_attrs.find(RGW_ATTR_TAGS);
+          if (iter != src_attrs.end()) {
+            try {
+              auto it = iter->second.cbegin();
+              obj_tags.decode(it);
+            } catch (buffer::error &err) {
+              ldout(cct, 0) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
+            }
+          }
+
+          rgw_sync_pipe_params params;
+          if (!sync_pipe.info.handler.find_obj_params(key,
+                                                      obj_tags.get_tags(),
+                                                      &params)) {
+            return set_cr_error(-ERR_PRECONDITION_FAILED);
+          }
+
+          param_user = params.user;
+          param_mode = params.mode;
+
+          dest_params = params.dest;
+        }
+
+        if (param_mode == rgw_sync_pipe_params::MODE_USER) {
+          if (!param_user) {
+            ldout(cct, 20) << "ERROR: " << __func__ << ": user level sync but user param not set" << dendl;
+            return set_cr_error(-EPERM);
+          }
+          user_perms.emplace(sync_env, *param_user);
+
+          yield call(user_perms->init_cr());
+          if (retcode < 0) {
+            ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init user perms manager for uid=" << *param_user << dendl;
+            return set_cr_error(retcode);
+          }
+
+          /* verify that user is allowed to write at the target bucket */
+          int r = user_perms->init_bucket(sync_pipe.dest_bucket_info,
+                                          sync_pipe.dest_bucket_attrs,
+                                          &dest_bucket_perms);
+          if (r < 0) {
+            ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl;
+            return set_cr_error(retcode);
+          }
+
+          if (!dest_bucket_perms.verify_bucket_permission(RGW_PERM_WRITE)) {
+            ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to write into bucket (bucket=" << sync_pipe.info.dest_bucket.get_key() << ")" << dendl;
+            return -EPERM;
+          }
+
+          /* init source bucket permission structure */
+          source_bucket_perms = make_shared<RGWUserPermHandler::Bucket>();
+          r = user_perms->init_bucket(sync_pipe.source_bucket_info,
+                                      sync_pipe.source_bucket_attrs,
+                                      source_bucket_perms.get());
+          if (r < 0) {
+            ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl;
+            return set_cr_error(retcode);
+          }
+        }
+
+        yield {
+          if (!need_retry) {
+            need_retry = make_shared<bool>();
+          }
+          auto filter = make_shared<RGWFetchObjFilter_Sync>(sync_pipe,
+                                                            source_bucket_perms,
+                                                            std::move(dest_params),
+                                                            need_retry);
+
+          call(new RGWFetchRemoteObjCR(sync_env->async_rados, sync_env->driver, sc->source_zone,
+                                       nullopt,
+                                       sync_pipe.info.source_bs.bucket,
+                                       std::nullopt, sync_pipe.dest_bucket_info,
+                                       key, dest_key, versioned_epoch,
+                                       true,
+                                       std::static_pointer_cast<RGWFetchObjFilter>(filter),
+                                       zones_trace, sync_env->counters, dpp));
+        }
+        if (retcode < 0) {
+          if (*need_retry) {
+            continue;
+          }
+          return set_cr_error(retcode);
+        }
+
+        return set_cr_done();
+      }
+
+      ldout(cct, 0) << "ERROR: " << __func__ << ": Too many retries trying to fetch object, possibly a bug: bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << " key=" << key << dendl;
+
+      return set_cr_error(-EIO);
+    }
+    return 0;
+  }
+};
+
+RGWCoroutine *RGWDefaultDataSyncModule::sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace)
+{
+  return new RGWObjFetchCR(sc, sync_pipe, key, std::nullopt, versioned_epoch, zones_trace);
+}
+
+RGWCoroutine *RGWDefaultDataSyncModule::remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+                                                      real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+  auto sync_env = sc->env;
+  return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
+                            sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
+                            NULL, NULL, false, &mtime, zones_trace);
+}
+
+// Default sync: replicate a delete marker into the destination bucket.
+// Differs from remove_object() by passing the marker owner's id/display
+// name and the delete-marker flag (the `true` argument) to RGWRemoveObjCR.
+RGWCoroutine *RGWDefaultDataSyncModule::create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+                                                             rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+  auto sync_env = sc->env;
+  return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
+                            sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
+                            &owner.id, &owner.display_name, true, &mtime, zones_trace);
+}
+
+// Data handler for the "archive" sync module. Inherits the default module's
+// behavior but overrides object sync to force-version the destination and
+// overrides remove/delete-marker handling (see definitions below: removes
+// are ignored so the archive keeps every version).
+class RGWArchiveDataSyncModule : public RGWDefaultDataSyncModule {
+public:
+  RGWArchiveDataSyncModule() {}
+
+  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace) override;
+  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
+  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
+};
+
+// Per-zone instance of the archive sync module: plugs the archive data
+// handler in and swaps in archive-specific bucket metadata handlers.
+class RGWArchiveSyncModuleInstance : public RGWDefaultSyncModuleInstance {
+  RGWArchiveDataSyncModule data_handler;
+public:
+  RGWArchiveSyncModuleInstance() {}
+  RGWDataSyncModule *get_data_handler() override {
+    return &data_handler;
+  }
+  RGWMetadataHandler *alloc_bucket_meta_handler() override {
+    return RGWArchiveBucketMetaHandlerAllocator::alloc();
+  }
+  RGWBucketInstanceMetadataHandlerBase *alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver) override {
+    return RGWArchiveBucketInstanceMetaHandlerAllocator::alloc(driver);
+  }
+};
+
+// Factory hook: create an archive sync module instance. The JSON config is
+// accepted but not consulted here.
+int RGWArchiveSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance)
+{
+  instance->reset(new RGWArchiveSyncModuleInstance());
+  return 0;
+}
+
+// Archive sync: fetch the object, ensuring the destination (archive) bucket
+// is versioned first, and forcing a version instance on unversioned source
+// keys so every copy is preserved.
+RGWCoroutine *RGWArchiveDataSyncModule::sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace)
+{
+  auto sync_env = sc->env;
+  ldout(sc->cct, 5) << "SYNC_ARCHIVE: sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+  // lazily flip the destination bucket to versioned; this writes the bucket
+  // instance info synchronously (not via a coroutine)
+  if (!sync_pipe.dest_bucket_info.versioned() ||
+     (sync_pipe.dest_bucket_info.flags & BUCKET_VERSIONS_SUSPENDED)) {
+      ldout(sc->cct, 0) << "SYNC_ARCHIVE: sync_object: enabling object versioning for archive bucket" << dendl;
+      sync_pipe.dest_bucket_info.flags = (sync_pipe.dest_bucket_info.flags & ~BUCKET_VERSIONS_SUSPENDED) | BUCKET_VERSIONED;
+      int op_ret = sync_env->driver->getRados()->put_bucket_instance_info(sync_pipe.dest_bucket_info, false, real_time(), NULL, sync_env->dpp);
+      if (op_ret < 0) {
+         ldpp_dout(sync_env->dpp, 0) << "SYNC_ARCHIVE: sync_object: error versioning archive bucket" << dendl;
+         return NULL;
+      }
+  }
+
+  std::optional<rgw_obj_key> dest_key;
+
+  if (versioned_epoch.value_or(0) == 0) { /* force version if not set */
+    versioned_epoch = 0;
+    dest_key = key;
+    if (key.instance.empty()) {
+      // source object has no version id: generate a random instance name
+      // for the archived copy
+      sync_env->driver->getRados()->gen_rand_obj_instance_name(&(*dest_key));
+    }
+  }
+
+  return new RGWObjFetchCR(sc, sync_pipe, key, dest_key, versioned_epoch, zones_trace);
+}
+
+// Archive sync: object removals are deliberately ignored (return NULL, i.e.
+// no coroutine to run) so the archive zone retains all object versions.
+RGWCoroutine *RGWArchiveDataSyncModule::remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+                                                     real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+  ldout(sc->cct, 0) << "SYNC_ARCHIVE: remove_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch << dendl;
+  return NULL;
+}
+
+// Archive sync: delete markers ARE replicated (unlike plain removals above),
+// preserving the owner identity on the marker.
+RGWCoroutine *RGWArchiveDataSyncModule::create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+                                                            rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+  ldout(sc->cct, 0) << "SYNC_ARCHIVE: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+                                   << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+  auto sync_env = sc->env;
+  return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
+                            sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
+                            &owner.id, &owner.display_name, true, &mtime, zones_trace);
+}
+
+class RGWDataSyncControlCR : public RGWBackoffControlCR
+{
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  uint32_t num_shards;
+
+  RGWSyncTraceNodeRef tn;
+
+  static constexpr bool exit_on_error = false; // retry on all errors
+public:
+  RGWDataSyncControlCR(RGWDataSyncCtx *_sc, uint32_t _num_shards,
+                       RGWSyncTraceNodeRef& _tn_parent) : RGWBackoffControlCR(_sc->cct, exit_on_error),
+                                                          sc(_sc), sync_env(_sc->env), num_shards(_num_shards) {
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "sync");
+  }
+
+  RGWCoroutine *alloc_cr() override {
+    return new RGWDataSyncCR(sc, num_shards, tn, backoff_ptr());
+  }
+
+  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
+    ceph::mutex& m = cr_lock();
+
+    m.lock();
+    RGWDataSyncCR *cr = static_cast<RGWDataSyncCR *>(get_cr());
+    if (!cr) {
+      m.unlock();
+      return;
+    }
+
+    cr->get();
+    m.unlock();
+
+    if (cr) {
+      cr->wakeup(shard_id, entries);
+    }
+
+    cr->put();
+  }
+};
+
+// Relay a datalog notification to the active sync control coroutine, if one
+// is running. Takes the lock shared: data_sync_cr is only read here, and
+// RGWDataSyncControlCR::wakeup() does its own internal locking.
+void RGWRemoteDataLog::wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
+  std::shared_lock rl{lock};
+  if (!data_sync_cr) {
+    return;
+  }
+  data_sync_cr->wakeup(shard_id, entries);
+}
+
+// Run the full data sync state machine to completion (blocking). The extra
+// ref taken before run() keeps data_sync_cr alive for concurrent wakeup()
+// callers, since run() drops one ref when the coroutine finishes.
+int RGWRemoteDataLog::run_sync(const DoutPrefixProvider *dpp, int num_shards)
+{
+  lock.lock();
+  data_sync_cr = new RGWDataSyncControlCR(&sc, num_shards, tn);
+  data_sync_cr->get(); // run() will drop a ref, so take another
+  lock.unlock();
+
+  int r = run(dpp, data_sync_cr);
+
+  // drop our ref and clear the pointer under the lock so wakeup() never
+  // sees a stale coroutine
+  lock.lock();
+  data_sync_cr->put();
+  data_sync_cr = NULL;
+  lock.unlock();
+
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to run sync" << dendl;
+    return r;
+  }
+  return 0;
+}
+
+// DoutPrefixProvider requirement: expose the driver's CephContext.
+CephContext *RGWDataSyncStatusManager::get_cct() const
+{
+  return driver->ctx();
+}
+
+// Initialize sync state against the given source zone: resolve the zone
+// config, verify its tier type can export data, set up the REST connection,
+// error logger and remote log handle, then size the per-shard status
+// objects from the remote datalog info.
+// Returns 0 on success; -EIO/-ENOTSUP/-EINVAL or the underlying error code
+// on failure (partially-constructed state is released via finalize()).
+int RGWDataSyncStatusManager::init(const DoutPrefixProvider *dpp)
+{
+  RGWZone *zone_def;
+
+  if (!(zone_def = driver->svc()->zone->find_zone(source_zone))) {
+    ldpp_dout(this, 0) << "ERROR: failed to find zone config info for zone=" << source_zone << dendl;
+    return -EIO;
+  }
+
+  if (!driver->svc()->sync_modules->get_manager()->supports_data_export(zone_def->tier_type)) {
+    return -ENOTSUP;
+  }
+
+  const RGWZoneParams& zone_params = driver->svc()->zone->get_zone_params();
+
+  // caller may have injected a sync module (e.g. for testing); otherwise
+  // use the driver's configured one
+  if (sync_module == nullptr) {
+    sync_module = driver->get_sync_module();
+  }
+
+  conn = driver->svc()->zone->get_zone_conn(source_zone);
+  if (!conn) {
+    ldpp_dout(this, 0) << "connection object to zone " << source_zone << " does not exist" << dendl;
+    return -EINVAL;
+  }
+
+  error_logger = new RGWSyncErrorLogger(driver, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);
+
+  int r = source_log.init(source_zone, conn, error_logger, driver->getRados()->get_sync_tracer(),
+                          sync_module, counters);
+  if (r < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to init remote log, r=" << r << dendl;
+    finalize();
+    return r;
+  }
+
+  rgw_datalog_info datalog_info;
+  r = source_log.read_log_info(dpp, &datalog_info);
+  if (r < 0) {
+    ldpp_dout(this, 5) << "ERROR: master.read_log_info() returned r=" << r << dendl;
+    finalize();
+    return r;
+  }
+
+  // shard count is dictated by the remote datalog configuration
+  num_shards = datalog_info.num_shards;
+
+  for (int i = 0; i < num_shards; i++) {
+    shard_objs[i] = rgw_raw_obj(zone_params.log_pool, shard_obj_name(source_zone, i));
+  }
+
+  return 0;
+}
+
+// Release owned resources; safe to call multiple times (pointer is nulled).
+void RGWDataSyncStatusManager::finalize()
+{
+  delete error_logger;
+  error_logger = nullptr;
+}
+
+// DoutPrefixProvider requirement: log under this file's dout subsystem.
+unsigned RGWDataSyncStatusManager::get_subsys() const
+{
+  return dout_subsys;
+}
+
+// Log prefix: abbreviate the source zone id to its first 8 characters.
+std::ostream& RGWDataSyncStatusManager::gen_prefix(std::ostream& out) const
+{
+  auto zone = std::string_view{source_zone.id};
+  return out << "data sync zone:" << zone.substr(0, 8) << ' ';
+}
+
+// Build the rados oid holding the overall data sync status for a source
+// zone: "<datalog_sync_status_oid_prefix>.<zone-id>".
+string RGWDataSyncStatusManager::sync_status_oid(const rgw_zone_id& source_zone)
+{
+  // plain concatenation instead of snprintf into a variable-length stack
+  // array (VLAs are a non-standard C++ extension); output is identical
+  return datalog_sync_status_oid_prefix + "." + source_zone.id;
+}
+
+// Build the rados oid holding per-shard data sync status:
+// "<datalog_sync_status_shard_prefix>.<zone-id>.<shard-id>".
+string RGWDataSyncStatusManager::shard_obj_name(const rgw_zone_id& source_zone, int shard_id)
+{
+  // plain concatenation instead of snprintf into a variable-length stack
+  // array (VLAs are a non-standard C++ extension); output is identical
+  return datalog_sync_status_shard_prefix + "." + source_zone.id + "." + std::to_string(shard_id);
+}
+
+// Initialize one bucket shard's incremental sync status object: position the
+// incremental marker (at the remote max marker when full sync is skipped),
+// stamp it, mark the shard StateIncrementalSync and write the attrs out.
+class RGWInitBucketShardSyncStatusCoroutine : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  const rgw_bucket_sync_pair_info& sync_pair;
+  const string sync_status_oid;
+
+  rgw_bucket_shard_sync_info& status;
+  RGWObjVersionTracker& objv_tracker;
+  const BucketIndexShardsManager& marker_mgr;
+  bool exclusive;
+public:
+  RGWInitBucketShardSyncStatusCoroutine(RGWDataSyncCtx *_sc,
+                                        const rgw_bucket_sync_pair_info& _sync_pair,
+                                        rgw_bucket_shard_sync_info& _status,
+                                        uint64_t gen,
+                                        const BucketIndexShardsManager& _marker_mgr,
+                                        RGWObjVersionTracker& objv_tracker,
+                                        bool exclusive)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      sync_pair(_sync_pair),
+      sync_status_oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, _sync_pair, gen)),
+      status(_status), objv_tracker(objv_tracker), marker_mgr(_marker_mgr), exclusive(exclusive)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield {
+        rgw_raw_obj obj(sync_env->svc->zone->get_zone_params().log_pool, sync_status_oid);
+
+        // whether or not to do full sync, incremental sync will follow anyway
+        if (sync_env->sync_module->should_full_sync()) {
+          const auto max_marker = marker_mgr.get(sync_pair.source_bs.shard_id, "");
+          status.inc_marker.position = max_marker;
+        }
+        status.inc_marker.timestamp = ceph::real_clock::now();
+        status.state = rgw_bucket_shard_sync_info::StateIncrementalSync;
+
+        map<string, bufferlist> attrs;
+        status.encode_all_attrs(attrs);
+        call(new RGWSimpleRadosWriteAttrsCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                                            obj, attrs, &objv_tracker, exclusive));
+      }
+
+      if (retcode < 0) {
+        ldout(cct, 20) << "ERROR: init marker position failed. error: " << retcode << dendl;
+        return set_cr_error(retcode);
+      }
+      ldout(cct, 20) << "init marker position: " << status.inc_marker.position << 
+        ". written to shard status object: " << sync_status_oid << dendl;
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+#define BUCKET_SYNC_ATTR_PREFIX RGW_ATTR_PREFIX "bucket-sync."
+
+template <class T>
+static bool decode_attr(CephContext *cct, map<string, bufferlist>& attrs, const string& attr_name, T *val)
+{
+  map<string, bufferlist>::iterator iter = attrs.find(attr_name);
+  if (iter == attrs.end()) {
+    *val = T();
+    return false;
+  }
+
+  auto biter = iter->second.cbegin();
+  try {
+    decode(*val, biter);
+  } catch (buffer::error& err) {
+    ldout(cct, 0) << "ERROR: failed to decode attribute: " << attr_name << dendl;
+    return false;
+  }
+  return true;
+}
+
+// Populate this struct from rados xattrs. Prefers the prefixed attribute
+// names; falls back to the bare names for compatibility with status objects
+// written without the BUCKET_SYNC_ATTR_PREFIX.
+void rgw_bucket_shard_sync_info::decode_from_attrs(CephContext *cct, map<string, bufferlist>& attrs)
+{
+  if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "state", &state)) {
+    decode_attr(cct, attrs, "state", &state);
+  }
+  if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "inc_marker", &inc_marker)) {
+    decode_attr(cct, attrs, "inc_marker", &inc_marker);
+  }
+}
+
+// Serialize state and the incremental marker into the attr map for writing.
+void rgw_bucket_shard_sync_info::encode_all_attrs(map<string, bufferlist>& attrs)
+{
+  encode_state_attr(attrs);
+  inc_marker.encode_attr(attrs);
+}
+
+// Encode just the sync state under the prefixed attr name.
+void rgw_bucket_shard_sync_info::encode_state_attr(map<string, bufferlist>& attrs)
+{
+  using ceph::encode;
+  encode(state, attrs[BUCKET_SYNC_ATTR_PREFIX "state"]);
+}
+
+// Encode the full-sync marker under its prefixed attr name.
+void rgw_bucket_shard_full_sync_marker::encode_attr(map<string, bufferlist>& attrs)
+{
+  using ceph::encode;
+  encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "full_marker"]);
+}
+
+// Encode the incremental-sync marker under its prefixed attr name.
+void rgw_bucket_shard_inc_sync_marker::encode_attr(map<string, bufferlist>& attrs)
+{
+  using ceph::encode;
+  encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "inc_marker"]);
+}
+
+// Read one bucket shard's sync status object (attrs) and decode it into
+// *status. ENOENT is not an error: *status is reset to defaults.
+class RGWReadBucketPipeSyncStatusCoroutine : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  string oid;
+  rgw_bucket_shard_sync_info *status;
+  RGWObjVersionTracker* objv_tracker;
+  map<string, bufferlist> attrs;
+public:
+  RGWReadBucketPipeSyncStatusCoroutine(RGWDataSyncCtx *_sc,
+                                   const rgw_bucket_sync_pair_info& sync_pair,
+                                   rgw_bucket_shard_sync_info *_status,
+                                   RGWObjVersionTracker* objv_tracker,
+                                   uint64_t gen)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, gen)),
+      status(_status), objv_tracker(objv_tracker)
+  {}
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Fetch the status object's attrs and decode them; a missing object yields
+// a default-constructed (fresh) status rather than an error.
+int RGWReadBucketPipeSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    yield call(new RGWSimpleRadosReadAttrsCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                                             rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, oid),
+                                             &attrs, true, objv_tracker));
+    if (retcode == -ENOENT) {
+      *status = rgw_bucket_shard_sync_info();
+      return set_cr_done();
+    }
+    if (retcode < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to call fetch bucket shard info oid=" << oid << " ret=" << retcode << dendl;
+      return set_cr_error(retcode);
+    }
+    status->decode_from_attrs(sync_env->cct, attrs);
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// wrap ReadSyncStatus and set a flag if it's not in incremental
+class CheckBucketShardStatusIsIncremental : public RGWReadBucketPipeSyncStatusCoroutine {
+  bool* result;
+  rgw_bucket_shard_sync_info status;
+ public:
+  CheckBucketShardStatusIsIncremental(RGWDataSyncCtx* sc,
+                                      const rgw_bucket_sync_pair_info& sync_pair,
+                                      bool* result)
+    : RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &status, nullptr, 0 /*no gen in compat mode*/),
+      result(result)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    int r = RGWReadBucketPipeSyncStatusCoroutine::operate(dpp);
+    if (state == RGWCoroutine_Done &&
+        status.state != rgw_bucket_shard_sync_info::StateIncrementalSync) {
+      *result = false;
+    }
+    return r;
+  }
+};
+
+// Fan out CheckBucketShardStatusIsIncremental over all shards of a bucket,
+// clearing *result if any shard is not in incremental sync.
+class CheckAllBucketShardStatusIsIncremental : public RGWShardCollectCR {
+  // start with 1 shard, and only spawn more if we detect an existing shard.
+  // this makes the backward compatilibility check far less expensive in the
+  // general case where no shards exist
+  static constexpr int initial_concurrent_shards = 1;
+  static constexpr int max_concurrent_shards = 16;
+
+  RGWDataSyncCtx* sc;
+  rgw_bucket_sync_pair_info sync_pair;
+  const int num_shards;
+  bool* result;
+  int shard = 0;
+ public:
+  CheckAllBucketShardStatusIsIncremental(RGWDataSyncCtx* sc,
+                                         const rgw_bucket_sync_pair_info& sync_pair,
+                                         int num_shards, bool* result)
+    : RGWShardCollectCR(sc->cct, initial_concurrent_shards),
+      sc(sc), sync_pair(sync_pair), num_shards(num_shards), result(result)
+  {}
+
+  bool spawn_next() override {
+    // stop spawning if we saw any errors or non-incremental shards
+    if (shard >= num_shards || status < 0 || !*result) {
+      return false;
+    }
+    sync_pair.source_bs.shard_id = shard++;
+    spawn(new CheckBucketShardStatusIsIncremental(sc, sync_pair, result), false);
+    return true;
+  }
+
+ private:
+  int handle_result(int r) override {
+    if (r < 0) {
+      ldout(cct, 4) << "failed to read bucket shard status: "
+          << cpp_strerror(r) << dendl;
+    } else if (shard == 0) {
+      // enable concurrency once the first shard succeeds
+      max_concurrent = max_concurrent_shards;
+    }
+    return r;
+  }
+};
+
+// wrap InitBucketShardSyncStatus with local storage for 'status' and 'objv'
+// and a loop to retry on racing writes
+class InitBucketShardStatusCR : public RGWCoroutine {
+  RGWDataSyncCtx* sc;
+  rgw_bucket_sync_pair_info pair;
+  rgw_bucket_shard_sync_info status;
+  RGWObjVersionTracker objv;
+  const uint64_t gen;
+  const BucketIndexShardsManager& marker_mgr;
+
+ public:
+  InitBucketShardStatusCR(RGWDataSyncCtx* sc,
+                         const rgw_bucket_sync_pair_info& pair,
+                         uint64_t gen,
+                         const BucketIndexShardsManager& marker_mgr)
+    : RGWCoroutine(sc->cct), sc(sc), pair(pair), gen(gen), marker_mgr(marker_mgr)
+  {}
+  int operate(const DoutPrefixProvider *dpp) {
+    reenter(this) {
+      // non exclusive create with empty status
+      objv.generate_new_write_ver(cct);
+      yield call(new RGWInitBucketShardSyncStatusCoroutine(sc, pair, status, gen, marker_mgr, objv, false));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Fan out InitBucketShardStatusCR over every shard, up to
+// max_concurrent_shards at a time, stopping at the first error.
+class InitBucketShardStatusCollectCR : public RGWShardCollectCR {
+  static constexpr int max_concurrent_shards = 16;
+  RGWDataSyncCtx* sc;
+  rgw_bucket_sync_pair_info sync_pair;
+  const uint64_t gen;
+  const BucketIndexShardsManager& marker_mgr;
+
+  const int num_shards;
+  int shard = 0;
+
+  int handle_result(int r) override {
+    if (r < 0) {
+      ldout(cct, 4) << "failed to init bucket shard status: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  InitBucketShardStatusCollectCR(RGWDataSyncCtx* sc,
+                                 const rgw_bucket_sync_pair_info& sync_pair,
+                                 uint64_t gen,
+                                 const BucketIndexShardsManager& marker_mgr,
+                                 int num_shards)
+    : RGWShardCollectCR(sc->cct, max_concurrent_shards),
+      sc(sc), sync_pair(sync_pair), gen(gen), marker_mgr(marker_mgr), num_shards(num_shards)
+  {}
+
+  bool spawn_next() override {
+    if (shard >= num_shards || status < 0) { // stop spawning on any errors
+      return false;
+    }
+    sync_pair.source_bs.shard_id = shard++;
+    spawn(new InitBucketShardStatusCR(sc, sync_pair, gen, marker_mgr), false);
+    return true;
+  }
+};
+
+class RemoveBucketShardStatusCR : public RGWCoroutine {
+  RGWDataSyncCtx* const sc;
+  RGWDataSyncEnv* const sync_env;
+
+  rgw_bucket_sync_pair_info sync_pair;
+  rgw_raw_obj obj;
+  RGWObjVersionTracker objv;
+
+public:
+  RemoveBucketShardStatusCR(RGWDataSyncCtx* sc,
+                             const rgw_bucket_sync_pair_info& sync_pair, uint64_t gen)
+    : RGWCoroutine(sc->cct), sc(sc), sync_env(sc->env),
+      sync_pair(sync_pair),
+      obj(sync_env->svc->zone->get_zone_params().log_pool, 
+          RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, gen))
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield call(new RGWRadosRemoveCR(sync_env->driver, obj, &objv));
+                       if (retcode < 0 && retcode != -ENOENT) {
+        ldout(cct, 20) << "ERROR: failed to remove bucket shard status for: " << sync_pair << 
+          ". with error: " << retcode << dendl;
+        return set_cr_error(retcode);
+      }
+      ldout(cct, 20) << "removed bucket shard status object: " << obj.oid << dendl;
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Fan out RemoveBucketShardStatusCR over every shard. Unlike the init
+// collector above, spawning is not stopped on error (errors are only
+// logged in handle_result), so all shards are attempted.
+class RemoveBucketShardStatusCollectCR : public RGWShardCollectCR {
+  static constexpr int max_concurrent_shards = 16;
+  RGWDataSyncCtx* const sc;
+  RGWDataSyncEnv* const sync_env;
+  rgw_bucket_sync_pair_info sync_pair;
+  const uint64_t gen;
+
+  const int num_shards;
+  int shard = 0;
+
+  int handle_result(int r) override {
+    if (r < 0) {
+      ldout(cct, 4) << "failed to remove bucket shard status object: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  RemoveBucketShardStatusCollectCR(RGWDataSyncCtx* sc,
+                                 const rgw_bucket_sync_pair_info& sync_pair,
+                                 uint64_t gen,
+                                 int num_shards)
+    : RGWShardCollectCR(sc->cct, max_concurrent_shards),
+      sc(sc), sync_env(sc->env), sync_pair(sync_pair), gen(gen), num_shards(num_shards)
+  {}
+
+  bool spawn_next() override {
+    if (shard >= num_shards) {
+      return false;
+    }
+    sync_pair.source_bs.shard_id = shard++;
+    spawn(new RemoveBucketShardStatusCR(sc, sync_pair, gen),  false);
+    return true;
+  }
+};
+
+// Initialize the per-bucket (not per-shard) sync status object. Decides
+// whether the pipe can skip full sync: in compat mode, if a gen-0 InIndex
+// log exists and every legacy per-shard status is already incremental, the
+// bucket resumes incremental sync directly. Otherwise all shard statuses
+// are (re)initialized and the state is set to Full or Incremental depending
+// on the sync module's should_full_sync(). Finally the bucket status object
+// is written with the versioned-write tracker 'objv'.
+class InitBucketFullSyncStatusCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  const rgw_bucket_sync_pair_info& sync_pair;
+  const rgw_raw_obj& status_obj;
+  rgw_bucket_sync_status& status;
+  RGWObjVersionTracker& objv;
+  const RGWBucketInfo& source_info;
+  const bool check_compat;
+
+  const rgw_bucket_index_marker_info& info;
+  BucketIndexShardsManager marker_mgr;
+
+  bool all_incremental = true;
+  bool no_zero = false;
+
+public:
+  InitBucketFullSyncStatusCR(RGWDataSyncCtx* sc,
+                             const rgw_bucket_sync_pair_info& sync_pair,
+                             const rgw_raw_obj& status_obj,
+                             rgw_bucket_sync_status& status,
+                             RGWObjVersionTracker& objv,
+                            const RGWBucketInfo& source_info,
+                             bool check_compat,
+                             const rgw_bucket_index_marker_info& info)
+    : RGWCoroutine(sc->cct), sc(sc), sync_env(sc->env),
+      sync_pair(sync_pair), status_obj(status_obj),
+      status(status), objv(objv), source_info(source_info),
+      check_compat(check_compat), info(info)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      // parse the remote per-shard max markers (shard count comes from here)
+      retcode = marker_mgr.from_string(info.max_marker, -1);
+      if (retcode < 0) {
+        lderr(cct) << "failed to parse bilog shard markers: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      status.state = BucketSyncState::Init;
+
+      if (info.oldest_gen == 0) {
+       if (check_compat) {
+         // use shard count from our log gen=0
+         // try to convert existing per-shard incremental status for backward compatibility
+         if (source_info.layout.logs.empty() ||
+             source_info.layout.logs.front().gen > 0) {
+           ldpp_dout(dpp, 20) << "no generation zero when checking compatibility" << dendl;
+           no_zero = true;
+         } else if (auto& log = source_info.layout.logs.front();
+                     log.layout.type != rgw::BucketLogType::InIndex) {
+           ldpp_dout(dpp, 20) << "unrecognized log layout type when checking compatibility " << log.layout.type << dendl;
+           no_zero = true;
+         }
+         if (!no_zero) {
+           yield {
+             const int num_shards0 =
+               source_info.layout.logs.front().layout.in_index.layout.num_shards;
+             call(new CheckAllBucketShardStatusIsIncremental(sc, sync_pair,
+                                                             num_shards0,
+                                                             &all_incremental));
+           }
+           if (retcode < 0) {
+             return set_cr_error(retcode);
+           }
+           if (all_incremental) {
+             // we can use existing status and resume incremental sync
+             status.state = BucketSyncState::Incremental;
+           }
+         } else {
+           all_incremental = false;
+         }
+       }
+      }
+
+      if (status.state != BucketSyncState::Incremental) {
+       // initialize all shard sync status. this will populate the log marker
+        // positions where incremental sync will resume after full sync
+       yield {
+         const int num_shards = marker_mgr.get().size();
+         call(new InitBucketShardStatusCollectCR(sc, sync_pair, info.latest_gen, marker_mgr, num_shards));
+       }
+       if (retcode < 0) {
+          ldout(cct, 20) << "failed to init bucket shard status: "
+                        << cpp_strerror(retcode) << dendl;
+         return set_cr_error(retcode);
+        }
+
+        if (sync_env->sync_module->should_full_sync()) {
+          status.state = BucketSyncState::Full;
+        } else {
+          status.state = BucketSyncState::Incremental;
+        }
+      }
+
+      status.shards_done_with_gen.resize(marker_mgr.get().size());
+      status.incremental_gen = info.latest_gen;
+
+      ldout(cct, 20) << "writing bucket sync status during init. state=" << status.state << ". marker=" << status.full.position.to_str() << dendl;
+
+      // write bucket sync status
+      using CR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
+      yield call(new CR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                       status_obj, status, &objv, false));
+      if (retcode < 0) {
+        ldout(cct, 20) << "failed to write bucket shard status: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+#define OMAP_READ_MAX_ENTRIES 10
+// Collect the bucket shards recorded in a datalog shard's ".retry" omap
+// object, i.e. the shards currently being retried ("recovering"), reading
+// omap keys in pages of OMAP_READ_MAX_ENTRIES up to max_entries total.
+class RGWReadRecoveringBucketShardsCoroutine : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw::sal::RadosStore* driver;
+  
+  const int shard_id;
+  int max_entries;
+
+  set<string>& recovering_buckets;
+  string marker;
+  string error_oid;
+
+  RGWRadosGetOmapKeysCR::ResultPtr omapkeys;
+  set<string> error_entries;
+  int max_omap_entries;
+  int count;
+
+public:
+  RGWReadRecoveringBucketShardsCoroutine(RGWDataSyncCtx *_sc, const int _shard_id,
+                                      set<string>& _recovering_buckets, const int _max_entries) 
+  : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+  driver(sync_env->driver), shard_id(_shard_id), max_entries(_max_entries),
+  recovering_buckets(_recovering_buckets), max_omap_entries(OMAP_READ_MAX_ENTRIES)
+  {
+    // the retry log lives next to the shard status object, suffixed ".retry"
+    error_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id) + ".retry";
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Page through the ".retry" omap keys and accumulate them into
+// recovering_buckets. A missing retry object (ENOENT) just means nothing is
+// recovering.
+int RGWReadRecoveringBucketShardsCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this){
+    //read recovering bucket shards
+    count = 0;
+    do {
+      omapkeys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
+      yield call(new RGWRadosGetOmapKeysCR(driver, rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, error_oid),
+            marker, max_omap_entries, omapkeys));
+
+      if (retcode == -ENOENT) {
+        break;
+      }
+
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "failed to read recovering bucket shards with " 
+          << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      error_entries = std::move(omapkeys->entries);
+      if (error_entries.empty()) {
+        break;
+      }
+
+      // resume the next omap page after the last key seen
+      count += error_entries.size();
+      marker = *error_entries.rbegin();
+      recovering_buckets.insert(std::make_move_iterator(error_entries.begin()),
+                                std::make_move_iterator(error_entries.end()));
+    } while (omapkeys->more && count < max_entries);
+  
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+// Collect the bucket shards that still have pending entries in the remote
+// datalog shard, i.e. entries past this shard's current sync marker.
+class RGWReadPendingBucketShardsCoroutine : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw::sal::RadosStore* driver;
+
+  const int shard_id;
+  int max_entries;
+
+  set<string>& pending_buckets;
+  string marker;
+  string status_oid;
+
+  rgw_data_sync_marker* sync_marker;
+  int count;
+
+  std::string next_marker;
+  vector<rgw_data_change_log_entry> log_entries;
+  bool truncated;
+
+public:
+  RGWReadPendingBucketShardsCoroutine(RGWDataSyncCtx *_sc, const int _shard_id,
+                                      set<string>& _pending_buckets,
+                                      rgw_data_sync_marker* _sync_marker, const int _max_entries) 
+  : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+  driver(sync_env->driver), shard_id(_shard_id), max_entries(_max_entries),
+  pending_buckets(_pending_buckets), sync_marker(_sync_marker)
+  {
+    status_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id);
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Read this shard's local sync marker, then list the remote datalog from
+// that marker onwards, collecting the keys of entries not yet consumed.
+int RGWReadPendingBucketShardsCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this){
+    //read sync status marker
+    using CR = RGWSimpleRadosReadCR<rgw_data_sync_marker>;
+    yield call(new CR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                      rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, status_oid),
+                      sync_marker));
+    if (retcode < 0) {
+      ldpp_dout(dpp, 0) << "failed to read sync status marker with " 
+        << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+
+    //read pending bucket shards
+    marker = sync_marker->marker;
+    count = 0;
+    do{
+      yield call(new RGWReadRemoteDataLogShardCR(sc, shard_id, marker,
+                                                 &next_marker, &log_entries, &truncated));
+
+      if (retcode == -ENOENT) {
+        break;
+      }
+
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "failed to read remote data log info with " 
+          << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      if (log_entries.empty()) {
+        break;
+      }
+
+      count += log_entries.size();
+      for (const auto& entry : log_entries) {
+        pending_buckets.insert(entry.entry.key);
+      }
+    }while(truncated && count < max_entries);
+
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+// Read both the pending and recovering bucket shard sets for one datalog
+// shard, running the two coroutines concurrently on a private coroutine
+// manager (run_sync() may own the shared one). Returns crs.run()'s result.
+int RGWRemoteDataLog::read_shard_status(const DoutPrefixProvider *dpp, int shard_id, set<string>& pending_buckets, set<string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries)
+{
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(driver->ctx(), driver->getRados()->get_cr_registry());
+  RGWHTTPManager http_manager(driver->ctx(), crs.get_completion_mgr());
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+  // shallow copies of the sync env/ctx pointed at the local http manager
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+  RGWDataSyncCtx sc_local = sc;
+  sc_local.env = &sync_env_local;
+  list<RGWCoroutinesStack *> stacks;
+  RGWCoroutinesStack* recovering_stack = new RGWCoroutinesStack(driver->ctx(), &crs);
+  recovering_stack->call(new RGWReadRecoveringBucketShardsCoroutine(&sc_local, shard_id, recovering_buckets, max_entries));
+  stacks.push_back(recovering_stack);
+  RGWCoroutinesStack* pending_stack = new RGWCoroutinesStack(driver->ctx(), &crs);
+  pending_stack->call(new RGWReadPendingBucketShardsCoroutine(&sc_local, shard_id, pending_buckets, sync_marker, max_entries));
+  stacks.push_back(pending_stack);
+  ret = crs.run(dpp, stacks);
+  http_manager.stop();
+  return ret;
+}
+
+CephContext *RGWBucketPipeSyncStatusManager::get_cct() const
+{
+  return driver->ctx();
+}
+
+void rgw_bucket_entry_owner::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("ID", id, obj);
+  JSONDecoder::decode_json("DisplayName", display_name, obj);
+}
+
// One entry of a remote bucket "versions" listing, decoded from the
// rgw-extended JSON returned by the source zone (see RGWListRemoteBucketCR).
struct bucket_list_entry {
  bool delete_marker;
  rgw_obj_key key;
  bool is_latest;
  real_time mtime;
  string etag;
  uint64_t size;
  string storage_class;
  rgw_bucket_entry_owner owner;
  uint64_t versioned_epoch;
  string rgw_tag;

  bucket_list_entry() : delete_marker(false), is_latest(false), size(0), versioned_epoch(0) {}

  void decode_json(JSONObj *obj) {
    JSONDecoder::decode_json("IsDeleteMarker", delete_marker, obj);
    JSONDecoder::decode_json("Key", key.name, obj);
    JSONDecoder::decode_json("VersionId", key.instance, obj);
    JSONDecoder::decode_json("IsLatest", is_latest, obj);
    string mtime_str;
    JSONDecoder::decode_json("RgwxMtime", mtime_str, obj);

    // "RgwxMtime" is an ISO-8601 timestamp; on parse failure mtime is
    // simply left at its default-constructed value
    struct tm t;
    uint32_t nsec;
    if (parse_iso8601(mtime_str.c_str(), &t, &nsec)) {
      ceph_timespec ts;
      ts.tv_sec = (uint64_t)internal_timegm(&t);
      ts.tv_nsec = nsec;
      mtime = real_clock::from_ceph_timespec(ts);
    }
    JSONDecoder::decode_json("ETag", etag, obj);
    JSONDecoder::decode_json("Size", size, obj);
    JSONDecoder::decode_json("StorageClass", storage_class, obj);
    JSONDecoder::decode_json("Owner", owner, obj);
    JSONDecoder::decode_json("VersionedEpoch", versioned_epoch, obj);
    JSONDecoder::decode_json("RgwxTag", rgw_tag, obj);
    // a "null" version id with no versioned epoch denotes a plain
    // unversioned object; treat it as having no instance at all
    if (key.instance == "null" && !versioned_epoch) {
      key.instance.clear();
    }
  }

  // Map the listing entry onto the bilog op that would have produced it, so
  // full sync can reuse the same per-entry handling as incremental sync.
  RGWModifyOp get_modify_op() const {
    if (delete_marker) {
      return CLS_RGW_OP_LINK_OLH_DM;
    } else if (!key.instance.empty() && key.instance != "null") {
      return CLS_RGW_OP_LINK_OLH;
    } else {
      return CLS_RGW_OP_ADD;
    }
  }
};
+
+struct bucket_list_result {
+  string name;
+  string prefix;
+  string key_marker;
+  string version_id_marker;
+  int max_keys;
+  bool is_truncated;
+  list<bucket_list_entry> entries;
+
+  bucket_list_result() : max_keys(0), is_truncated(false) {}
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("Name", name, obj);
+    JSONDecoder::decode_json("Prefix", prefix, obj);
+    JSONDecoder::decode_json("KeyMarker", key_marker, obj);
+    JSONDecoder::decode_json("VersionIdMarker", version_id_marker, obj);
+    JSONDecoder::decode_json("MaxKeys", max_keys, obj);
+    JSONDecoder::decode_json("IsTruncated", is_truncated, obj);
+    JSONDecoder::decode_json("Entries", entries, obj);
+  }
+};
+
// Coroutine that fetches one page of a remote bucket's versions listing,
// starting at marker_position, into *result.
class RGWListRemoteBucketCR: public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  const rgw_bucket_shard& bs;
  rgw_obj_key marker_position;   // resume point: key name + version id

  bucket_list_result *result;    // owned by the caller

public:
  RGWListRemoteBucketCR(RGWDataSyncCtx *_sc, const rgw_bucket_shard& bs,
                        rgw_obj_key& _marker_position, bucket_list_result *_result)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), bs(bs),
      marker_position(_marker_position), result(_result) {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      yield {
        // rgw-specific listing: all versions, JSON, resuming at the marker
        rgw_http_param_pair pairs[] = { { "versions" , NULL },
                                       { "format" , "json" },
                                       { "objs-container" , "true" },
                                       { "key-marker" , marker_position.name.c_str() },
                                       { "version-id-marker" , marker_position.instance.c_str() },
                                       { NULL, NULL } };
        string p = string("/") + bs.bucket.get_key(':', 0);
        call(new RGWReadRESTResourceCR<bucket_list_result>(sync_env->cct, sc->conn, sync_env->http_manager, p, pairs, result));
      }
      if (retcode < 0) {
        return set_cr_error(retcode);
      }
      return set_cr_done();
    }
    return 0;
  }
};
+
+struct next_bilog_result {
+  uint64_t generation = 0;
+  int num_shards = 0;
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("generation", generation, obj);
+    JSONDecoder::decode_json("num_shards", num_shards, obj);
+  }
+};
+
+struct bilog_list_result {
+  list<rgw_bi_log_entry> entries;
+  bool truncated{false};
+  std::optional<next_bilog_result> next_log;
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("entries", entries, obj);
+    JSONDecoder::decode_json("truncated", truncated, obj);
+    JSONDecoder::decode_json("next_log", next_log, obj);
+  }
+};
+
// Coroutine that fetches one page of a bucket index log (bilog) shard from
// the remote zone via /admin/log.
class RGWListBucketIndexLogCR: public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  const string instance_key;
  string marker;

  bilog_list_result *result;        // owned by the caller
  std::optional<PerfGuard> timer;   // times the poll when counters exist
  uint64_t generation;
  // NOTE: depends on member-init order — 'generation' is declared (and thus
  // initialized) before this default initializer runs; keep the declaration
  // order if you touch these members
  std::string gen_str = std::to_string(generation);
  // NOTE(review): format_ver appears unused; the request below hardcodes
  // "format-ver" = "2" — confirm whether this member should drive it
  uint32_t format_ver{1};

public:
  RGWListBucketIndexLogCR(RGWDataSyncCtx *_sc, const rgw_bucket_shard& bs, string& _marker,
                          uint64_t _generation, bilog_list_result *_result)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      instance_key(bs.get_key()), marker(_marker), result(_result), generation(_generation) {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      if (sync_env->counters) {
        timer.emplace(sync_env->counters, sync_counters::l_poll);
      }
      yield {
        rgw_http_param_pair pairs[] = { { "bucket-instance", instance_key.c_str() },
                                       { "format" , "json" },
                                       { "marker" , marker.c_str() },
                                       { "type", "bucket-index" },
                                       { "generation", gen_str.c_str() },
                                       { "format-ver", "2"},
                                       { NULL, NULL } };

        call(new RGWReadRESTResourceCR<bilog_list_result>(sync_env->cct, sc->conn, sync_env->http_manager,
                                                      "/admin/log", pairs, result));
      }
      timer.reset();
      if (retcode < 0) {
        // count failed polls when perf counters are enabled
        if (sync_env->counters) {
          sync_env->counters->inc(sync_counters::l_poll_err);
        }
        return set_cr_error(retcode);
      }
      return set_cr_done();
    }
    return 0;
  }
};
+
+#define BUCKET_SYNC_UPDATE_MARKER_WINDOW 10
+
// Marker tracker for bucket full sync: persists the listing position (and
// running entry count) into the bucket sync status object as entries finish.
class RGWBucketFullSyncMarkerTrack : public RGWSyncShardMarkerTrack<rgw_obj_key, rgw_obj_key> {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;

  const rgw_raw_obj& status_obj;
  rgw_bucket_sync_status& sync_status;
  RGWSyncTraceNodeRef tn;
  RGWObjVersionTracker& objv_tracker;

public:
  RGWBucketFullSyncMarkerTrack(RGWDataSyncCtx *_sc,
                               const rgw_raw_obj& status_obj,
                               rgw_bucket_sync_status& sync_status,
                               RGWSyncTraceNodeRef tn,
                               RGWObjVersionTracker& objv_tracker)
    : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW),
      sc(_sc), sync_env(_sc->env), status_obj(status_obj),
      sync_status(sync_status), tn(std::move(tn)), objv_tracker(objv_tracker)
  {}


  // record the new position/count and rewrite the whole status object
  RGWCoroutine *store_marker(const rgw_obj_key& new_marker, uint64_t index_pos, const real_time& timestamp) override {
    sync_status.full.position = new_marker;
    sync_status.full.count = index_pos;

    tn->log(20, SSTR("updating marker oid=" << status_obj.oid << " marker=" << new_marker));
    return new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
        sync_env->dpp, sync_env->async_rados, sync_env->svc->sysobj,
       status_obj, sync_status, &objv_tracker);
  }

  // only the most recent marker write matters, so last caller wins
  RGWOrderCallCR *allocate_order_control_cr() override {
    return new RGWLastCallerWinsCR(sync_env->cct);
  }
};
+
+// write the incremental sync status and update 'stable_timestamp' on success
+class RGWWriteBucketShardIncSyncStatus : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  rgw_raw_obj obj;
+  rgw_bucket_shard_inc_sync_marker sync_marker;
+  ceph::real_time* stable_timestamp;
+  RGWObjVersionTracker& objv_tracker;
+  std::map<std::string, bufferlist> attrs;
+ public:
+  RGWWriteBucketShardIncSyncStatus(RGWDataSyncEnv *sync_env,
+                                   const rgw_raw_obj& obj,
+                                   const rgw_bucket_shard_inc_sync_marker& sync_marker,
+                                   ceph::real_time* stable_timestamp,
+                                   RGWObjVersionTracker& objv_tracker)
+    : RGWCoroutine(sync_env->cct), sync_env(sync_env), obj(obj),
+      sync_marker(sync_marker), stable_timestamp(stable_timestamp),
+      objv_tracker(objv_tracker)
+  {}
+  int operate(const DoutPrefixProvider *dpp) {
+    reenter(this) {
+      sync_marker.encode_attr(attrs);
+
+      yield call(new RGWSimpleRadosWriteAttrsCR(sync_env->dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                                                obj, attrs, &objv_tracker));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      if (stable_timestamp) {
+        *stable_timestamp = sync_marker.timestamp;
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
// Marker tracker for bucket incremental sync: deduplicates in-flight ops per
// object key, serializes OLH ops per object name, and periodically persists
// the lowest fully-completed bilog position (plus its timestamp).
class RGWBucketIncSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, rgw_obj_key> {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;

  rgw_raw_obj obj;   // per-shard incremental sync status object
  rgw_bucket_shard_inc_sync_marker sync_marker;

  map<rgw_obj_key, string> key_to_marker;

  struct operation {
    rgw_obj_key key;
    bool is_olh;
  };
  map<string, operation> marker_to_op;
  std::set<std::string> pending_olh; // object names with pending olh operations

  RGWSyncTraceNodeRef tn;
  RGWObjVersionTracker& objv_tracker;
  ceph::real_time* stable_timestamp;

  // called when an entry completes: drop both index directions and allow a
  // queued retry on the same key (and olh ops on the same name) to proceed
  void handle_finish(const string& marker) override {
    auto iter = marker_to_op.find(marker);
    if (iter == marker_to_op.end()) {
      return;
    }
    auto& op = iter->second;
    key_to_marker.erase(op.key);
    reset_need_retry(op.key);
    if (op.is_olh) {
      pending_olh.erase(op.key.name);
    }
    marker_to_op.erase(iter);
  }

public:
  RGWBucketIncSyncShardMarkerTrack(RGWDataSyncCtx *_sc,
                         const string& _marker_oid,
                         const rgw_bucket_shard_inc_sync_marker& _marker,
                         RGWSyncTraceNodeRef tn,
                         RGWObjVersionTracker& objv_tracker,
                         ceph::real_time* stable_timestamp)
    : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW),
      sc(_sc), sync_env(_sc->env),
      obj(sync_env->svc->zone->get_zone_params().log_pool, _marker_oid),
      sync_marker(_marker), tn(std::move(tn)), objv_tracker(objv_tracker),
      stable_timestamp(stable_timestamp)
  {}

  const rgw_raw_obj& get_obj() const { return obj; }

  // persist the new position/timestamp via RGWWriteBucketShardIncSyncStatus
  RGWCoroutine* store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
    sync_marker.position = new_marker;
    sync_marker.timestamp = timestamp;

    tn->log(20, SSTR("updating marker marker_oid=" << obj.oid << " marker=" << new_marker << " timestamp=" << timestamp));
    return new RGWWriteBucketShardIncSyncStatus(sync_env, obj, sync_marker,
                                                stable_timestamp, objv_tracker);
  }

  /*
   * create index from key -> <op, marker>, and from marker -> key
   * this is useful so that we can ensure that we only have one
   * entry for any key that is used. This is needed when doing
   * incremental sync of data, and we don't want to run multiple
   * concurrent sync operations for the same bucket shard
   * Also, we should make sure that we don't run concurrent operations on the same key with
   * different ops.
   */
  bool index_key_to_marker(const rgw_obj_key& key, const string& marker, bool is_olh) {
    auto result = key_to_marker.emplace(key, marker);
    if (!result.second) { // exists
      set_need_retry(key);
      return false;
    }
    marker_to_op[marker] = operation{key, is_olh};
    if (is_olh) {
      // prevent other olh ops from starting on this object name
      pending_olh.insert(key.name);
    }
    return true;
  }

  // whether a new op on 'key' may start now: at most one in-flight op per
  // key, and olh ops on the same object name are serialized
  bool can_do_op(const rgw_obj_key& key, bool is_olh) {
    // serialize olh ops on the same object name
    if (is_olh && pending_olh.count(key.name)) {
      tn->log(20, SSTR("sync of " << key << " waiting for pending olh op"));
      return false;
    }
    return (key_to_marker.find(key) == key_to_marker.end());
  }

  // only the most recent marker write matters, so last caller wins
  RGWOrderCallCR *allocate_order_control_cr() override {
    return new RGWLastCallerWinsCR(sync_env->cct);
  }
};
+
// Errors that should not fail the overall sync of an entry: the source
// object may already be gone (-ENOENT) or the destination may refuse the
// operation (-EPERM); everything else is reported.
static bool ignore_sync_error(int err) {
  return err == -ENOENT || err == -EPERM;
}
+
// Syncs a single bucket-log entry (one object version or delete marker) from
// the source zone: dispatches to the sync-module handler by op type, logs
// failures to the error repo, and completes the entry in the marker tracker.
// T/K are the marker types of the owning RGWSyncShardMarkerTrack (full sync
// uses rgw_obj_key markers, incremental sync uses string bilog markers).
template <class T, class K>
class RGWBucketSyncSingleEntryCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;

  rgw_bucket_sync_pipe& sync_pipe;
  rgw_bucket_shard& bs;

  rgw_obj_key key;
  bool versioned;
  std::optional<uint64_t> versioned_epoch;
  rgw_bucket_entry_owner owner;
  real_time timestamp;
  RGWModifyOp op;
  RGWPendingState op_state;

  T entry_marker;
  RGWSyncShardMarkerTrack<T, K> *marker_tracker;

  int sync_status;

  stringstream error_ss;

  // true when rgw_sync_data_inject_err_probability > 0 (testing aid)
  bool error_injection;

  RGWDataSyncModule *data_sync_module;
  
  rgw_zone_set zones_trace;

  RGWSyncTraceNodeRef tn;
  std::string zone_name;

public:
  RGWBucketSyncSingleEntryCR(RGWDataSyncCtx *_sc,
                             rgw_bucket_sync_pipe& _sync_pipe,
                             const rgw_obj_key& _key, bool _versioned,
                             std::optional<uint64_t> _versioned_epoch,
                             real_time& _timestamp,
                             const rgw_bucket_entry_owner& _owner,
                             RGWModifyOp _op, RGWPendingState _op_state,
                            const T& _entry_marker, RGWSyncShardMarkerTrack<T, K> *_marker_tracker, rgw_zone_set& _zones_trace,
                             RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sc->cct),
                                                     sc(_sc), sync_env(_sc->env),
                                                      sync_pipe(_sync_pipe), bs(_sync_pipe.info.source_bs),
                                                      key(_key), versioned(_versioned), versioned_epoch(_versioned_epoch),
                                                      owner(_owner),
                                                      timestamp(_timestamp), op(_op),
                                                      op_state(_op_state),
                                                      entry_marker(_entry_marker),
                                                      marker_tracker(_marker_tracker),
                                                      sync_status(0){
    stringstream ss;
    ss << bucket_shard_str{bs} << "/" << key << "[" << versioned_epoch.value_or(0) << "]";
    set_description() << "bucket sync single entry (source_zone=" << sc->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state;
    set_status("init");

    tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", SSTR(key));

    tn->log(20, SSTR("bucket sync single entry (source_zone=" << sc->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state));
    error_injection = (sync_env->cct->_conf->rgw_sync_data_inject_err_probability > 0);

    data_sync_module = sync_env->sync_module->get_data_handler();
    
    // record our own zone/bucket in the trace so the change isn't echoed back
    zones_trace = _zones_trace;
    zones_trace.insert(sync_env->svc->zone->get_zone().id, _sync_pipe.info.dest_bucket.get_key());

    // zone_name is only resolved when an output stream is attached (used by
    // pretty_print below); otherwise it stays empty
    if (sc->env->ostr) {
      RGWZone* z;
      if ((z = sc->env->driver->svc()->zone->find_zone(sc->source_zone))) {
	zone_name = z->name;
      }
    }
  }

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      /* skip entries that are not complete */
      if (op_state != CLS_RGW_STATE_COMPLETE) {
        goto done;
      }
      tn->set_flag(RGW_SNS_FLAG_ACTIVE);
      // retry loop: the marker tracker flags this key for retry when a
      // concurrent op on the same key raced with us
      do {
        yield {
          marker_tracker->reset_need_retry(key);
          if (key.name.empty()) {
            /* shouldn't happen */
            set_status("skipping empty entry");
            tn->log(0, "entry with empty obj name, skipping");
            goto done;
          }
          if (error_injection &&
              rand() % 10000 < cct->_conf->rgw_sync_data_inject_err_probability * 10000.0) {
            tn->log(0, SSTR(": injecting data sync error on key=" << key.name));
            retcode = -EIO;
          } else if (op == CLS_RGW_OP_ADD ||
                     op == CLS_RGW_OP_LINK_OLH) {
            set_status("syncing obj");
            tn->log(5, SSTR("bucket sync: sync obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
	   if (versioned_epoch) {
	     pretty_print(sc->env, "Syncing object s3://{}/{} version {} in sync from zone {}\n", 
			  bs.bucket.name, key, *versioned_epoch, zone_name);
	   } else {
	     pretty_print(sc->env, "Syncing object s3://{}/{} in sync from zone {}\n",
			  bs.bucket.name, key, zone_name);
	   }
            call(data_sync_module->sync_object(dpp, sc, sync_pipe, key, versioned_epoch, &zones_trace));
          } else if (op == CLS_RGW_OP_DEL || op == CLS_RGW_OP_UNLINK_INSTANCE) {
            set_status("removing obj");
	   if (versioned_epoch) {
	     pretty_print(sc->env, "Deleting object s3://{}/{} version {} in sync from zone {}\n",
			  bs.bucket.name, key, *versioned_epoch, zone_name);
	   } else {
	     pretty_print(sc->env, "Deleting object s3://{}/{} in sync from zone {}\n",
			  bs.bucket.name, key, zone_name);
	   }
            if (op == CLS_RGW_OP_UNLINK_INSTANCE) {
              versioned = true;
            }
            tn->log(10, SSTR("removing obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
            call(data_sync_module->remove_object(dpp, sc, sync_pipe, key, timestamp, versioned, versioned_epoch.value_or(0), &zones_trace));
            // our copy of the object is more recent, continue as if it succeeded
          } else if (op == CLS_RGW_OP_LINK_OLH_DM) {
            set_status("creating delete marker");
            tn->log(10, SSTR("creating delete marker: obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
            call(data_sync_module->create_delete_marker(dpp, sc, sync_pipe, key, timestamp, owner, versioned, versioned_epoch.value_or(0), &zones_trace));
          }
          tn->set_resource_name(SSTR(bucket_str_noinstance(bs.bucket) << "/" << key));
        }
        // a precondition failure means the destination already holds a newer
        // change (or policy forbids the sync); treat it as success
        if (retcode == -ERR_PRECONDITION_FAILED) {
	 pretty_print(sc->env, "Skipping object s3://{}/{} in sync from zone {}\n",
		      bs.bucket.name, key, zone_name);
          set_status("Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)");
          tn->log(0, "Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)");
          retcode = 0;
        }
      } while (marker_tracker->need_retry(key));
      {
        tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
        if (retcode >= 0) {
          tn->log(10, "success");
        } else {
          tn->log(10, SSTR("failed, retcode=" << retcode << " (" << cpp_strerror(-retcode) << ")"));
        }
      }

      if (retcode < 0 && retcode != -ENOENT) {
        set_status() << "failed to sync obj; retcode=" << retcode;
        tn->log(0, SSTR("ERROR: failed to sync object: "
            << bucket_shard_str{bs} << "/" << key.name));
        if (!ignore_sync_error(retcode)) {
          error_ss << bucket_shard_str{bs} << "/" << key.name;
          sync_status = retcode;
        }
      }
      if (!error_ss.str().empty()) {
        yield call(sync_env->error_logger->log_error_cr(dpp, sc->conn->get_remote_id(), "data", error_ss.str(), -retcode, string("failed to sync object") + cpp_strerror(-sync_status)));
      }
done:
      if (sync_status == 0) {
        /* update marker */
        set_status() << "calling marker_tracker->finish(" << entry_marker << ")";
        yield call(marker_tracker->finish(entry_marker));
        sync_status = retcode;
      }
      if (sync_status < 0) {
        return set_cr_error(sync_status);
      }
      return set_cr_done();
    }
    return 0;
  }
};
+
// Full sync of one bucket shard: repeatedly lists the remote bucket and
// spawns a RGWBucketSyncSingleEntryCR per object version, tracking progress
// via RGWBucketFullSyncMarkerTrack; on success flips the bucket sync state
// to Incremental.
class RGWBucketFullSyncCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw_bucket_sync_pipe& sync_pipe;
  rgw_bucket_sync_status& sync_status;
  rgw_bucket_shard& bs;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
  bucket_list_result list_result;
  list<bucket_list_entry>::iterator entries_iter;
  rgw_obj_key list_marker;
  bucket_list_entry *entry{nullptr};

  int total_entries{0};

  int sync_result{0};

  const rgw_raw_obj& status_obj;
  RGWObjVersionTracker& objv;

  rgw_zone_set zones_trace;

  RGWSyncTraceNodeRef tn;
  RGWBucketFullSyncMarkerTrack marker_tracker;

  // Tracks which prefix rule of the sync pipe the listing is currently
  // inside, so entries outside any configured prefix are skipped and the
  // listing marker can be fast-forwarded to the next interesting prefix.
  struct _prefix_handler {
    RGWBucketSyncFlowManager::pipe_rules_ref rules;
    RGWBucketSyncFlowManager::pipe_rules::prefix_map_t::const_iterator iter;
    std::optional<string> cur_prefix;

    void set_rules(RGWBucketSyncFlowManager::pipe_rules_ref& _rules) {
      rules = _rules;
    }

    // advance *marker to the start of the next prefix that has rules;
    // returns false when no further prefix remains to sync
    bool revalidate_marker(rgw_obj_key *marker) {
      if (cur_prefix &&
          boost::starts_with(marker->name, *cur_prefix)) {
        return true;
      }
      if (!rules) {
        return false;
      }
      iter = rules->prefix_search(marker->name);
      if (iter == rules->prefix_end()) {
        return false;
      }
      cur_prefix = iter->first;
      marker->name = *cur_prefix;
      marker->instance.clear();
      return true;
    }

    // whether this key falls under a configured prefix rule
    bool check_key_handled(const rgw_obj_key& key) {
      if (!rules) {
        return false;
      }
      if (cur_prefix &&
          boost::starts_with(key.name, *cur_prefix)) {
        return true;
      }
      iter = rules->prefix_search(key.name);
      if (iter == rules->prefix_end()) {
        return false;
      }
      cur_prefix = iter->first;
      return boost::starts_with(key.name, iter->first);
    }
  } prefix_handler;

public:
  RGWBucketFullSyncCR(RGWDataSyncCtx *_sc,
                      rgw_bucket_sync_pipe& _sync_pipe,
                      const rgw_raw_obj& status_obj,
                      boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
                      rgw_bucket_sync_status& sync_status,
                      RGWSyncTraceNodeRef tn_parent,
                      RGWObjVersionTracker& objv_tracker)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      sync_pipe(_sync_pipe), sync_status(sync_status),
      bs(_sync_pipe.info.source_bs),
      lease_cr(std::move(lease_cr)), status_obj(status_obj), objv(objv_tracker),
      tn(sync_env->sync_tracer->add_node(tn_parent, "full_sync",
                                         SSTR(bucket_shard_str{bs}))),
      marker_tracker(sc, status_obj, sync_status, tn, objv_tracker)
  {
    zones_trace.insert(sc->source_zone.id, sync_pipe.info.dest_bucket.get_key());
    prefix_handler.set_rules(sync_pipe.get_rules());
  }

  int operate(const DoutPrefixProvider *dpp) override;
};
+
int RGWBucketFullSyncCR::operate(const DoutPrefixProvider *dpp)
{
  reenter(this) {
    // resume from the persisted full-sync position and entry count
    list_marker = sync_status.full.position;

    total_entries = sync_status.full.count;
    do {
      if (lease_cr && !lease_cr->is_locked()) {
        drain_all();
        tn->log(1, "no lease or lease is lost, abort");
        return set_cr_error(-ECANCELED);
      }
      set_status("listing remote bucket");
      tn->log(20, "listing bucket for full sync");

      // fast-forward the marker to the next configured prefix; stop when no
      // prefix rule remains to be synced
      if (!prefix_handler.revalidate_marker(&list_marker)) {
        set_status() << "finished iterating over all available prefixes: last marker=" << list_marker;
        tn->log(20, SSTR("finished iterating over all available prefixes: last marker=" << list_marker));
        break;
      }

      yield call(new RGWListRemoteBucketCR(sc, bs, list_marker, &list_result));
      if (retcode < 0 && retcode != -ENOENT) {
        set_status("failed bucket listing, going down");
        drain_all();
        return set_cr_error(retcode);
      }
      if (list_result.entries.size() > 0) {
        tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
      }
      entries_iter = list_result.entries.begin();
      for (; entries_iter != list_result.entries.end(); ++entries_iter) {
        if (lease_cr && !lease_cr->is_locked()) {
          drain_all();
          tn->log(1, "no lease or lease is lost, abort");
          return set_cr_error(-ECANCELED);
        }
        tn->log(20, SSTR("[full sync] syncing object: "
            << bucket_shard_str{bs} << "/" << entries_iter->key));
        entry = &(*entries_iter);
        list_marker = entries_iter->key;
        if (!prefix_handler.check_key_handled(entries_iter->key)) {
          set_status() << "skipping entry due to policy rules: " << entries_iter->key;
          tn->log(20, SSTR("skipping entry due to policy rules: " << entries_iter->key));
          continue;
        }
        total_entries++;
        if (!marker_tracker.start(entry->key, total_entries, real_time())) {
          tn->log(0, SSTR("ERROR: cannot start syncing " << entry->key << ". Duplicate entry?"));
        } else {
          using SyncCR = RGWBucketSyncSingleEntryCR<rgw_obj_key, rgw_obj_key>;
          yield spawn(new SyncCR(sc, sync_pipe, entry->key,
                                 false, /* versioned, only matters for object removal */
                                 entry->versioned_epoch, entry->mtime,
                                 entry->owner, entry->get_modify_op(), CLS_RGW_STATE_COMPLETE,
                                 entry->key, &marker_tracker, zones_trace, tn),
                      false);
        }
        // cap the number of concurrently spawned entry syncs, harvesting
        // completions (and remembering the first error) as we go
        drain_with_cb(cct->_conf->rgw_bucket_sync_spawn_window,
                      [&](uint64_t stack_id, int ret) {
                if (ret < 0) {
                  tn->log(10, "a sync operation returned error");
                  sync_result = ret;
                }
                return 0;
              });
      }
    } while (list_result.is_truncated && sync_result == 0);
    set_status("done iterating over all objects");

    /* wait for all operations to complete */
    drain_all_cb([&](uint64_t stack_id, int ret) {
      if (ret < 0) {
        tn->log(10, "a sync operation returned error");
        sync_result = ret;
      }
      return 0;
    });
    tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
    if (lease_cr && !lease_cr->is_locked()) {
      tn->log(1, "no lease or lease is lost, abort");
      return set_cr_error(-ECANCELED);
    }
    // persist any marker updates still buffered by the tracker
    yield call(marker_tracker.flush());
    if (retcode < 0) {
      tn->log(0, SSTR("ERROR: marker_tracker.flush() returned retcode=" << retcode));
      return set_cr_error(retcode);
    }
    /* update sync state to incremental */
    if (sync_result == 0) {
      sync_status.state = BucketSyncState::Incremental;
      tn->log(5, SSTR("set bucket state=" << sync_status.state));
      yield call(new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
             dpp, sync_env->async_rados, sync_env->svc->sysobj,
              status_obj, sync_status, &objv));
      tn->log(5, SSTR("bucket status objv=" << objv));
    } else {
      tn->log(10, SSTR("backing out with sync_status=" << sync_result));
    }
    if (retcode < 0 && sync_result == 0) { /* actually tried to set incremental state and failed */
      tn->log(0, SSTR("ERROR: failed to set sync state on bucket "
          << bucket_shard_str{bs} << " retcode=" << retcode));
      return set_cr_error(retcode);
    }
    if (sync_result < 0) {
      return set_cr_error(sync_result);
    }
    return set_cr_done();
  }
  return 0;
}
+
+static bool has_olh_epoch(RGWModifyOp op) {
+  return op == CLS_RGW_OP_LINK_OLH || op == CLS_RGW_OP_UNLINK_INSTANCE;
+}
+
// Marks one bucket shard as finished with the current bilog generation in
// the bucket sync status object; once every shard is done, advances the
// status to the next generation. Uses the objv tracker to detect races and
// retries on -ECANCELED.
class RGWBucketShardIsDoneCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw_bucket_sync_status bucket_status;
  const rgw_raw_obj& bucket_status_obj;
  const int shard_id;
  RGWObjVersionTracker objv_tracker;
  const next_bilog_result& next_log;  // generation/shard-count to advance to
  const uint64_t generation;          // generation this shard just finished

public:
  RGWBucketShardIsDoneCR(RGWDataSyncCtx *_sc, const rgw_raw_obj& _bucket_status_obj,
                         int _shard_id, const next_bilog_result& _next_log, const uint64_t _gen)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      bucket_status_obj(_bucket_status_obj),
      shard_id(_shard_id), next_log(_next_log), generation(_gen) {}

  int operate(const DoutPrefixProvider* dpp) override
  {
    reenter(this) {
      do {
        // read bucket sync status
        objv_tracker.clear();
        using ReadCR = RGWSimpleRadosReadCR<rgw_bucket_sync_status>;
        yield call(new ReadCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
                              bucket_status_obj, &bucket_status, false, &objv_tracker));
        if (retcode < 0) {
          ldpp_dout(dpp, 20) << "failed to read bucket shard status: "
              << cpp_strerror(retcode) << dendl;
          return set_cr_error(retcode);
        }

        if (bucket_status.state != BucketSyncState::Incremental) {
          // exit with success to avoid stale shard being
          // retried in error repo if we lost a race
          ldpp_dout(dpp, 20) << "RGWBucketShardIsDoneCR found sync state = " << bucket_status.state << dendl;
          return set_cr_done();
        }

        if (bucket_status.incremental_gen != generation) {
          // exit with success to avoid stale shard being
          // retried in error repo if we lost a race
          ldpp_dout(dpp, 20) << "RGWBucketShardIsDoneCR expected gen: " << generation
              << ", got: " << bucket_status.incremental_gen << dendl;
          return set_cr_done();
        }

        yield {
          // update bucket_status after a shard is done with current gen
          auto& done = bucket_status.shards_done_with_gen;
          done[shard_id] = true;

          // increment gen if all shards are already done with current gen
          if (std::all_of(done.begin(), done.end(),
            [] (const bool done){return done; } )) {
            bucket_status.incremental_gen = next_log.generation;
            done.clear();
            done.resize(next_log.num_shards, false);
          }
          ldpp_dout(dpp, 20) << "bucket status incremental gen is " << bucket_status.incremental_gen << dendl;
          using WriteCR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
          call(new WriteCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
                            bucket_status_obj, bucket_status, &objv_tracker, false));
        }
        if (retcode < 0 && retcode != -ECANCELED) {
          ldpp_dout(dpp, 20) << "failed to write bucket sync status: " << cpp_strerror(retcode) << dendl;
          return set_cr_error(retcode);
        } else if (retcode >= 0) {
          return set_cr_done();
        }
      } while (retcode == -ECANCELED); // objv mismatch: re-read and retry
    }
    return 0;
  }
};
+
+// Incremental sync of a single bucket shard: consumes the shard's
+// bucket index log (bilog) from the saved marker position and replays
+// the surviving entries against the destination.  All mutable state
+// lives in members because operate() suspends at yield points.
+class RGWBucketShardIncrementalSyncCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw_bucket_sync_pipe& sync_pipe;
+  RGWBucketSyncFlowManager::pipe_rules_ref rules;   // prefix filter rules from the sync pipe
+  rgw_bucket_shard& bs;                             // source bucket shard being synced
+  const rgw_raw_obj& bucket_status_obj;
+  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;  // optional lease; abort if lost
+  bilog_list_result extended_result;                // raw result of the bilog listing
+  list<rgw_bi_log_entry> list_result;
+  int next_num_shards;                              // from extended_result.next_log, if present
+  uint64_t next_gen;
+  bool truncated;
+
+  // iteration state over list_result (persists across yields)
+  list<rgw_bi_log_entry>::iterator entries_iter, entries_end;
+  // (object, instance) -> latest (timestamp, op); used to skip entries
+  // superseded by a later op on the same key
+  map<pair<string, string>, pair<real_time, RGWModifyOp> > squash_map;
+  rgw_bucket_shard_sync_info& sync_info;
+  uint64_t generation;
+  rgw_obj_key key;
+  rgw_bi_log_entry *entry{nullptr};
+  bool updated_status{false};
+  rgw_zone_id zone_id;                              // local zone, for zones_trace redundancy checks
+  string target_location_key;
+
+  string cur_id;                                    // current entry id with shard prefix stripped
+
+  int sync_status{0};                               // first error reported by a spawned child
+  bool syncstopped{false};                          // a SYNCSTOP entry was seen in the log
+
+  RGWSyncTraceNodeRef tn;
+  RGWBucketIncSyncShardMarkerTrack marker_tracker;  // tracks/persists the inc-sync marker
+
+public:
+  RGWBucketShardIncrementalSyncCR(RGWDataSyncCtx *_sc,
+                                  rgw_bucket_sync_pipe& _sync_pipe,
+                                  const std::string& shard_status_oid,
+                                  const rgw_raw_obj& _bucket_status_obj,
+                                  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+                                  rgw_bucket_shard_sync_info& sync_info,
+                                  uint64_t generation,
+                                  RGWSyncTraceNodeRef& _tn_parent,
+                                  RGWObjVersionTracker& objv_tracker,
+                                  ceph::real_time* stable_timestamp)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      sync_pipe(_sync_pipe), bs(_sync_pipe.info.source_bs),
+      bucket_status_obj(_bucket_status_obj), lease_cr(std::move(lease_cr)),
+      sync_info(sync_info), generation(generation), zone_id(sync_env->svc->zone->get_zone().id),
+      tn(sync_env->sync_tracer->add_node(_tn_parent, "inc_sync",
+                                         SSTR(bucket_shard_str{bs}))),
+      marker_tracker(sc, shard_status_oid, sync_info.inc_marker, tn,
+                     objv_tracker, stable_timestamp)
+  {
+    set_description() << "bucket shard incremental sync bucket="
+        << bucket_shard_str{bs};
+    set_status("init");
+    rules = sync_pipe.get_rules();
+    target_location_key = sync_pipe.info.dest_bucket.get_key();
+  }
+
+  // Returns true when the pipe's prefix rules cover this key (i.e. the
+  // entry should be synced); false when there are no rules or no
+  // matching prefix.
+  bool check_key_handled(const rgw_obj_key& key) {
+    if (!rules) {
+      return false;
+    }
+    auto iter = rules->prefix_search(key.name);
+    if (iter == rules->prefix_end()) {
+      return false;
+    }
+    return boost::starts_with(key.name, iter->first);
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Main incremental-sync loop for one bucket shard:
+//   1. list a batch of bilog entries from the saved marker,
+//   2. build squash_map so only the newest relevant op per
+//      (object, instance) is replayed,
+//   3. spawn RGWBucketSyncSingleEntryCR per surviving entry, bounded by
+//      rgw_bucket_sync_spawn_window,
+//   4. repeat until the listing is exhausted, an error occurs, or a
+//      SYNCSTOP entry is found,
+//   5. on reaching the end of a generation, mark the shard done
+//      (RGWBucketShardIsDoneCR) and delete the per-shard status object.
+int RGWBucketShardIncrementalSyncCR::operate(const DoutPrefixProvider *dpp)
+{
+  int ret;
+  reenter(this) {
+    do {
+      // abort early if we no longer hold the sync lease
+      if (lease_cr && !lease_cr->is_locked()) {
+        drain_all();
+        tn->log(1, "no lease or lease is lost, abort");
+        return set_cr_error(-ECANCELED);
+      }
+      tn->log(20, SSTR("listing bilog for incremental sync; position=" << sync_info.inc_marker.position));
+      set_status() << "listing bilog; position=" << sync_info.inc_marker.position;
+      yield call(new RGWListBucketIndexLogCR(sc, bs, sync_info.inc_marker.position, generation, &extended_result));
+      if (retcode < 0 && retcode != -ENOENT) {
+        /* wait for all operations to complete */
+        drain_all();
+        return set_cr_error(retcode);
+      }
+      list_result = std::move(extended_result.entries);
+      truncated = extended_result.truncated;
+      if (extended_result.next_log) {
+        next_gen = extended_result.next_log->generation;
+        next_num_shards = extended_result.next_log->num_shards;
+      }
+
+      // first pass: record, per (object, instance), the newest
+      // timestamp/op that is actually eligible for sync
+      squash_map.clear();
+      entries_iter = list_result.begin();
+      entries_end = list_result.end();
+      for (; entries_iter != entries_end; ++entries_iter) {
+        auto e = *entries_iter;
+        if (e.op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP) {
+          ldpp_dout(dpp, 20) << "syncstop at: " << e.timestamp << ". marker: " << e.id << dendl;
+          syncstopped = true;
+          entries_end = std::next(entries_iter); // stop after this entry
+          break;
+        }
+        if (e.op == RGWModifyOp::CLS_RGW_OP_RESYNC) {
+          ldpp_dout(dpp, 20) << "syncstart at: " << e.timestamp << ". marker: " << e.id << dendl;
+          continue;
+        }
+        if (e.op == CLS_RGW_OP_CANCEL) {
+          continue;
+        }
+        if (e.state != CLS_RGW_STATE_COMPLETE) {
+          continue;
+        }
+        // entry already visited this zone/target: nothing to do
+        if (e.zones_trace.exists(zone_id.id, target_location_key)) {
+          continue;
+        }
+        auto& squash_entry = squash_map[make_pair(e.object, e.instance)];
+        // don't squash over olh entries - we need to apply their olh_epoch
+        if (has_olh_epoch(squash_entry.second) && !has_olh_epoch(e.op)) {
+          continue;
+        }
+        if (squash_entry.first <= e.timestamp) {
+          squash_entry = make_pair<>(e.timestamp, e.op);
+        }
+      }
+
+      // second pass: replay each entry that survived squashing
+      entries_iter = list_result.begin();
+      for (; entries_iter != entries_end; ++entries_iter) {
+        if (lease_cr && !lease_cr->is_locked()) {
+          drain_all();
+          tn->log(1, "no lease or lease is lost, abort");
+          return set_cr_error(-ECANCELED);
+        }
+        entry = &(*entries_iter);
+        {
+          ssize_t p = entry->id.find('#'); /* entries might have explicit shard info in them, e.g., 6#00000000004.94.3 */
+          if (p < 0) {
+            cur_id = entry->id;
+          } else {
+            cur_id = entry->id.substr(p + 1);
+          }
+        }
+        sync_info.inc_marker.position = cur_id;
+
+        if (entry->op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP || entry->op == RGWModifyOp::CLS_RGW_OP_RESYNC) {
+          ldpp_dout(dpp, 20) << "detected syncstop or resync on " << entries_iter->timestamp << ", skipping entry" << dendl;
+          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+          continue;
+        }
+
+        if (!key.set(rgw_obj_index_key{entry->object, entry->instance})) {
+          set_status() << "parse_raw_oid() on " << entry->object << " returned false, skipping entry";
+          tn->log(20, SSTR("parse_raw_oid() on " << entry->object << " returned false, skipping entry"));
+          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+          continue;
+        }
+
+        tn->log(20, SSTR("parsed entry: id=" << cur_id << " iter->object=" << entry->object << " iter->instance=" << entry->instance << " name=" << key.name << " instance=" << key.instance << " ns=" << key.ns));
+
+        if (!key.ns.empty()) {
+          set_status() << "skipping entry in namespace: " << entry->object;
+          tn->log(20, SSTR("skipping entry in namespace: " << entry->object));
+          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+          continue;
+        }
+
+        if (!check_key_handled(key)) {
+          set_status() << "skipping entry due to policy rules: " << entry->object;
+          tn->log(20, SSTR("skipping entry due to policy rules: " << entry->object));
+          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+          continue;
+        }
+
+        set_status() << "got entry.id=" << cur_id << " key=" << key << " op=" << (int)entry->op;
+        if (entry->op == CLS_RGW_OP_CANCEL) {
+          set_status() << "canceled operation, skipping";
+          tn->log(20, SSTR("skipping object: "
+              << bucket_shard_str{bs} << "/" << key << ": canceled operation"));
+          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+          continue;
+        }
+        if (entry->state != CLS_RGW_STATE_COMPLETE) {
+          set_status() << "non-complete operation, skipping";
+          tn->log(20, SSTR("skipping object: "
+              << bucket_shard_str{bs} << "/" << key << ": non-complete operation"));
+          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+          continue;
+        }
+        if (entry->zones_trace.exists(zone_id.id, target_location_key)) {
+          set_status() << "redundant operation, skipping";
+          tn->log(20, SSTR("skipping object: "
+              <<bucket_shard_str{bs} <<"/"<<key<<": redundant operation"));
+          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+          continue;
+        }
+        // only replay the entry chosen by the squash pass for this key
+        if (make_pair<>(entry->timestamp, entry->op) != squash_map[make_pair(entry->object, entry->instance)]) {
+          set_status() << "squashed operation, skipping";
+          tn->log(20, SSTR("skipping object: "
+              << bucket_shard_str{bs} << "/" << key << ": squashed operation"));
+          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+          continue;
+        }
+        tn->set_flag(RGW_SNS_FLAG_ACTIVE);
+        tn->log(20, SSTR("syncing object: "
+            << bucket_shard_str{bs} << "/" << key));
+        updated_status = false;
+        // wait out any in-flight op on the same key before starting ours
+        while (!marker_tracker.can_do_op(key, has_olh_epoch(entry->op))) {
+          if (!updated_status) {
+            set_status() << "can't do op, conflicting inflight operation";
+            updated_status = true;
+          }
+          tn->log(5, SSTR("can't do op on key=" << key << " need to wait for conflicting operation to complete"));
+          yield wait_for_child();
+          bool again = true;
+          while (again) {
+            again = collect(&ret, nullptr);
+            if (ret < 0) {
+              tn->log(0, SSTR("ERROR: a child operation returned error (ret=" << ret << ")"));
+              sync_status = ret;
+              /* we have reported this error */
+            }
+          }
+          if (sync_status != 0)
+            break;
+        }
+        if (sync_status != 0) {
+          /* get error, stop */
+          break;
+        }
+        if (!marker_tracker.index_key_to_marker(key, cur_id, has_olh_epoch(entry->op))) {
+          set_status() << "can't do op, sync already in progress for object";
+          tn->log(20, SSTR("skipping sync of entry: " << cur_id << ":" << key << " sync already in progress for object"));
+          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+          continue;
+        }
+        // yield {
+          set_status() << "start object sync";
+          if (!marker_tracker.start(cur_id, 0, entry->timestamp)) {
+            tn->log(0, SSTR("ERROR: cannot start syncing " << cur_id << ". Duplicate entry?"));
+          } else {
+            std::optional<uint64_t> versioned_epoch;
+            rgw_bucket_entry_owner owner(entry->owner, entry->owner_display_name);
+            if (entry->ver.pool < 0) {
+              versioned_epoch = entry->ver.epoch;
+            }
+            tn->log(20, SSTR("entry->timestamp=" << entry->timestamp));
+            using SyncCR = RGWBucketSyncSingleEntryCR<string, rgw_obj_key>;
+            spawn(new SyncCR(sc, sync_pipe, key,
+                             entry->is_versioned(), versioned_epoch,
+                             entry->timestamp, owner, entry->op, entry->state,
+                             cur_id, &marker_tracker, entry->zones_trace, tn),
+                  false);
+          }
+        // }
+        // throttle: block once the number of in-flight children reaches
+        // the configured spawn window
+        drain_with_cb(cct->_conf->rgw_bucket_sync_spawn_window,
+                      [&](uint64_t stack_id, int ret) {
+                if (ret < 0) {
+                  tn->log(10, "a sync operation returned error");
+                  sync_status = ret;
+                }
+                return 0;
+              });
+      }
+
+    } while (!list_result.empty() && sync_status == 0 && !syncstopped);
+
+    // wait for all remaining children, collecting any error
+    drain_all_cb([&](uint64_t stack_id, int ret) {
+      if (ret < 0) {
+        tn->log(10, "a sync operation returned error");
+        sync_status = ret;
+      }
+      return 0;
+    });
+    tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+
+    if (syncstopped) {
+      // transition to StateStopped in RGWSyncBucketShardCR. if sync is
+      // still disabled, we'll delete the sync status object. otherwise we'll
+      // restart full sync to catch any changes that happened while sync was
+      // disabled
+      sync_info.state = rgw_bucket_shard_sync_info::StateStopped;
+      return set_cr_done();
+    }
+
+    // persist the final marker position before declaring success
+    yield call(marker_tracker.flush());
+    if (retcode < 0) {
+      tn->log(0, SSTR("ERROR: marker_tracker.flush() returned retcode=" << retcode));
+      return set_cr_error(retcode);
+    }
+    if (sync_status < 0) {
+      tn->log(10, SSTR("backing out with sync_status=" << sync_status));
+      return set_cr_error(sync_status);
+    }
+
+    // end of this generation's log: mark the shard done and clean up
+    // its per-shard status object
+    if (!truncated && extended_result.next_log) {
+      yield call(new RGWBucketShardIsDoneCR(sc, bucket_status_obj, bs.shard_id, *extended_result.next_log, generation));
+      if (retcode < 0) {
+        ldout(cct, 20) << "failed to update bucket sync status: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+      yield {
+        // delete the shard status object
+        auto status_obj = sync_env->svc->rados->obj(marker_tracker.get_obj());
+        retcode = status_obj.open(dpp);
+        if (retcode < 0) {
+          return set_cr_error(retcode);
+        }
+        call(new RGWRadosRemoveOidCR(sync_env->driver, std::move(status_obj)));
+        if (retcode < 0) {
+          ldpp_dout(dpp, 20) << "failed to remove shard status object: " << cpp_strerror(retcode) << dendl;
+          return set_cr_error(retcode);
+        }
+      }
+    }
+
+    return set_cr_done();
+  }
+  return 0;
+}
+
+class RGWGetBucketPeersCR : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+
+  std::optional<rgw_bucket> target_bucket;
+  std::optional<rgw_zone_id> source_zone;
+  std::optional<rgw_bucket> source_bucket;
+
+  rgw_sync_pipe_info_set *pipes;
+  map<rgw_bucket, all_bucket_info> buckets_info;
+  map<rgw_bucket, all_bucket_info>::iterator siiter;
+  std::optional<all_bucket_info> target_bucket_info;
+  std::optional<all_bucket_info> source_bucket_info;
+
+  rgw_sync_pipe_info_set::iterator siter;
+
+  std::shared_ptr<rgw_bucket_get_sync_policy_result> source_policy;
+  std::shared_ptr<rgw_bucket_get_sync_policy_result> target_policy;
+
+  RGWSyncTraceNodeRef tn;
+
+  using pipe_const_iter = map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>::const_iterator;
+
+  static pair<pipe_const_iter, pipe_const_iter> get_pipe_iters(const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& m, std::optional<rgw_zone_id> zone) {
+    if (!zone) {
+      return { m.begin(), m.end() };
+    }
+
+    auto b = m.find(*zone);
+    if (b == m.end()) {
+      return { b, b };
+    }
+    return { b, std::next(b) };
+  }
+
+  void filter_sources(std::optional<rgw_zone_id> source_zone,
+                      std::optional<rgw_bucket> source_bucket,
+                      const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& all_sources,
+                      rgw_sync_pipe_info_set *result) {
+    ldpp_dout(sync_env->dpp, 20) << __func__ << ": source_zone=" << source_zone.value_or(rgw_zone_id("*")).id
+                                << " source_bucket=" << source_bucket.value_or(rgw_bucket())
+                                << " all_sources.size()=" << all_sources.size() << dendl;
+    auto iters = get_pipe_iters(all_sources, source_zone);
+    for (auto i = iters.first; i != iters.second; ++i) {
+      for (auto& handler : i->second) {
+        if (!handler.specific()) {
+          ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": skipping" << dendl;
+          continue;
+        }
+        if (source_bucket &&
+            !source_bucket->match(*handler.source.bucket)) {
+          continue;
+        }
+        ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": adding" << dendl;
+        result->insert(handler, source_bucket_info, target_bucket_info);
+      }
+    }
+  }
+
+  void filter_targets(std::optional<rgw_zone_id> target_zone,
+                      std::optional<rgw_bucket> target_bucket,
+                      const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& all_targets,
+                      rgw_sync_pipe_info_set *result) {
+    ldpp_dout(sync_env->dpp, 20) << __func__ << ": target_zone=" << source_zone.value_or(rgw_zone_id("*")).id
+                                << " target_bucket=" << source_bucket.value_or(rgw_bucket())
+                                << " all_targets.size()=" << all_targets.size() << dendl;
+    auto iters = get_pipe_iters(all_targets, target_zone);
+    for (auto i = iters.first; i != iters.second; ++i) {
+      for (auto& handler : i->second) {
+        if (target_bucket &&
+            handler.dest.bucket &&
+            !target_bucket->match(*handler.dest.bucket)) {
+          ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": skipping" << dendl;
+          continue;
+        }
+        ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": adding" << dendl;
+        result->insert(handler, source_bucket_info, target_bucket_info);
+      }
+    }
+  }
+
+  void update_from_target_bucket_policy();
+  void update_from_source_bucket_policy();
+
+  struct GetHintTargets : public RGWGenericAsyncCR::Action {
+    RGWDataSyncEnv *sync_env;
+    rgw_bucket source_bucket;
+    std::set<rgw_bucket> targets;
+    
+    GetHintTargets(RGWDataSyncEnv *_sync_env,
+                   const rgw_bucket& _source_bucket) : sync_env(_sync_env),
+                                                       source_bucket(_source_bucket) {}
+    int operate() override {
+      int r = sync_env->svc->bucket_sync->get_bucket_sync_hints(sync_env->dpp, 
+                                                                source_bucket,
+                                                                nullptr,
+                                                                &targets,
+                                                                null_yield);
+      if (r < 0) {
+        ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): failed to fetch bucket sync hints for bucket=" << source_bucket << dendl;
+        return r;
+      }
+
+      return 0;
+    }
+  };
+
+  std::shared_ptr<GetHintTargets> get_hint_targets_action;
+  std::set<rgw_bucket>::iterator hiter;
+
+public:
+  RGWGetBucketPeersCR(RGWDataSyncEnv *_sync_env,
+                      std::optional<rgw_bucket> _target_bucket,
+                      std::optional<rgw_zone_id> _source_zone,
+                      std::optional<rgw_bucket> _source_bucket,
+                      rgw_sync_pipe_info_set *_pipes,
+                      const RGWSyncTraceNodeRef& _tn_parent)
+    : RGWCoroutine(_sync_env->cct),
+      sync_env(_sync_env),
+      target_bucket(_target_bucket),
+      source_zone(_source_zone),
+      source_bucket(_source_bucket),
+      pipes(_pipes),
+      tn(sync_env->sync_tracer->add_node(_tn_parent, "get_bucket_peers",
+                                         SSTR( "target=" << target_bucket.value_or(rgw_bucket())
+                                               << ":source=" << target_bucket.value_or(rgw_bucket())
+                                               << ":source_zone=" << source_zone.value_or(rgw_zone_id("*")).id))) {
+      }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Print an optional bucket shard; an empty optional renders as "*"
+// (meaning "any shard").
+std::ostream& operator<<(std::ostream& out, std::optional<rgw_bucket_shard>& bs) {
+  if (bs) {
+    return out << *bs;
+  }
+  return out << "*";
+}
+
+// Forward declaration (the static definition appears later in this
+// file): builds the coroutine that syncs one bucket shard for the
+// given sync pair, optionally pinned to a specific log generation.
+static RGWCoroutine* sync_bucket_shard_cr(RGWDataSyncCtx* sc,
+                                          boost::intrusive_ptr<const RGWContinuousLeaseCR> lease,
+                                          const rgw_bucket_sync_pair_info& sync_pair,
+                                          std::optional<uint64_t> gen,
+                                          const RGWSyncTraceNodeRef& tn,
+                                          ceph::real_time* progress);
+
+// Sync a single source bucket shard into all of its peers: operate()
+// resolves the matching sync pipes via RGWGetBucketPeersCR and spawns
+// one bucket-shard sync coroutine per pipe.  Only the source shard is
+// fixed here; the destination of sync_pair is filled per pipe later.
+RGWRunBucketSourcesSyncCR::RGWRunBucketSourcesSyncCR(RGWDataSyncCtx *_sc,
+                                                     boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+                                                     const rgw_bucket_shard& source_bs,
+                                                     const RGWSyncTraceNodeRef& _tn_parent,
+                                                    std::optional<uint64_t> gen,
+                                                     ceph::real_time* progress)
+  : RGWCoroutine(_sc->env->cct), sc(_sc), sync_env(_sc->env),
+    lease_cr(std::move(lease_cr)),
+    tn(sync_env->sync_tracer->add_node(
+        _tn_parent, "bucket_sync_sources",
+        SSTR( "source=" << source_bs << ":source_zone=" << sc->source_zone))),
+    progress(progress),
+    gen(gen)
+{
+  sync_pair.source_bs = source_bs;
+}
+
+// Find every sync pipe whose source matches sync_pair.source_bs, then
+// run one bucket-shard sync coroutine per pipe (throttled by
+// rgw_bucket_sync_spawn_window).  On success, *progress is set to the
+// minimum progress timestamp across all pipes.
+int RGWRunBucketSourcesSyncCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    yield call(new RGWGetBucketPeersCR(sync_env, std::nullopt, sc->source_zone,
+                                       sync_pair.source_bs.bucket, &pipes, tn));
+    if (retcode < 0 && retcode != -ENOENT) {
+      tn->log(0, SSTR("ERROR: failed to read sync status for bucket. error: " << retcode));
+      return set_cr_error(retcode);
+    }
+
+    ldpp_dout(dpp, 20) << __func__ << "(): requested source_bs=" << sync_pair.source_bs << dendl;
+
+    if (pipes.empty()) {
+      ldpp_dout(dpp, 20) << __func__ << "(): no relevant sync pipes found" << dendl;
+      return set_cr_done();
+    }
+
+    // one progress slot per pipe; cur_shard_progress advances in
+    // lockstep with siter below
+    shard_progress.resize(pipes.size());
+    cur_shard_progress = shard_progress.begin();
+
+    for (siter = pipes.begin(); siter != pipes.end(); ++siter, ++cur_shard_progress) {
+      ldpp_dout(dpp, 20) << __func__ << "(): sync pipe=" << *siter << dendl;
+
+      sync_pair.dest_bucket = siter->target.get_bucket();
+      sync_pair.handler = siter->handler;
+
+      ldpp_dout(dpp, 20) << __func__ << "(): sync_pair=" << sync_pair << dendl;
+
+      yield_spawn_window(sync_bucket_shard_cr(sc, lease_cr, sync_pair,
+                                              gen, tn, &*cur_shard_progress),
+                         cct->_conf->rgw_bucket_sync_spawn_window,
+                         [&](uint64_t stack_id, int ret) {
+                           if (ret < 0) {
+                             tn->log(10, SSTR("ERROR: a sync operation returned error: " << ret));
+                           }
+                           return ret;
+                         });
+    }
+    drain_all_cb([&](uint64_t stack_id, int ret) {
+                   if (ret < 0) {
+                     tn->log(10, SSTR("a sync operation returned error: " << ret));
+                   }
+                   return ret;
+                 });
+    if (progress) {
+      // report the slowest pipe's progress as the overall progress
+      *progress = *std::min_element(shard_progress.begin(), shard_progress.end());
+    }
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+// Fetch bucket instance info (and optionally attrs) for a bucket; if
+// the instance is not known locally, operate() pulls the
+// "bucket.instance:" metadata entry via metadata sync and retries.
+class RGWSyncGetBucketInfoCR : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  rgw_bucket bucket;
+  RGWBucketInfo *pbucket_info;        // out; may be nullptr
+  map<string, bufferlist> *pattrs;    // out; may be nullptr
+  RGWMetaSyncEnv meta_sync_env;       // set up lazily for the metadata fetch
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWSyncGetBucketInfoCR(RGWDataSyncEnv *_sync_env,
+                         const rgw_bucket& _bucket,
+                         RGWBucketInfo *_pbucket_info,
+                         map<string, bufferlist> *_pattrs,
+                         const RGWSyncTraceNodeRef& _tn_parent)
+    : RGWCoroutine(_sync_env->cct),
+      sync_env(_sync_env),
+      bucket(_bucket),
+      pbucket_info(_pbucket_info),
+      pattrs(_pattrs),
+      tn(sync_env->sync_tracer->add_node(_tn_parent, "get_bucket_info",
+                                         SSTR(bucket))) {
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Try the local bucket instance info first; on -ENOENT, sync the
+// "bucket.instance:" metadata entry from the master zone connection
+// and retry the local lookup once.
+int RGWSyncGetBucketInfoCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->driver, bucket, pbucket_info, pattrs, dpp));
+    if (retcode == -ENOENT) {
+      /* bucket instance info has not been synced in yet, fetch it now */
+      yield {
+        tn->log(10, SSTR("no local info for bucket:" << ": fetching metadata"));
+        string raw_key = string("bucket.instance:") + bucket.get_key();
+
+        meta_sync_env.init(dpp, cct, sync_env->driver, sync_env->svc->zone->get_master_conn(), sync_env->async_rados,
+                           sync_env->http_manager, sync_env->error_logger, sync_env->sync_tracer);
+
+        call(new RGWMetaSyncSingleEntryCR(&meta_sync_env, raw_key,
+                                          string() /* no marker */,
+                                          MDLOG_STATUS_COMPLETE,
+                                          NULL /* no marker tracker */,
+                                          tn));
+      }
+      if (retcode < 0) {
+        tn->log(0, SSTR("ERROR: failed to fetch bucket instance info for " << bucket_str{bucket}));
+        return set_cr_error(retcode);
+      }
+
+      // metadata fetched; repeat the local lookup
+      yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->driver, bucket, pbucket_info, pattrs, dpp));
+    }
+    if (retcode < 0) {
+      tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{bucket}));
+      return set_cr_error(retcode);
+    }
+
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+void RGWGetBucketPeersCR::update_from_target_bucket_policy()
+{
+  if (!target_policy ||
+      !target_policy->policy_handler ||
+      !pipes) {
+    return;
+  }
+
+  auto handler = target_policy->policy_handler.get();
+
+  filter_sources(source_zone,
+                 source_bucket,
+                 handler->get_sources(),
+                 pipes);
+
+  for (siter = pipes->begin(); siter != pipes->end(); ++siter) {
+    if (!siter->source.has_bucket_info()) {
+      buckets_info.emplace(siter->source.get_bucket(), all_bucket_info());
+    }
+    if (!siter->target.has_bucket_info()) {
+      buckets_info.emplace(siter->target.get_bucket(), all_bucket_info());
+    }
+  }
+}
+
+void RGWGetBucketPeersCR::update_from_source_bucket_policy()
+{
+  if (!source_policy ||
+      !source_policy->policy_handler ||
+      !pipes) {
+    return;
+  }
+
+  auto handler = source_policy->policy_handler.get();
+
+  filter_targets(sync_env->svc->zone->get_zone().id,
+                 target_bucket,
+                 handler->get_targets(),
+                 pipes);
+
+  for (siter = pipes->begin(); siter != pipes->end(); ++siter) {
+    if (!siter->source.has_bucket_info()) {
+      buckets_info.emplace(siter->source.get_bucket(), all_bucket_info());
+    }
+    if (!siter->target.has_bucket_info()) {
+      buckets_info.emplace(siter->target.get_bucket(), all_bucket_info());
+    }
+  }
+}
+
+
+// Load the sync-policy handler for a bucket.  If the bucket instance
+// is unknown (-ENOENT), fetch its info via RGWSyncGetBucketInfoCR
+// (which can trigger metadata sync) and retry once.
+class RGWSyncGetBucketSyncPolicyHandlerCR : public RGWCoroutine {
+  RGWDataSyncEnv *sync_env;
+  rgw_bucket bucket;
+  rgw_bucket_get_sync_policy_params get_policy_params;
+
+  std::shared_ptr<rgw_bucket_get_sync_policy_result> policy;  // out
+
+  RGWSyncTraceNodeRef tn;
+
+  // loop counter lives in a member because the loop body yields
+  int i;
+
+public:
+  RGWSyncGetBucketSyncPolicyHandlerCR(RGWDataSyncEnv *_sync_env,
+                         std::optional<rgw_zone_id> zone,
+                         const rgw_bucket& _bucket,
+                         std::shared_ptr<rgw_bucket_get_sync_policy_result>& _policy,
+                         const RGWSyncTraceNodeRef& _tn_parent)
+    : RGWCoroutine(_sync_env->cct),
+      sync_env(_sync_env),
+      bucket(_bucket),
+      policy(_policy),
+      tn(sync_env->sync_tracer->add_node(_tn_parent, "get_sync_policy_handler",
+                                         SSTR(bucket))) {
+    get_policy_params.zone = zone;
+    get_policy_params.bucket = bucket;
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      // two attempts: the second runs after a bucket-info fetch that
+      // may have pulled the missing bucket instance via metadata sync
+      for (i = 0; i < 2; ++i) {
+        yield call(new RGWBucketGetSyncPolicyHandlerCR(sync_env->async_rados,
+                                                       sync_env->driver,
+                                                       get_policy_params,
+                                                       policy,
+                                                       dpp));
+        if (retcode < 0 &&
+            retcode != -ENOENT) {
+          return set_cr_error(retcode);
+        }
+
+        if (retcode == 0) {
+          return set_cr_done();
+        }
+
+        /* bucket instance was not found,
+         * try to get bucket instance info, can trigger
+         * metadata sync of bucket instance
+         */
+        yield call(new RGWSyncGetBucketInfoCR(sync_env, 
+                                              bucket, 
+                                              nullptr,
+                                              nullptr,
+                                              tn));
+        if (retcode < 0) {
+          return set_cr_error(retcode);
+        }
+      }
+    }
+
+    // NOTE(review): if both attempts return -ENOENT we fall out of the
+    // loop and return 0 without set_cr_done()/set_cr_error() — confirm
+    // this is the intended completion path.
+    return 0;
+  }
+};
+
+
+// Resolve the set of sync pipes ("peers") involved in syncing a bucket.
+//
+// Driven by which members were set by the caller:
+//  - target_bucket: fetch its sync-policy handler and fold its pipes in;
+//  - source_bucket + source_zone: fetch the source's policy handler (caching
+//    its bucket info/attrs); if no target bucket was given, expand the
+//    source's sync hints into additional target policies;
+//  - finally, back-fill bucket info for entries collected with an empty
+//    bucket name (hints may carry incomplete bucket ids) and publish the
+//    collected info into *pipes.
+// -ENOENT from policy lookups is tolerated; other errors abort the CR.
+int RGWGetBucketPeersCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    if (pipes) {
+      pipes->clear();
+    }
+    if (target_bucket) {
+      // Destination side: look up the target bucket's sync policy.
+      target_policy = make_shared<rgw_bucket_get_sync_policy_result>();
+      yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
+                                                         nullopt,
+                                                         *target_bucket,
+                                                         target_policy,
+                                                         tn));
+      if (retcode < 0 &&
+          retcode != -ENOENT) {
+        return set_cr_error(retcode);
+      }
+
+      update_from_target_bucket_policy();
+    }
+
+    if (source_bucket && source_zone) {
+      // Source side: policy handler scoped to the requested source zone.
+      source_policy = make_shared<rgw_bucket_get_sync_policy_result>();
+      yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
+                                                         source_zone,
+                                                         *source_bucket,
+                                                         source_policy,
+                                                         tn));
+      if (retcode < 0 &&
+          retcode != -ENOENT) {
+        return set_cr_error(retcode);
+      }
+
+      if (source_policy->policy_handler) {
+        // Cache the source bucket's info/attrs when the handler carries both.
+        auto& opt_bucket_info = source_policy->policy_handler->get_bucket_info();
+        auto& opt_attrs = source_policy->policy_handler->get_bucket_attrs();
+        if (opt_bucket_info && opt_attrs) {
+          source_bucket_info.emplace();
+          source_bucket_info->bucket_info = *opt_bucket_info;
+          source_bucket_info->attrs = *opt_attrs;
+        }
+      }
+
+      if (!target_bucket) {
+        // No explicit target: discover candidate targets from the source
+        // bucket's sync hints (run via the async processor).
+        get_hint_targets_action = make_shared<GetHintTargets>(sync_env, *source_bucket);
+
+        yield call(new RGWGenericAsyncCR(cct, sync_env->async_rados,
+                                         get_hint_targets_action));
+        if (retcode < 0) {
+          return set_cr_error(retcode);
+        }
+
+        /* hints might have incomplete bucket ids,
+         * in which case we need to figure out the current
+         * bucket_id
+         */
+        for (hiter = get_hint_targets_action->targets.begin();
+             hiter != get_hint_targets_action->targets.end();
+             ++hiter) {
+          ldpp_dout(dpp, 20) << "Got sync hint for bucket=" << *source_bucket << ": " << hiter->get_key() << dendl;
+
+          // Fold in the policy of each hinted target bucket.
+          target_policy = make_shared<rgw_bucket_get_sync_policy_result>();
+          yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
+                                                             nullopt,
+                                                             *hiter,
+                                                             target_policy,
+                                                             tn));
+          if (retcode < 0 &&
+              retcode != -ENOENT) {
+            return set_cr_error(retcode);
+          }
+          update_from_target_bucket_policy();
+        }
+      }
+    }
+
+    update_from_source_bucket_policy();
+
+    // Back-fill full bucket info for entries whose bucket name is empty.
+    // NOTE(review): retcode is not checked after these calls; lookup
+    // failures here are silently ignored.
+    for (siiter = buckets_info.begin(); siiter != buckets_info.end(); ++siiter) {
+      if (siiter->second.bucket_info.bucket.name.empty()) {
+        yield call(new RGWSyncGetBucketInfoCR(sync_env, siiter->first,
+                                              &siiter->second.bucket_info,
+                                              &siiter->second.attrs,
+                                              tn));
+      }
+    }
+
+    if (pipes) {
+      pipes->update_empty_bucket_info(buckets_info);
+    }
+
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+// Coroutine that runs incremental sync for a single bucket shard at a given
+// bilog generation.
+class RGWSyncBucketShardCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  // Lease held by the caller; forwarded to the incremental-sync CR.
+  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+  rgw_bucket_sync_pair_info sync_pair;
+  rgw_bucket_sync_pipe& sync_pipe;
+  // Reported back to the caller: set true when the source indicates that
+  // sync was stopped.
+  bool& bucket_stopped;
+  uint64_t generation;
+  ceph::real_time* progress;
+
+  // Per-shard incremental status object for this source zone + generation.
+  const std::string shard_status_oid;
+  // Bucket-wide (full) sync status object in the zone's log pool.
+  const rgw_raw_obj bucket_status_obj;
+  rgw_bucket_shard_sync_info sync_status;
+  RGWObjVersionTracker objv_tracker;
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWSyncBucketShardCR(RGWDataSyncCtx *_sc,
+                       boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+                       const rgw_bucket_sync_pair_info& _sync_pair,
+                       rgw_bucket_sync_pipe& sync_pipe,
+                       bool& bucket_stopped,
+                       uint64_t generation,
+                       const RGWSyncTraceNodeRef& tn,
+                       ceph::real_time* progress)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      lease_cr(std::move(lease_cr)), sync_pair(_sync_pair),
+      sync_pipe(sync_pipe), bucket_stopped(bucket_stopped), generation(generation), progress(progress),
+      shard_status_oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, generation)),
+      bucket_status_obj(sc->env->svc->zone->get_zone_params().log_pool,
+                 RGWBucketPipeSyncStatusManager::full_status_oid(sc->source_zone,
+                                                                 sync_pair.source_bs.bucket,
+                                                                 sync_pair.dest_bucket)),
+      tn(tn) {
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Read the shard's sync status, then run one incremental sync pass for it.
+// Sets bucket_stopped for the caller if the pass ended with a syncstopped
+// indication.
+int RGWSyncBucketShardCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    // Load this shard's status object (-ENOENT just means not created yet).
+    yield call(new RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &sync_status, &objv_tracker, generation));
+    if (retcode < 0 && retcode != -ENOENT) {
+      tn->log(0, SSTR("ERROR: failed to read sync status for bucket. error: " << retcode));
+      return set_cr_error(retcode);
+    }
+
+    tn->log(20, SSTR("sync status for source bucket shard: " << sync_status.state));
+    // This CR only handles incremental sync (full sync is driven at the
+    // bucket level), so force the state accordingly.
+    sync_status.state = rgw_bucket_shard_sync_info::StateIncrementalSync;
+    if (progress) {
+      *progress = sync_status.inc_marker.timestamp;
+    }
+
+    yield call(new RGWBucketShardIncrementalSyncCR(sc, sync_pipe,
+                                                   shard_status_oid, bucket_status_obj, lease_cr,
+                                                   sync_status, generation, tn,
+                                                   objv_tracker, progress));
+    if (retcode < 0) {
+      tn->log(5, SSTR("incremental sync on bucket failed, retcode=" << retcode));
+      return set_cr_error(retcode);
+    }
+
+    // Propagate a remote "syncstopped" indication to the caller.
+    if (sync_status.state == rgw_bucket_shard_sync_info::StateStopped) {
+      tn->log(20, SSTR("syncstopped indication for source bucket shard"));
+      bucket_stopped = true;
+    }
+
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+// Top-level coroutine for syncing one bucket sync pair: manages the
+// bucket-wide sync status object (Init/Full/Incremental/Stopped), the
+// bucket-wide lease used for init/full sync, and dispatches to the
+// full-sync / per-shard incremental CRs.
+class RGWSyncBucketCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *env;
+  // Lease owned by the datalog-driven caller; passed through to shard sync.
+  boost::intrusive_ptr<const RGWContinuousLeaseCR> data_lease_cr;
+  // Bucket-wide lease taken locally for init/full sync and state changes.
+  boost::intrusive_ptr<RGWContinuousLeaseCR> bucket_lease_cr;
+  rgw_bucket_sync_pair_info sync_pair;
+  rgw_bucket_sync_pipe sync_pipe;
+  // Specific generation requested by the caller, if any.
+  std::optional<uint64_t> gen;
+  ceph::real_time* progress;
+
+  const std::string lock_name = "bucket sync";
+  const uint32_t lock_duration;
+  // Bucket-wide sync status object in the zone's log pool.
+  const rgw_raw_obj status_obj;
+  rgw_bucket_sync_status bucket_status;
+  bool bucket_stopped = false;
+  RGWObjVersionTracker objv;
+  bool init_check_compat = false;
+  // Remote bilog info, fetched when state is not Incremental.
+  rgw_bucket_index_marker_info info;
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWSyncBucketCR(RGWDataSyncCtx *_sc,
+                  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+                  const rgw_bucket_sync_pair_info& _sync_pair,
+                  std::optional<uint64_t> gen,
+                  const RGWSyncTraceNodeRef& _tn_parent,
+                  ceph::real_time* progress)
+    : RGWCoroutine(_sc->cct), sc(_sc), env(_sc->env),
+      data_lease_cr(std::move(lease_cr)), sync_pair(_sync_pair),
+      gen(gen), progress(progress),
+      lock_duration(cct->_conf->rgw_sync_lease_period),
+      status_obj(env->svc->zone->get_zone_params().log_pool,
+                 RGWBucketPipeSyncStatusManager::full_status_oid(sc->source_zone,
+                                                                 sync_pair.source_bs.bucket,
+                                                                 sync_pair.dest_bucket)),
+      tn(env->sync_tracer->add_node(_tn_parent, "bucket",
+                                    SSTR(bucket_str{_sync_pair.dest_bucket} << "<-" << bucket_shard_str{_sync_pair.source_bs} ))) {
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+static RGWCoroutine* sync_bucket_shard_cr(RGWDataSyncCtx* sc,
+                                          boost::intrusive_ptr<const RGWContinuousLeaseCR> lease,
+                                          const rgw_bucket_sync_pair_info& sync_pair,
+                                          std::optional<uint64_t> gen,
+                                          const RGWSyncTraceNodeRef& tn,
+                                          ceph::real_time* progress)
+{
+  return new RGWSyncBucketCR(sc, std::move(lease), sync_pair,
+                             gen, tn, progress);
+}
+
+// Release a bucket-wide lease: tell the lease CR to go down, drain all
+// spawned stacks, then drop our reference. No-op when 'cr' is empty.
+// NOTE: expands to an unbraced 'if' — take care when using near 'else'.
+#define RELEASE_LOCK(cr) \
+       if (cr) {cr->go_down(); drain_all(); cr.reset();}
+
+int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    // read source/destination bucket info
+    yield call(new RGWSyncGetBucketInfoCR(env, sync_pair.source_bs.bucket, &sync_pipe.source_bucket_info,
+                                          &sync_pipe.source_bucket_attrs, tn));
+    if (retcode < 0) {
+      tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{sync_pair.source_bs.bucket}));
+      return set_cr_error(retcode);
+    }
+
+    yield call(new RGWSyncGetBucketInfoCR(env, sync_pair.dest_bucket, &sync_pipe.dest_bucket_info,
+                                          &sync_pipe.dest_bucket_attrs, tn));
+    if (retcode < 0) {
+      tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{sync_pair.source_bs.bucket}));
+      return set_cr_error(retcode);
+    }
+
+    sync_pipe.info = sync_pair;
+
+    // read bucket sync status
+    using ReadCR = RGWSimpleRadosReadCR<rgw_bucket_sync_status>;
+    using WriteCR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
+
+    yield call(new ReadCR(dpp, env->async_rados, env->svc->sysobj,
+                          status_obj, &bucket_status, false, &objv));
+    if (retcode == -ENOENT) {
+      // use exclusive create to set state=Init
+      objv.generate_new_write_ver(cct);
+      yield call(new WriteCR(dpp, env->async_rados, env->svc->sysobj,
+                             status_obj, bucket_status, &objv, true));
+      tn->log(20, "bucket status object does not exist, create a new one");
+      if (retcode == -EEXIST) {
+        // raced with another create, read its status
+        tn->log(20, "raced with another create, read its status");
+        yield call(new ReadCR(dpp, env->async_rados, env->svc->sysobj,
+                              status_obj, &bucket_status, false, &objv));
+      }
+    }
+    if (retcode < 0) {
+      tn->log(20, SSTR("ERROR: failed to read bucket status object. error: " << retcode));
+      return set_cr_error(retcode);
+    }
+
+    do {
+      tn->log(20, SSTR("sync status for source bucket: " << bucket_status.state << 
+            ". lease is: " << (bucket_lease_cr ? "taken" : "not taken") << ". stop indications is: " << bucket_stopped));
+
+      if (bucket_status.state != BucketSyncState::Incremental ||
+          bucket_stopped) { 
+        // if state is Init or Stopped, we query the remote RGW for ther state
+        yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, sync_pair.source_bs.bucket, &info));
+        if (retcode < 0) {
+          return set_cr_error(retcode);
+        }
+        if (info.syncstopped) {
+          // remote indicates stopped state
+          tn->log(20, "remote bilog indicates that sync was stopped");
+          if (!bucket_lease_cr) {
+            bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
+                                                         lock_name, lock_duration, this));
+            yield spawn(bucket_lease_cr.get(), false);
+            while (!bucket_lease_cr->is_locked()) {
+              if (bucket_lease_cr->is_done()) {
+                tn->log(5, "failed to take lease");
+                set_status("lease lock failed, early abort");
+                drain_all();
+                return set_cr_error(bucket_lease_cr->get_ret_status());
+              }
+              tn->log(5, "waiting on bucket lease");
+              yield set_sleeping(true);
+            }
+          }
+
+          // if state was incremental, remove all per-shard status objects
+          if (bucket_status.state == BucketSyncState::Incremental) {
+            yield {
+              const auto num_shards = bucket_status.shards_done_with_gen.size();
+              const auto gen = bucket_status.incremental_gen;
+              call(new RemoveBucketShardStatusCollectCR(sc, sync_pair, gen, num_shards));
+            }
+          }
+
+          // check if local state is "stopped"
+          yield call(new ReadCR(dpp, env->async_rados, env->svc->sysobj,
+                status_obj, &bucket_status, false, &objv));
+          if (retcode < 0) {
+            tn->log(20, SSTR("ERROR: failed to read status before writing 'stopped'. error: " << retcode));
+            RELEASE_LOCK(bucket_lease_cr);
+            return set_cr_error(retcode);
+          }
+          if (bucket_status.state != BucketSyncState::Stopped) {
+            // make sure that state is changed to stopped localy
+            bucket_status.state = BucketSyncState::Stopped;
+            yield call(new WriteCR(dpp, env->async_rados, env->svc->sysobj,
+                  status_obj, bucket_status, &objv, false));
+            if (retcode < 0) {
+              tn->log(20, SSTR("ERROR: failed to write 'stopped' status. error: " << retcode));
+              RELEASE_LOCK(bucket_lease_cr);
+              return set_cr_error(retcode);
+            }
+          }
+          RELEASE_LOCK(bucket_lease_cr);
+          return set_cr_done();
+        }
+        if (bucket_stopped) {
+          tn->log(20, SSTR("ERROR: switched from 'stop' to 'start' sync. while state is: " << bucket_status.state));
+          bucket_stopped = false;
+          bucket_status.state = BucketSyncState::Init;
+        }
+      }
+
+      if (bucket_status.state != BucketSyncState::Incremental) {
+        // if the state wasn't Incremental, take a bucket-wide lease to prevent
+        // different shards from duplicating the init and full sync
+        if (!bucket_lease_cr) {
+          bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
+                                                       lock_name, lock_duration, this));
+          yield spawn(bucket_lease_cr.get(), false);
+          while (!bucket_lease_cr->is_locked()) {
+            if (bucket_lease_cr->is_done()) {
+              tn->log(5, "failed to take lease");
+              set_status("lease lock failed, early abort");
+              drain_all();
+              return set_cr_error(bucket_lease_cr->get_ret_status());
+            }
+            tn->log(5, "waiting on bucket lease");
+            yield set_sleeping(true);
+          }
+        }
+
+        // reread the status after acquiring the lock
+        yield call(new ReadCR(dpp, env->async_rados, env->svc->sysobj,
+                            status_obj, &bucket_status, false, &objv));
+        if (retcode < 0) {
+          RELEASE_LOCK(bucket_lease_cr);
+          tn->log(20, SSTR("ERROR: reading the status after acquiring the lock failed. error: " << retcode));
+          return set_cr_error(retcode);
+        }
+        tn->log(20, SSTR("status after acquiring the lock is: " << bucket_status.state));
+
+       yield call(new InitBucketFullSyncStatusCR(sc, sync_pair, status_obj,
+                                                 bucket_status, objv,
+                                                 sync_pipe.source_bucket_info,
+                                                 init_check_compat, info));
+
+        if (retcode < 0) {
+          tn->log(20, SSTR("ERROR: init full sync failed. error: " << retcode));
+          RELEASE_LOCK(bucket_lease_cr);
+          return set_cr_error(retcode);
+        }
+      }
+
+      assert(bucket_status.state == BucketSyncState::Incremental || 
+          bucket_status.state == BucketSyncState::Full);
+
+      if (bucket_status.state == BucketSyncState::Full) {
+        assert(bucket_lease_cr);
+        yield call(new RGWBucketFullSyncCR(sc, sync_pipe, status_obj,
+                                           bucket_lease_cr, bucket_status,
+                                           tn, objv));
+        if (retcode < 0) {
+          tn->log(20, SSTR("ERROR: full sync failed. error: " << retcode));
+          RELEASE_LOCK(bucket_lease_cr);
+          return set_cr_error(retcode);
+        }
+      }
+
+      if (bucket_status.state == BucketSyncState::Incremental) {
+        // lease not required for incremental sync
+        RELEASE_LOCK(bucket_lease_cr);
+
+        // if a specific gen was requested, compare that to the sync status
+        if (gen) {
+          const auto current_gen = bucket_status.incremental_gen;
+          if (*gen > current_gen) {
+            retcode = -EAGAIN;
+            tn->log(10, SSTR("ERROR: requested sync of future generation "
+                             << *gen << " > " << current_gen
+                             << ", returning " << retcode << " for later retry"));
+            return set_cr_error(retcode);
+          } else if (*gen < current_gen) {
+            tn->log(10, SSTR("WARNING: requested sync of past generation "
+                             << *gen << " < " << current_gen
+                             << ", returning success"));
+            return set_cr_done();
+          }
+        }
+
+        assert(sync_pair.source_bs.shard_id >= 0);
+        if (static_cast<std::size_t>(sync_pair.source_bs.shard_id) >= bucket_status.shards_done_with_gen.size()) {
+          tn->log(1, SSTR("bucket shard " << sync_pair.source_bs << " index out of bounds"));
+          return set_cr_done(); // return success so we don't retry
+        }
+        if (bucket_status.shards_done_with_gen[sync_pair.source_bs.shard_id]) {
+          tn->log(10, SSTR("bucket shard " << sync_pair.source_bs << " of gen " <<
+                          gen << " already synced."));
+          return set_cr_done();
+        }
+
+        yield call(new RGWSyncBucketShardCR(sc, data_lease_cr, sync_pair,
+                                            sync_pipe, bucket_stopped,
+                                            bucket_status.incremental_gen, tn, progress));
+        if (retcode < 0) {
+          tn->log(20, SSTR("ERROR: incremental sync failed. error: " << retcode));
+          return set_cr_error(retcode);
+        }
+      }
+      // loop back to previous states unless incremental sync returns normally
+    } while (bucket_status.state != BucketSyncState::Incremental || bucket_stopped);
+
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+int RGWBucketPipeSyncStatusManager::do_init(const DoutPrefixProvider *dpp,
+                                           std::ostream* ostr)
+{
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(this, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+
+  sync_module.reset(new RGWDefaultSyncModuleInstance());
+  auto async_rados = driver->svc()->rados->get_async_processor();
+
+  sync_env.init(this, driver->ctx(), driver,
+                driver->svc(), async_rados, &http_manager,
+                error_logger.get(), driver->getRados()->get_sync_tracer(),
+                sync_module, nullptr);
+
+  sync_env.ostr = ostr;
+
+  rgw_sync_pipe_info_set pipes;
+
+  ret = cr_mgr.run(dpp, new RGWGetBucketPeersCR(&sync_env,
+                                           dest_bucket,
+                                           source_zone,
+                                           source_bucket,
+                                           &pipes,
+                                           sync_env.sync_tracer->root_node));
+  if (ret < 0) {
+    ldpp_dout(this, 0) << "failed to get bucket source peers info: (ret=" << ret << "): " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  if (pipes.empty()) {
+    ldpp_dout(this, 0) << "No peers. This is not a valid multisite configuration." << dendl;
+    return -EINVAL;
+  }
+
+  for (auto& pipe : pipes) {
+    auto& szone = pipe.source.zone;
+
+    auto conn = driver->svc()->zone->get_zone_conn(szone);
+    if (!conn) {
+      ldpp_dout(this, 0) << "connection object to zone " << szone << " does not exist" << dendl;
+      return -EINVAL;
+    }
+
+    RGWZone* z;
+    if (!(z = driver->svc()->zone->find_zone(szone))) {
+      ldpp_dout(this, 0) << "zone " << szone << " does not exist" << dendl;
+      return -EINVAL;
+    }
+    sources.emplace_back(&sync_env, szone, conn,
+                        pipe.source.get_bucket_info(),
+                        pipe.target.get_bucket(),
+                        pipe.handler, z->name);
+  }
+
+  return 0;
+}
+
+int RGWBucketPipeSyncStatusManager::remote_info(const DoutPrefixProvider *dpp,
+                                               source& s,
+                                               uint64_t* oldest_gen,
+                                               uint64_t* latest_gen,
+                                               uint64_t* num_shards)
+{
+  rgw_bucket_index_marker_info remote_info;
+  BucketIndexShardsManager remote_markers;
+  auto r = rgw_read_remote_bilog_info(dpp, s.sc.conn, s.info.bucket,
+                                     remote_info, remote_markers,
+                                     null_yield);
+
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " rgw_read_remote_bilog_info: r="
+                     << r << dendl;
+    return r;
+  }
+  if (oldest_gen)
+    *oldest_gen = remote_info.oldest_gen;
+
+  if (latest_gen)
+    *latest_gen = remote_info.latest_gen;
+
+  if (num_shards)
+    *num_shards = remote_markers.get().size();
+
+  return 0;
+}
+
+tl::expected<std::unique_ptr<RGWBucketPipeSyncStatusManager>, int>
+RGWBucketPipeSyncStatusManager::construct(
+  const DoutPrefixProvider* dpp,
+  rgw::sal::RadosStore* driver,
+  std::optional<rgw_zone_id> source_zone,
+  std::optional<rgw_bucket> source_bucket,
+  const rgw_bucket& dest_bucket,
+  std::ostream* ostr)
+{
+  std::unique_ptr<RGWBucketPipeSyncStatusManager> self{
+    new RGWBucketPipeSyncStatusManager(driver, source_zone, source_bucket,
+                                      dest_bucket)};
+  auto r = self->do_init(dpp, ostr);
+  if (r < 0) {
+    return tl::unexpected(r);
+  }
+  return self;
+}
+
+int RGWBucketPipeSyncStatusManager::init_sync_status(
+  const DoutPrefixProvider *dpp)
+{
+  // Just running one at a time saves us from buildup/teardown and in
+  // practice we only do one zone at a time.
+  for (auto& source : sources) {
+    list<RGWCoroutinesStack*> stacks;
+    RGWCoroutinesStack *stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
+    pretty_print(source.sc.env, "Initializing sync state of bucket {} with zone {}.\n",
+                source.info.bucket.name, source.zone_name);
+    stack->call(new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
+                 dpp, source.sc.env->async_rados, source.sc.env->svc->sysobj,
+                 {sync_env.svc->zone->get_zone_params().log_pool,
+                   full_status_oid(source.sc.source_zone,
+                                  source.info.bucket,
+                                  source.dest)},
+                 rgw_bucket_sync_status{}));
+    stacks.push_back(stack);
+    auto r = cr_mgr.run(dpp, stacks);
+    if (r < 0) {
+      pretty_print(source.sc.env,
+                  "Initialization of sync state for bucket {} with zone {} "
+                  "failed with error {}\n",
+                  source.info.bucket.name, source.zone_name, cpp_strerror(r));
+    }
+  }
+  return 0;
+}
+
+tl::expected<std::map<int, rgw_bucket_shard_sync_info>, int>
+RGWBucketPipeSyncStatusManager::read_sync_status(
+  const DoutPrefixProvider *dpp)
+{
+  std::map<int, rgw_bucket_shard_sync_info> sync_status;
+  list<RGWCoroutinesStack *> stacks;
+
+  auto sz = sources.begin();
+
+  if (source_zone) {
+    sz = std::find_if(sources.begin(), sources.end(),
+                     [this](const source& s) {
+                       return s.sc.source_zone == *source_zone;
+                     }
+      );
+    if (sz == sources.end()) {
+      ldpp_dout(this, 0) << "ERROR: failed to find source zone: "
+                        << *source_zone << dendl;
+      return tl::unexpected(-ENOENT);
+    }
+  } else {
+    ldpp_dout(this, 5) << "No source zone specified, using source zone: "
+                      << sz->sc.source_zone << dendl;
+    return tl::unexpected(-ENOENT);
+  }
+  uint64_t num_shards, latest_gen;
+  auto ret = remote_info(dpp, *sz, nullptr, &latest_gen, &num_shards);
+  if (!ret) {
+    ldpp_dout(this, 5) << "Unable to get remote info: "
+                      << ret << dendl;
+    return tl::unexpected(ret);
+  }
+  auto stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
+  std::vector<rgw_bucket_sync_pair_info> pairs(num_shards);
+  for (auto shard = 0u; shard < num_shards; ++shard) {
+    auto& pair = pairs[shard];
+    pair.source_bs.bucket = sz->info.bucket;
+    pair.dest_bucket = sz->dest;
+    pair.source_bs.shard_id = shard;
+    stack->call(new RGWReadBucketPipeSyncStatusCoroutine(
+                 &sz->sc, pair, &sync_status[shard],
+                 nullptr, latest_gen));
+  }
+
+  stacks.push_back(stack);
+
+  ret = cr_mgr.run(dpp, stacks);
+  if (ret < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to read sync status for "
+                      << bucket_str{dest_bucket} << dendl;
+    return tl::unexpected(ret);
+  }
+
+  return sync_status;
+}
+
+namespace rgw::bucket_sync_run {
+// Retry-loop over calls to sync_bucket_shard_cr
+//
+// Runs sync for one bucket shard at a fixed generation, retrying on any
+// error except -ECANCELED, up to allowed_retries attempts. The retry budget
+// is reset whenever the shard's progress timestamp advances between
+// attempts.
+class ShardCR : public RGWCoroutine {
+  static constexpr auto allowed_retries = 10u;
+
+  RGWDataSyncCtx& sc;
+  // Reference to a pair owned by the enclosing GenCR; must outlive this CR.
+  const rgw_bucket_sync_pair_info& pair;
+  const uint64_t gen;
+  unsigned retries = 0;
+
+  // Progress timestamps from the previous and current attempts, compared to
+  // decide whether an attempt made progress.
+  ceph::real_time prev_progress;
+  ceph::real_time progress;
+
+public:
+
+  ShardCR(RGWDataSyncCtx& sc, const rgw_bucket_sync_pair_info& pair,
+         const uint64_t gen)
+    : RGWCoroutine(sc.cct), sc(sc), pair(pair), gen(gen) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      // Since all errors (except ECANCELED) are considered retryable,
+      // retry other errors so long as we're making progress.
+      // -EDOM here is just a nonzero sentinel so the loop body runs at
+      // least once.
+      for (retries = 0u, retcode = -EDOM;
+          (retries < allowed_retries) && (retcode != 0);
+          ++retries) {
+       ldpp_dout(dpp, 5) << "ShardCR: syncing bucket shard on: "
+                         << "zone=" << sc.source_zone
+                         << ", bucket=" << pair.source_bs.bucket.name
+                         << ", shard=" << pair.source_bs.shard_id
+                         << ", gen=" << gen
+                         << dendl;
+       // No lease is passed (nullptr): each attempt manages its own locking.
+       yield call(sync_bucket_shard_cr(&sc, nullptr, pair, gen,
+                                       sc.env->sync_tracer->root_node,
+                                       &progress));
+
+       if (retcode == -ECANCELED) {
+         // Not retryable: bail out immediately.
+         ldpp_dout(dpp, -1) << "ERROR: Got -ECANCELED for "
+                            << pair.source_bs << dendl;
+         drain_all();
+         return set_cr_error(retcode);
+       } else if (retcode < 0) {
+         // NOTE(review): log string is missing a space before "on retry".
+         ldpp_dout(dpp, 5) << "WARNING: Got error, retcode=" << retcode << " for "
+                           << pair.source_bs << "on retry "
+                           << retries + 1 << " of " << allowed_retries
+                           << " allowed" << dendl;
+         // Reset the retry counter if we made any progress
+         if (progress != prev_progress) {
+           retries = 0;
+         }
+         prev_progress = progress;
+       }
+      }
+      // Loop exhausted without success: surface the last error.
+      if (retcode < 0) {
+       ldpp_dout(dpp, -1) << "ERROR: Exhausted retries for "
+                          << pair.source_bs << " retcode="
+                          << retcode << dendl;
+       drain_all();
+       return set_cr_error(retcode);
+      }
+
+      drain_all();
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Loop over calls to ShardCR with limited concurrency
+class GenCR : public RGWShardCollectCR {
+  static constexpr auto MAX_CONCURRENT_SHARDS = 64;
+
+  RGWDataSyncCtx& sc;
+  const uint64_t gen;
+
+  std::vector<rgw_bucket_sync_pair_info> pairs;
+  decltype(pairs)::const_iterator iter;
+
+public:
+  GenCR(RGWDataSyncCtx& sc, const rgw_bucket& source, const rgw_bucket& dest,
+       const uint64_t gen, const uint64_t shards,
+       const RGWBucketSyncFlowManager::pipe_handler& handler)
+    : RGWShardCollectCR(sc.cct, MAX_CONCURRENT_SHARDS),
+      sc(sc), gen(gen) {
+    pairs.resize(shards);
+    for (auto shard = 0u; shard < shards; ++shard) {
+      auto& pair = pairs[shard];
+      pair.handler = handler;
+      pair.source_bs.bucket = source;
+      pair.dest_bucket = dest;
+      pair.source_bs.shard_id = shard;
+    }
+    iter = pairs.cbegin();
+    assert(pairs.size() == shards);
+  }
+
+  virtual bool spawn_next() override {
+    if (iter == pairs.cend()) {
+      return false;
+    }
+    spawn(new ShardCR(sc, *iter, gen), false);
+    ++iter;
+    return true;
+  }
+
+  int handle_result(int r) override {
+    if (r < 0) {
+      ldpp_dout(sc.env->dpp, 4) << "ERROR: Error syncing shard: "
+                               << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+};
+
+// Read sync status, loop over calls to GenCR
+//
+// Drives sync of one bucket from one source zone: reads the bucket-wide
+// sync status object, then repeatedly runs GenCR for the current
+// state/generation until an incremental pass finishes with the generation
+// unchanged. Returns immediately if the status is Stopped.
+class SourceCR : public RGWCoroutine {
+  RGWDataSyncCtx& sc;
+  const RGWBucketInfo& info;
+  const rgw_bucket& dest;
+  const RGWBucketSyncFlowManager::pipe_handler& handler;
+  // Bucket-wide sync status object in the source zone's log pool.
+  const rgw_raw_obj status_obj{
+    sc.env->svc->zone->get_zone_params().log_pool,
+    RGWBucketPipeSyncStatusManager::full_status_oid(sc.source_zone, info.bucket,
+                                                   dest)};
+
+  // Snapshot of the status taken at the top of each loop iteration, compared
+  // against a re-read at the bottom to decide whether to loop again.
+  BucketSyncState state = BucketSyncState::Incremental;
+  uint64_t gen = 0;
+  uint64_t num_shards = 0;
+  rgw_bucket_sync_status status;
+  std::string zone_name;
+
+public:
+
+  SourceCR(RGWDataSyncCtx& sc, const RGWBucketInfo& info,
+          const rgw_bucket& dest,
+          const RGWBucketSyncFlowManager::pipe_handler& handler,
+          const std::string& zone_name)
+    : RGWCoroutine(sc.cct), sc(sc), info(info), dest(dest), handler(handler),
+      zone_name(zone_name) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      // Get the source's status. In incremental sync, this gives us
+      // the generation and shard count that is next needed to be run.
+      yield call(new RGWSimpleRadosReadCR<rgw_bucket_sync_status>(
+                  dpp, sc.env->async_rados, sc.env->svc->sysobj,
+                  status_obj, &status));
+      if (retcode < 0) {
+       ldpp_dout(dpp, -1) << "ERROR: Unable to fetch status for zone="
+                          << sc.source_zone << " retcode="
+                          << retcode << dendl;
+       drain_all();
+       return set_cr_error(retcode);
+      }
+
+      if (status.state == BucketSyncState::Stopped) {
+       // Nothing to do.
+       pretty_print(sc.env, "Sync of bucket {} from source zone {} is in state Stopped. "
+                    "Nothing to do.\n", dest.name, zone_name);
+       ldpp_dout(dpp, 5) << "SourceCR: Bucket is in state Stopped, returning."
+                         << dendl;
+       drain_all();
+       return set_cr_done();
+      }
+
+      do {
+       // Snapshot what we're about to run so we can detect changes later.
+       state = status.state;
+       gen = status.incremental_gen;
+       num_shards = status.shards_done_with_gen.size();
+
+       ldpp_dout(dpp, 5) << "SourceCR: "
+                         << "state=" << state
+                         << ", gen=" << gen
+                         << ", num_shards=" << num_shards
+                         << dendl;
+
+       // Special case to handle full sync. Since full sync no longer
+       // uses shards and has no generations, we sync shard zero,
+       // though use the current generation so a following
+       // incremental sync can carry on.
+       if (state != BucketSyncState::Incremental) {
+         pretty_print(sc.env, "Beginning full sync of bucket {} from source zone {}.\n",
+                      dest.name, zone_name);
+         ldpp_dout(dpp, 5)  << "SourceCR: Calling GenCR with "
+                            << "gen=" << gen
+                            << ", num_shards=" << 1
+                            << dendl;
+         yield call(new GenCR(sc, info.bucket, dest, gen, 1, handler));
+       } else {
+         pretty_print(sc.env, "Beginning incremental sync of bucket {}, generation {} from source zone {}.\n",
+                      dest.name, gen, zone_name);
+         ldpp_dout(dpp, 5) << "SourceCR: Calling GenCR with "
+                           << "gen=" << gen
+                           << ", num_shards=" << num_shards
+                           << dendl;
+         yield call(new GenCR(sc, info.bucket, dest, gen, num_shards,
+                              handler));
+       }
+       if (retcode < 0) {
+         ldpp_dout(dpp, -1) << "ERROR: Giving up syncing from "
+                            << sc.source_zone << " retcode="
+                            << retcode << dendl;
+         drain_all();
+         return set_cr_error(retcode);
+       }
+
+       pretty_print(sc.env, "Completed.\n");
+
+       // Re-read the status to see whether the generation advanced while
+       // (or because) we were syncing.
+       yield call(new RGWSimpleRadosReadCR<rgw_bucket_sync_status>(
+                    dpp, sc.env->async_rados, sc.env->svc->sysobj,
+                    status_obj, &status));
+       if (retcode < 0) {
+         ldpp_dout(dpp, -1) << "ERROR: Unable to fetch status for zone="
+                            << sc.source_zone << " retcode="
+                            << retcode << dendl;
+         drain_all();
+         return set_cr_error(retcode);
+       }
+       // Repeat until we have done an incremental run and the
+       // generation remains unchanged.
+       ldpp_dout(dpp, 5) << "SourceCR: "
+                         << "state=" << state
+                         << ", gen=" << gen
+                         << ", num_shards=" << num_shards
+                         << ", status.state=" << status.state
+                         << ", status.incremental_gen=" << status.incremental_gen
+                         << ", status.shards_done_with_gen.size()=" << status.shards_done_with_gen.size()
+                         << dendl;
+      } while (state != BucketSyncState::Incremental ||
+              gen != status.incremental_gen);
+      drain_all();
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+} // namespace rgw::bucket_sync_run
+
+// Run one full sync pass for every configured source pipe. Each source
+// gets its own coroutine stack (all stacks run concurrently under cr_mgr);
+// returns the manager's aggregate result, <0 on failure.
+int RGWBucketPipeSyncStatusManager::run(const DoutPrefixProvider *dpp)
+{
+  list<RGWCoroutinesStack *> stacks;
+  for (auto& source : sources) {
+    // one SourceCR per source zone/bucket; the stack takes ownership
+    auto stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
+    stack->call(new rgw::bucket_sync_run::SourceCR(
+                 source.sc, source.info, source.dest, source.handler,
+                 source.zone_name));
+    stacks.push_back(stack);
+  }
+  auto ret = cr_mgr.run(dpp, stacks);
+  if (ret < 0) {
+    ldpp_dout(this, 0) << "ERROR: Sync unsuccessful on bucket "
+                      << bucket_str{dest_bucket} << dendl;
+  }
+  return ret;
+}
+
+// DoutPrefixProvider: log under this file's rgw dout subsystem.
+unsigned RGWBucketPipeSyncStatusManager::get_subsys() const
+{
+  return dout_subsys;
+}
+
+// DoutPrefixProvider: prefix log lines with an abbreviated source zone id
+// (first 8 chars; "*" when no specific source zone was configured) and the
+// destination bucket.
+std::ostream& RGWBucketPipeSyncStatusManager::gen_prefix(std::ostream& out) const
+{
+  auto zone = std::string_view{source_zone.value_or(rgw_zone_id("*")).id};
+  return out << "bucket sync zone:" << zone.substr(0, 8)
+    << " bucket:" << dest_bucket << ' ';
+}
+
+// Build the rados object name holding the full-sync status for a pipe:
+//   <prefix>.<source_zone>:<dest_bucket>[:<source_bucket>]
+// The source-bucket suffix is only added when source and dest differ.
+string RGWBucketPipeSyncStatusManager::full_status_oid(const rgw_zone_id& source_zone,
+                                                       const rgw_bucket& source_bucket,
+                                                       const rgw_bucket& dest_bucket)
+{
+  if (source_bucket == dest_bucket) {
+    return bucket_full_status_oid_prefix + "." + source_zone.id + ":"
+        + dest_bucket.get_key();
+  } else {
+    return bucket_full_status_oid_prefix + "." + source_zone.id + ":"
+        + dest_bucket.get_key() + ":" + source_bucket.get_key();
+  }
+}
+
+// Oid suffix for a log generation. Generation 0 yields an empty suffix,
+// presumably so gen-0 oids match the older, pre-generation naming scheme
+// — confirm against readers of these oids before changing.
+inline std::string generation_token(uint64_t gen) {
+  return (gen == 0) ? "" : (":" + std::to_string(gen));
+}
+
+// Build the rados object name holding the incremental-sync status for one
+// bucket shard:
+//   <prefix>.<source_zone>:[<dest_bucket>:]<source_bucket_shard>[:<gen>]
+// The dest-bucket component is only present when source != dest.
+string RGWBucketPipeSyncStatusManager::inc_status_oid(const rgw_zone_id& source_zone,
+                                                      const rgw_bucket_sync_pair_info& sync_pair,
+                                                      uint64_t gen)
+{
+  if (sync_pair.source_bs.bucket == sync_pair.dest_bucket) {
+    return bucket_status_oid_prefix + "." + source_zone.id + ":" + sync_pair.source_bs.get_key() + 
+      generation_token(gen);
+  } else {
+    return bucket_status_oid_prefix + "." + source_zone.id + ":" + sync_pair.dest_bucket.get_key() + ":" + sync_pair.source_bs.get_key() +
+      generation_token(gen);
+  }
+}
+
+// Build the rados object name tracking per-object sync status:
+//   <prefix>.<source_zone>:<source_bucket>[/<dest_bucket>]:<obj_name>:<instance>
+// The dest-bucket component is only added when the pipe crosses buckets.
+string RGWBucketPipeSyncStatusManager::obj_status_oid(const rgw_bucket_sync_pipe& sync_pipe,
+                                                      const rgw_zone_id& source_zone,
+                                                      const rgw::sal::Object* obj)
+{
+  string prefix = object_status_oid_prefix + "." + source_zone.id + ":" + obj->get_bucket()->get_key().get_key();
+  if (sync_pipe.source_bucket_info.bucket !=
+      sync_pipe.dest_bucket_info.bucket) {
+    prefix += string("/") + sync_pipe.dest_bucket_info.bucket.get_key();
+  }
+  return prefix + ":" + obj->get_name() + ":" + obj->get_instance();
+}
+
+// Fetch bucket-index log info for `bucket` from a remote zone via its
+// /admin/log REST endpoint, then parse the returned max_marker string into
+// per-shard markers. Returns 0 on success, <0 on HTTP/JSON/parse failure.
+int rgw_read_remote_bilog_info(const DoutPrefixProvider *dpp,
+                               RGWRESTConn* conn,
+                               const rgw_bucket& bucket,
+                               rgw_bucket_index_marker_info& info,
+                               BucketIndexShardsManager& markers,
+                               optional_yield y)
+{
+  const auto instance_key = bucket.get_key();
+  const rgw_http_param_pair params[] = {
+    { "type" , "bucket-index" },
+    { "bucket-instance", instance_key.c_str() },
+    { "info" , nullptr },
+    { nullptr, nullptr }
+  };
+  int r = conn->get_json_resource(dpp, "/admin/log/", params, y, info);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "failed to fetch remote log markers: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  // parse shard markers
+  r = markers.from_string(info.max_marker, -1);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "failed to decode remote log markers" << dendl;
+    return r;
+  }
+  return 0;
+}
+
+// Collect per-shard incremental sync status for a bucket pipe: spawns one
+// RGWReadBucketPipeSyncStatusCoroutine per output slot, reading at most
+// max_concurrent_shards at a time, walking shard ids 0..N-1.
+class RGWCollectBucketSyncStatusCR : public RGWShardCollectCR {
+  static constexpr int max_concurrent_shards = 16;
+  rgw::sal::RadosStore* const driver;
+  RGWDataSyncCtx *const sc;
+  RGWDataSyncEnv *const env;
+  const uint64_t gen;
+
+  rgw_bucket_sync_pair_info sync_pair;
+  using Vector = std::vector<rgw_bucket_shard_sync_info>;
+  Vector::iterator i, end;  // next output slot / one-past-last
+
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to read bucket shard sync status: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  // `status` must be pre-sized to the shard count and outlive this CR;
+  // results are written in shard order into *status.
+  RGWCollectBucketSyncStatusCR(rgw::sal::RadosStore* driver, RGWDataSyncCtx *sc,
+                               const rgw_bucket_sync_pair_info& sync_pair,
+                               uint64_t gen,
+                               Vector *status)
+    : RGWShardCollectCR(sc->cct, max_concurrent_shards),
+      driver(driver), sc(sc), env(sc->env), gen(gen), sync_pair(sync_pair),
+      i(status->begin()), end(status->end())
+  {}
+
+  bool spawn_next() override {
+    if (i == end) {
+      return false;
+    }
+    spawn(new RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &*i, nullptr, gen), false);
+    ++i;
+    // advance to the next source bucket shard for the next spawn
+    ++sync_pair.source_bs.shard_id;
+    return true;
+  }
+};
+
+// Read and decode the full-sync status object for a bucket pipe from the
+// zone's log pool. Returns 0 and fills *status on success; -EIO if the
+// stored blob fails to decode; other negative errors from the read.
+int rgw_read_bucket_full_sync_status(const DoutPrefixProvider *dpp,
+                                     rgw::sal::RadosStore *driver,
+                                     const rgw_sync_bucket_pipe& pipe,
+                                     rgw_bucket_sync_status *status,
+                                     optional_yield y)
+{
+  auto get_oid = RGWBucketPipeSyncStatusManager::full_status_oid;
+  const rgw_raw_obj obj{driver->svc()->zone->get_zone_params().log_pool,
+                        get_oid(*pipe.source.zone, *pipe.source.bucket, *pipe.dest.bucket)};
+
+  auto svc = driver->svc()->sysobj;
+  auto sysobj = svc->get_obj(obj);
+  bufferlist bl;
+  int ret = sysobj.rop().read(dpp, &bl, y);
+  if (ret < 0)
+    return ret;
+
+  try {
+    auto iter = bl.cbegin();
+    using ceph::decode;
+    // decode into a local first so *status is untouched on decode failure
+    rgw_bucket_sync_status result;
+    decode(result, iter);
+    *status = result;
+    return 0;
+  } catch (const buffer::error& err) {
+    lderr(svc->ctx()) << "error decoding " << obj << ": " << err.what() << dendl;
+    return -EIO;
+  }
+}
+
+// Read per-shard incremental sync status for a bucket pipe at generation
+// `gen`. Builds a throwaway sync env/ctx (no connection, null sync module)
+// and drives RGWCollectBucketSyncStatusCR to completion. *status must be
+// pre-sized to the shard count. Returns -EINVAL if the pipe is not fully
+// specified.
+int rgw_read_bucket_inc_sync_status(const DoutPrefixProvider *dpp,
+                                    rgw::sal::RadosStore *driver,
+                                    const rgw_sync_bucket_pipe& pipe,
+                                    uint64_t gen,
+                                    std::vector<rgw_bucket_shard_sync_info> *status)
+{
+  if (!pipe.source.zone ||
+      !pipe.source.bucket ||
+      !pipe.dest.zone ||
+      !pipe.dest.bucket) {
+    return -EINVAL;
+  }
+
+  rgw_bucket_sync_pair_info sync_pair;
+  sync_pair.source_bs.bucket = *pipe.source.bucket;
+  sync_pair.source_bs.shard_id = 0;
+  sync_pair.dest_bucket = *pipe.dest.bucket;
+
+  RGWDataSyncEnv env;
+  RGWSyncModuleInstanceRef module; // null sync module
+  env.init(dpp, driver->ctx(), driver, driver->svc(), driver->svc()->rados->get_async_processor(),
+           nullptr, nullptr, nullptr, module, nullptr);
+
+  RGWDataSyncCtx sc;
+  sc.init(&env, nullptr, *pipe.source.zone);
+
+  RGWCoroutinesManager crs(driver->ctx(), driver->getRados()->get_cr_registry());
+  return crs.run(dpp, new RGWCollectBucketSyncStatusCR(driver, &sc,
+                                                  sync_pair,
+                                                  gen,
+                                                  status));
+}
+
+// ceph-dencoder test instances; ownership of the raw pointers passes to
+// the caller (standard Ceph generate_test_instances convention).
+void rgw_data_sync_info::generate_test_instances(list<rgw_data_sync_info*>& o)
+{
+  auto info = new rgw_data_sync_info;
+  info->state = rgw_data_sync_info::StateBuildingFullSyncMaps;
+  info->num_shards = 8;
+  o.push_back(info);
+  o.push_back(new rgw_data_sync_info);  // default-constructed instance
+}
+
+// ceph-dencoder test instances; ownership of the raw pointers passes to
+// the caller (standard Ceph generate_test_instances convention).
+void rgw_data_sync_marker::generate_test_instances(list<rgw_data_sync_marker*>& o)
+{
+  auto marker = new rgw_data_sync_marker;
+  marker->state = rgw_data_sync_marker::IncrementalSync;
+  marker->marker = "01234";
+  marker->pos = 5;
+  o.push_back(marker);
+  o.push_back(new rgw_data_sync_marker);  // default-constructed instance
+}
+
+// ceph-dencoder test instance (markers are encoded separately, so a
+// default-constructed object is sufficient); caller takes ownership.
+void rgw_data_sync_status::generate_test_instances(list<rgw_data_sync_status*>& o)
+{
+  o.push_back(new rgw_data_sync_status);
+}
+
+// JSON-dump the full-sync marker (inverse of decode_json below).
+void rgw_bucket_shard_full_sync_marker::dump(Formatter *f) const
+{
+  encode_json("position", position, f);
+  encode_json("count", count, f);
+}
+
+// Parse the incremental marker from JSON (inverse of dump below).
+void rgw_bucket_shard_inc_sync_marker::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("position", position, obj);
+  JSONDecoder::decode_json("timestamp", timestamp, obj);
+}
+
+// JSON-dump the incremental marker (inverse of decode_json above).
+void rgw_bucket_shard_inc_sync_marker::dump(Formatter *f) const
+{
+  encode_json("position", position, f);
+  encode_json("timestamp", timestamp, f);
+}
+
+// Parse shard sync state from JSON. Unrecognized status strings fall back
+// to StateInit (inverse of dump()).
+void rgw_bucket_shard_sync_info::decode_json(JSONObj *obj)
+{
+  std::string s;
+  JSONDecoder::decode_json("status", s, obj);
+  if (s == "full-sync") {
+    state = StateFullSync;
+  } else if (s == "incremental-sync") {
+    state = StateIncrementalSync;
+  } else if (s == "stopped") {
+    state = StateStopped;
+  } else {
+    state = StateInit;
+  }
+  JSONDecoder::decode_json("inc_marker", inc_marker, obj);
+}
+
+// Parse the full-sync marker from JSON (inverse of dump above).
+void rgw_bucket_shard_full_sync_marker::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("position", position, obj);
+  JSONDecoder::decode_json("count", count, obj);
+}
+
+// JSON-dump shard sync state; the status strings here must stay in sync
+// with the ones accepted by decode_json() above.
+void rgw_bucket_shard_sync_info::dump(Formatter *f) const
+{
+  const char *s{nullptr};
+  switch ((SyncState)state) {
+    case StateInit:
+    s = "init";
+    break;
+  case StateFullSync:
+    s = "full-sync";
+    break;
+  case StateIncrementalSync:
+    s = "incremental-sync";
+    break;
+  case StateStopped:
+    s = "stopped";
+    break;
+  default:
+    s = "unknown";
+    break;
+  }
+  encode_json("status", s, f);
+  encode_json("inc_marker", inc_marker, f);
+}
+
+// Parse full-sync progress from JSON (inverse of dump below).
+void rgw_bucket_full_sync_status::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("position", position, obj);
+  JSONDecoder::decode_json("count", count, obj);
+}
+
+// JSON-dump full-sync progress (inverse of decode_json above).
+void rgw_bucket_full_sync_status::dump(Formatter *f) const
+{
+  encode_json("position", position, f);
+  encode_json("count", count, f);
+}
+
+// JSON-encode a BucketSyncState as a human-readable status string; must
+// stay in sync with decode_json_obj() below.
+void encode_json(const char *name, BucketSyncState state, Formatter *f)
+{
+  switch (state) {
+  case BucketSyncState::Init:
+    encode_json(name, "init", f);
+    break;
+  case BucketSyncState::Full:
+    encode_json(name, "full-sync", f);
+    break;
+  case BucketSyncState::Incremental:
+    encode_json(name, "incremental-sync", f);
+    break;
+  case BucketSyncState::Stopped:
+    encode_json(name, "stopped", f);
+    break;
+  default:
+    encode_json(name, "unknown", f);
+    break;
+  }
+}
+
+// JSON-decode a BucketSyncState; unknown strings (including "init" and
+// "unknown") fall back to Init. Inverse of encode_json() above.
+void decode_json_obj(BucketSyncState& state, JSONObj *obj)
+{
+  std::string s;
+  decode_json_obj(s, obj);
+  if (s == "full-sync") {
+    state = BucketSyncState::Full;
+  } else if (s == "incremental-sync") {
+    state = BucketSyncState::Incremental;
+  } else if (s == "stopped") {
+    state = BucketSyncState::Stopped;
+  } else {
+    state = BucketSyncState::Init;
+  }
+}
+
+// Parse bucket sync status from JSON (inverse of dump below). Note that
+// shards_done_with_gen is not part of the JSON representation.
+void rgw_bucket_sync_status::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("state", state, obj);
+  JSONDecoder::decode_json("full", full, obj);
+  JSONDecoder::decode_json("incremental_gen", incremental_gen, obj);
+}
+
+// JSON-dump bucket sync status (inverse of decode_json above).
+void rgw_bucket_sync_status::dump(Formatter *f) const
+{
+  encode_json("state", state, f);
+  encode_json("full", full, f);
+  encode_json("incremental_gen", incremental_gen, f);
+}
+
+
+// JSON-dump the combined bucket + per-shard incremental status.
+void bilog_status_v2::dump(Formatter *f) const
+{
+  encode_json("sync_status", sync_status, f);
+  encode_json("inc_status", inc_status, f);
+}
+
+// Parse the combined bucket + per-shard incremental status from JSON.
+void bilog_status_v2::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("sync_status", sync_status, obj);
+  JSONDecoder::decode_json("inc_status", inc_status, obj);
+}
diff --git a/src/rgw/driver/rados/rgw_data_sync.h b/src/rgw/driver/rados/rgw_data_sync.h
new file mode 100644 (file)
index 0000000..6cc714d
--- /dev/null
@@ -0,0 +1,823 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_DATA_SYNC_H
+#define CEPH_RGW_DATA_SYNC_H
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "include/encoding.h"
+
+#include "common/ceph_json.h"
+#include "common/likely.h"
+
+#include "rgw_coroutine.h"
+#include "rgw_http_client.h"
+#include "rgw_sal_rados.h"
+
+#include "rgw_datalog.h"
+#include "rgw_sync.h"
+#include "rgw_sync_module.h"
+#include "rgw_sync_trace.h"
+#include "rgw_sync_policy.h"
+
+#include "rgw_bucket_sync.h"
+
+// represents an obligation to sync an entry up a given time
+struct rgw_data_sync_obligation {
+  rgw_bucket_shard bs;
+  std::optional<uint64_t> gen;
+  std::string marker;
+  ceph::real_time timestamp;
+  bool retry = false;
+};
+
+// Debug/log formatting; optional fields are only printed when set.
+inline std::ostream& operator<<(std::ostream& out, const rgw_data_sync_obligation& o) {
+  out << "key=" << o.bs;
+  if (o.gen) {
+    out << '[' << *o.gen << ']';
+  }
+  if (!o.marker.empty()) {
+    out << " marker=" << o.marker;
+  }
+  if (o.timestamp != ceph::real_time{}) {
+    out << " timestamp=" << o.timestamp;
+  }
+  if (o.retry) {
+    out << " retry";
+  }
+  return out;
+}
+
+class JSONObj;
+struct rgw_sync_bucket_pipe;
+
+// Identifies one sync pipe endpoint pair: a source bucket shard and the
+// destination bucket it syncs into.
+struct rgw_bucket_sync_pair_info {
+  RGWBucketSyncFlowManager::pipe_handler handler; /* responsible for sync filters */
+  rgw_bucket_shard source_bs;
+  rgw_bucket dest_bucket;
+};
+
+// Debug/log formatting; collapses to just the source shard when source and
+// destination buckets are the same.
+inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_sync_pair_info& p) {
+  if (p.source_bs.bucket == p.dest_bucket) {
+    return out << p.source_bs;
+  }
+  return out << p.source_bs << "->" << p.dest_bucket;
+}
+
+// A resolved sync pipe: the pair info plus cached bucket metadata (info and
+// xattrs) for both endpoints.
+struct rgw_bucket_sync_pipe {
+  rgw_bucket_sync_pair_info info;
+  RGWBucketInfo source_bucket_info;
+  std::map<std::string, bufferlist> source_bucket_attrs;
+  RGWBucketInfo dest_bucket_info;
+  std::map<std::string, bufferlist> dest_bucket_attrs;
+
+  // sync-filter rules attached to this pipe's handler
+  RGWBucketSyncFlowManager::pipe_rules_ref& get_rules() {
+    return info.handler.rules;
+  }
+};
+
+// Debug/log formatting; delegates to the pair-info printer.
+inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_sync_pipe& p) {
+  return out << p.info;
+}
+
+// Shard count of a (remote) datalog, as returned by the log-info REST call.
+struct rgw_datalog_info {
+  uint32_t num_shards;
+
+  rgw_datalog_info() : num_shards(0) {}
+
+  void decode_json(JSONObj *obj);
+};
+
+// Top-level data-sync state for a source zone: current sync phase plus the
+// datalog shard count. Encoded v2 adds instance_id.
+struct rgw_data_sync_info {
+  enum SyncState {
+    StateInit = 0,
+    StateBuildingFullSyncMaps = 1,
+    StateSync = 2,
+  };
+
+  uint16_t state;
+  uint32_t num_shards;
+
+  uint64_t instance_id{0};
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(state, bl);
+    encode(num_shards, bl);
+    encode(instance_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START(2, bl);
+     decode(state, bl);
+     decode(num_shards, bl);
+     // instance_id only present from v2 on
+     if (struct_v >= 2) {
+       decode(instance_id, bl);
+     }
+     DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const {
+    std::string s;
+    switch ((SyncState)state) {
+      case StateInit:
+       s = "init";
+       break;
+      case StateBuildingFullSyncMaps:
+       s = "building-full-sync-maps";
+       break;
+      case StateSync:
+       s = "sync";
+       break;
+      default:
+       s = "unknown";
+       break;
+    }
+    encode_json("status", s, f);
+    encode_json("num_shards", num_shards, f);
+    encode_json("instance_id", instance_id, f);
+  }
+  // Inverse of dump(); unknown status strings fall back to StateInit.
+  void decode_json(JSONObj *obj) {
+    std::string s;
+    JSONDecoder::decode_json("status", s, obj);
+    if (s == "building-full-sync-maps") {
+      state = StateBuildingFullSyncMaps;
+    } else if (s == "sync") {
+      state = StateSync;
+    } else {
+      state = StateInit;
+    }
+    JSONDecoder::decode_json("num_shards", num_shards, obj);
+    JSONDecoder::decode_json("instance_id", instance_id, obj);
+  }
+  static void generate_test_instances(std::list<rgw_data_sync_info*>& o);
+
+  rgw_data_sync_info() : state((int)StateInit), num_shards(0) {}
+};
+WRITE_CLASS_ENCODER(rgw_data_sync_info)
+
+// Per-shard data-sync progress marker: current phase (full vs incremental),
+// position within the log, and bookkeeping counters.
+struct rgw_data_sync_marker {
+  enum SyncState {
+    FullSync = 0,
+    IncrementalSync = 1,
+  };
+  uint16_t state;
+  std::string marker;            // current position
+  std::string next_step_marker;  // where the next phase resumes from
+  uint64_t total_entries;
+  uint64_t pos;
+  real_time timestamp;
+
+  rgw_data_sync_marker() : state(FullSync), total_entries(0), pos(0) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(state, bl);
+    encode(marker, bl);
+    encode(next_step_marker, bl);
+    encode(total_entries, bl);
+    encode(pos, bl);
+    encode(timestamp, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START(1, bl);
+    decode(state, bl);
+    decode(marker, bl);
+    decode(next_step_marker, bl);
+    decode(total_entries, bl);
+    decode(pos, bl);
+    decode(timestamp, bl);
+     DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const {
+    const char *s{nullptr};
+    switch ((SyncState)state) {
+      case FullSync:
+        s = "full-sync";
+        break;
+      case IncrementalSync:
+        s = "incremental-sync";
+        break;
+      default:
+        s = "unknown";
+        break;
+    }
+    encode_json("status", s, f);
+    encode_json("marker", marker, f);
+    encode_json("next_step_marker", next_step_marker, f);
+    encode_json("total_entries", total_entries, f);
+    encode_json("pos", pos, f);
+    encode_json("timestamp", utime_t(timestamp), f);
+  }
+  // Inverse of dump(); an unrecognized status string leaves state unchanged.
+  void decode_json(JSONObj *obj) {
+    std::string s;
+    JSONDecoder::decode_json("status", s, obj);
+    if (s == "full-sync") {
+      state = FullSync;
+    } else if (s == "incremental-sync") {
+      state = IncrementalSync;
+    }
+    JSONDecoder::decode_json("marker", marker, obj);
+    JSONDecoder::decode_json("next_step_marker", next_step_marker, obj);
+    JSONDecoder::decode_json("total_entries", total_entries, obj);
+    JSONDecoder::decode_json("pos", pos, obj);
+    utime_t t;
+    JSONDecoder::decode_json("timestamp", t, obj);
+    timestamp = t.to_real_time();
+  }
+  static void generate_test_instances(std::list<rgw_data_sync_marker*>& o);
+};
+WRITE_CLASS_ENCODER(rgw_data_sync_marker)
+
+// Aggregate data-sync status: overall info plus per-shard markers. Only the
+// info is part of this object's wire encoding; markers are stored/encoded
+// separately.
+struct rgw_data_sync_status {
+  rgw_data_sync_info sync_info;
+  std::map<uint32_t, rgw_data_sync_marker> sync_markers;
+
+  rgw_data_sync_status() {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(sync_info, bl);
+    /* sync markers are encoded separately */
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START(1, bl);
+    decode(sync_info, bl);
+    /* sync markers are decoded separately */
+     DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const {
+    encode_json("info", sync_info, f);
+    encode_json("markers", sync_markers, f);
+  }
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("info", sync_info, obj);
+    JSONDecoder::decode_json("markers", sync_markers, obj);
+  }
+  static void generate_test_instances(std::list<rgw_data_sync_status*>& o);
+};
+WRITE_CLASS_ENCODER(rgw_data_sync_status)
+
+// One datalog entry as returned by the remote log REST API.
+struct rgw_datalog_entry {
+  std::string key;
+  ceph::real_time timestamp;
+
+  void decode_json(JSONObj *obj);
+};
+
+// One page of datalog entries for a shard, with the continuation marker and
+// truncation flag from the remote listing.
+struct rgw_datalog_shard_data {
+  std::string marker;
+  bool truncated;
+  std::vector<rgw_datalog_entry> entries;
+
+  void decode_json(JSONObj *obj);
+};
+
+class RGWAsyncRadosProcessor;
+class RGWDataSyncControlCR;
+
+// Owner (id + display name) of a bucket index entry, parsed from JSON.
+struct rgw_bucket_entry_owner {
+  std::string id;
+  std::string display_name;
+
+  rgw_bucket_entry_owner() {}
+  rgw_bucket_entry_owner(const std::string& _id, const std::string& _display_name) : id(_id), display_name(_display_name) {}
+
+  void decode_json(JSONObj *obj);
+};
+
+class RGWSyncErrorLogger;
+class RGWRESTConn;
+class RGWServices;
+
+// Shared, non-owning environment for data-sync coroutines: store/service
+// handles, async processor, HTTP manager, error/trace/perf hooks. All
+// pointers are borrowed; callers keep them alive for the env's lifetime.
+struct RGWDataSyncEnv {
+  const DoutPrefixProvider *dpp{nullptr};
+  CephContext *cct{nullptr};
+  rgw::sal::RadosStore* driver{nullptr};
+  RGWServices *svc{nullptr};
+  RGWAsyncRadosProcessor *async_rados{nullptr};
+  RGWHTTPManager *http_manager{nullptr};
+  RGWSyncErrorLogger *error_logger{nullptr};
+  RGWSyncTraceManager *sync_tracer{nullptr};
+  RGWSyncModuleInstanceRef sync_module{nullptr};
+  PerfCounters* counters{nullptr};
+
+  RGWDataSyncEnv() {}
+
+  // Populate all members in one shot; any of the pointer args may be null
+  // (see rgw_read_bucket_inc_sync_status for a minimal env).
+  void init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _driver, RGWServices *_svc,
+            RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
+            RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer,
+            RGWSyncModuleInstanceRef& _sync_module,
+            PerfCounters* _counters) {
+     dpp = _dpp;
+    cct = _cct;
+    driver = _driver;
+    svc = _svc;
+    async_rados = _async_rados;
+    http_manager = _http_manager;
+    error_logger = _error_logger;
+    sync_tracer = _sync_tracer;
+    sync_module = _sync_module;
+    counters = _counters;
+  }
+
+  std::string shard_obj_name(int shard_id);
+  std::string status_oid();
+
+  std::ostream* ostr{nullptr}; // For pretty printing progress
+};
+
+// pretty ostream output for `radosgw-admin bucket sync run`
+// No-op unless env->ostr is set; flushes after each message so progress is
+// visible immediately.
+template<typename ...T>
+void pretty_print(const RGWDataSyncEnv* env, T&& ...t) {
+  if (unlikely(!!env->ostr)) {
+    fmt::print(*env->ostr, std::forward<T>(t)...);
+    env->ostr->flush();
+  }
+}
+
+// Per-source-zone sync context: the shared env plus the REST connection and
+// zone id of the peer being synced from. conn may be null (local-only use).
+struct RGWDataSyncCtx {
+  RGWDataSyncEnv *env{nullptr};
+  CephContext *cct{nullptr};
+
+  RGWRESTConn *conn{nullptr};
+  rgw_zone_id source_zone;
+
+  RGWDataSyncCtx() = default;
+
+  RGWDataSyncCtx(RGWDataSyncEnv* env,
+                RGWRESTConn* conn,
+                const rgw_zone_id& source_zone)
+    : env(env), cct(env->cct), conn(conn), source_zone(source_zone) {}
+
+  // Two-phase initialization; mirrors the converting constructor above.
+  void init(RGWDataSyncEnv *_env,
+            RGWRESTConn *_conn,
+            const rgw_zone_id& _source_zone) {
+    cct = _env->cct;
+    env = _env;
+    conn = _conn;
+    source_zone = _source_zone;
+  }
+};
+
+class RGWRados;
+
+// Driver for syncing the datalog of one remote zone: owns the coroutine
+// manager state, sync env/ctx, and exposes read/init/run entry points used
+// by RGWDataSyncStatusManager.
+class RGWRemoteDataLog : public RGWCoroutinesManager {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* driver;
+  CephContext *cct;
+  RGWCoroutinesManagerRegistry *cr_registry;
+  RGWAsyncRadosProcessor *async_rados;
+  RGWHTTPManager http_manager;
+
+  RGWDataSyncEnv sync_env;
+  RGWDataSyncCtx sc;
+
+  // guards data_sync_cr against concurrent run/wakeup/finish
+  ceph::shared_mutex lock = ceph::make_shared_mutex("RGWRemoteDataLog::lock");
+  RGWDataSyncControlCR *data_sync_cr;
+
+  RGWSyncTraceNodeRef tn;
+
+  bool initialized;
+
+public:
+  RGWRemoteDataLog(const DoutPrefixProvider *dpp,
+                   rgw::sal::RadosStore* _store,
+                   RGWAsyncRadosProcessor *async_rados);
+  int init(const rgw_zone_id& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger,
+           RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& module,
+           PerfCounters* _counters);
+  void finish();
+
+  int read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info);
+  int read_source_log_shards_info(const DoutPrefixProvider *dpp, std::map<int, RGWDataChangesLogInfo> *shards_info);
+  int read_source_log_shards_next(const DoutPrefixProvider *dpp, std::map<int, std::string> shard_markers, std::map<int, rgw_datalog_shard_data> *result);
+  int read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status);
+  int read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, std::set<int>& recovering_shards);
+  int read_shard_status(const DoutPrefixProvider *dpp, int shard_id, std::set<std::string>& lagging_buckets,std::set<std::string>& recovering_buckets, rgw_data_sync_marker* sync_marker, const int max_entries);
+  int init_sync_status(const DoutPrefixProvider *dpp, int num_shards);
+  int run_sync(const DoutPrefixProvider *dpp, int num_shards);
+
+  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries);
+};
+
+// High-level facade over RGWRemoteDataLog for one source zone: owns the
+// source log plus its connection/error-logger/sync-module and forwards the
+// read/run/wakeup operations. Also a DoutPrefixProvider for log prefixes.
+class RGWDataSyncStatusManager : public DoutPrefixProvider {
+  rgw::sal::RadosStore* driver;
+
+  rgw_zone_id source_zone;
+  RGWRESTConn *conn;
+  RGWSyncErrorLogger *error_logger;
+  RGWSyncModuleInstanceRef sync_module;
+  PerfCounters* counters;
+
+  RGWRemoteDataLog source_log;
+
+  std::string source_status_oid;
+  std::string source_shard_status_oid_prefix;
+
+  std::map<int, rgw_raw_obj> shard_objs;
+
+  int num_shards;
+
+public:
+  RGWDataSyncStatusManager(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
+                           const rgw_zone_id& _source_zone, PerfCounters* counters)
+    : driver(_driver), source_zone(_source_zone), conn(NULL), error_logger(NULL),
+      sync_module(nullptr), counters(counters),
+      source_log(this, driver, async_rados), num_shards(0) {}
+  // Variant that supplies an explicit sync-module instance.
+  RGWDataSyncStatusManager(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
+                           const rgw_zone_id& _source_zone, PerfCounters* counters,
+                           const RGWSyncModuleInstanceRef& _sync_module)
+    : driver(_driver), source_zone(_source_zone), conn(NULL), error_logger(NULL),
+      sync_module(_sync_module), counters(counters),
+      source_log(this, driver, async_rados), num_shards(0) {}
+  ~RGWDataSyncStatusManager() {
+    finalize();
+  }
+  int init(const DoutPrefixProvider *dpp);
+  void finalize();
+
+  static std::string shard_obj_name(const rgw_zone_id& source_zone, int shard_id);
+  static std::string sync_status_oid(const rgw_zone_id& source_zone);
+
+  // Thin forwarders into source_log; see RGWRemoteDataLog for semantics.
+  int read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status) {
+    return source_log.read_sync_status(dpp, sync_status);
+  }
+
+  int read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, std::set<int>& recovering_shards) {
+    return source_log.read_recovering_shards(dpp, num_shards, recovering_shards);
+  }
+
+  int read_shard_status(const DoutPrefixProvider *dpp, int shard_id, std::set<std::string>& lagging_buckets, std::set<std::string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries) {
+    return source_log.read_shard_status(dpp, shard_id, lagging_buckets, recovering_buckets,sync_marker, max_entries);
+  }
+  int init_sync_status(const DoutPrefixProvider *dpp) { return source_log.init_sync_status(dpp, num_shards); }
+
+  int read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info) {
+    return source_log.read_log_info(dpp, log_info);
+  }
+  int read_source_log_shards_info(const DoutPrefixProvider *dpp, std::map<int, RGWDataChangesLogInfo> *shards_info) {
+    return source_log.read_source_log_shards_info(dpp, shards_info);
+  }
+  int read_source_log_shards_next(const DoutPrefixProvider *dpp, std::map<int, std::string> shard_markers, std::map<int, rgw_datalog_shard_data> *result) {
+    return source_log.read_source_log_shards_next(dpp, shard_markers, result);
+  }
+
+  int run(const DoutPrefixProvider *dpp) { return source_log.run_sync(dpp, num_shards); }
+
+  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) { return source_log.wakeup(shard_id, entries); }
+
+  void stop() {
+    source_log.finish();
+  }
+
+  // implements DoutPrefixProvider
+  CephContext *get_cct() const override;
+  unsigned get_subsys() const override;
+  std::ostream& gen_prefix(std::ostream& out) const override;
+};
+
+class RGWBucketPipeSyncStatusManager;
+class RGWBucketSyncCR;
+
+// Progress marker for a bucket shard's full-sync phase: last-synced object
+// key and a count of entries processed.
+struct rgw_bucket_shard_full_sync_marker {
+  rgw_obj_key position;
+  uint64_t count;
+
+  rgw_bucket_shard_full_sync_marker() : count(0) {}
+
+  void encode_attr(std::map<std::string, bufferlist>& attrs);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(position, bl);
+    encode(count, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START(1, bl);
+    decode(position, bl);
+    decode(count, bl);
+     DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_shard_full_sync_marker)
+
+// Progress marker for a bucket shard's incremental-sync phase: bilog
+// position string plus a timestamp (added in encoding v2).
+struct rgw_bucket_shard_inc_sync_marker {
+  std::string position;
+  ceph::real_time timestamp;
+
+  void encode_attr(std::map<std::string, bufferlist>& attrs);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(position, bl);
+    encode(timestamp, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(position, bl);
+    // timestamp only present from v2 on
+    if (struct_v >= 2) {
+      decode(timestamp, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_shard_inc_sync_marker)
+
+// Sync state of a single bucket shard: current phase plus its incremental
+// marker. Encoding v1 also carried a full-sync marker, which v2 dropped;
+// decode() skips it when reading old data.
+struct rgw_bucket_shard_sync_info {
+  enum SyncState {
+    StateInit = 0,
+    StateFullSync = 1,
+    StateIncrementalSync = 2,
+    StateStopped = 3,
+  };
+
+  uint16_t state;
+  rgw_bucket_shard_inc_sync_marker inc_marker;
+
+  void decode_from_attrs(CephContext *cct, std::map<std::string, bufferlist>& attrs);
+  void encode_all_attrs(std::map<std::string, bufferlist>& attrs);
+  void encode_state_attr(std::map<std::string, bufferlist>& attrs);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(state, bl);
+    encode(inc_marker, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START(2, bl);
+     decode(state, bl);
+     // v1 encoded a full-sync marker here; decode and discard it
+     if (struct_v <= 1) {
+       rgw_bucket_shard_full_sync_marker full_marker;
+       decode(full_marker, bl);
+     }
+     decode(inc_marker, bl);
+     DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+
+  rgw_bucket_shard_sync_info() : state((int)StateInit) {}
+
+};
+WRITE_CLASS_ENCODER(rgw_bucket_shard_sync_info)
+
+// Bucket-wide full-sync progress: last-synced object key and entry count.
+struct rgw_bucket_full_sync_status {
+  rgw_obj_key position;
+  uint64_t count = 0;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(position, bl);
+    encode(count, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(position, bl);
+    decode(count, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_full_sync_status)
+
+// Coarse bucket-wide sync state machine.  Encoded as uint8_t on the wire,
+// so existing values must not be renumbered.
+enum class BucketSyncState : uint8_t {
+  Init = 0,
+  Full,
+  Incremental,
+  Stopped,
+};
+// Render a BucketSyncState as its lowercase display name.  Values outside
+// the known enumerators (possible when decoding from a newer peer) print
+// nothing, matching the original switch with no default case.
+inline std::ostream& operator<<(std::ostream& out, const BucketSyncState& s) {
+  if (s == BucketSyncState::Init) {
+    out << "init";
+  } else if (s == BucketSyncState::Full) {
+    out << "full";
+  } else if (s == BucketSyncState::Incremental) {
+    out << "incremental";
+  } else if (s == BucketSyncState::Stopped) {
+    out << "stopped";
+  }
+  return out;
+}
+
+// JSON (de)serialization of BucketSyncState, using the same lowercase names
+// produced by the stream operator.
+void encode_json(const char *name, BucketSyncState state, Formatter *f);
+void decode_json_obj(BucketSyncState& state, JSONObj *obj);
+
+// Bucket-wide sync status: overall state, full-sync progress, and which
+// shards have finished the current incremental generation.
+struct rgw_bucket_sync_status {
+  BucketSyncState state = BucketSyncState::Init;
+  rgw_bucket_full_sync_status full;
+  uint64_t incremental_gen = 0;          // bilog generation being consumed
+  std::vector<bool> shards_done_with_gen; // per-shard completion flags
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(state, bl);
+    encode(full, bl);
+    encode(incremental_gen, bl);
+    encode(shards_done_with_gen, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(state, bl);
+    decode(full, bl);
+    if (struct_v > 1) {
+      // generation tracking was added in v2; v1 decodes keep the defaults
+      decode(incremental_gen, bl);
+      decode(shards_done_with_gen, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_sync_status)
+
+// Combined bilog status report: bucket-wide status plus per-shard
+// incremental status.  JSON-only (no rados encoding).
+struct bilog_status_v2 {
+  rgw_bucket_sync_status sync_status;
+  std::vector<rgw_bucket_shard_sync_info> inc_status;
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+
+// One bilog generation and its shard count, as exchanged over the REST
+// admin API (JSON only).
+struct store_gen_shards {
+  uint64_t gen = 0;
+  uint32_t num_shards = 0;
+
+  void dump(Formatter *f) const {
+    encode_json("gen", gen, f);
+    encode_json("num_shards", num_shards, f);
+  }
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("gen", gen, obj);
+    JSONDecoder::decode_json("num_shards", num_shards, obj);
+  }
+};
+
+// Bucket-index log status fetched from a remote zone: version markers,
+// whether sync was stopped, and the range of available bilog generations.
+// Decode-only; populated from a remote REST response.
+struct rgw_bucket_index_marker_info {
+  std::string bucket_ver;
+  std::string master_ver;
+  std::string max_marker;
+  bool syncstopped{false};
+  uint64_t oldest_gen = 0;
+  uint64_t latest_gen = 0;
+  std::vector<store_gen_shards> generations;
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("bucket_ver", bucket_ver, obj);
+    JSONDecoder::decode_json("master_ver", master_ver, obj);
+    JSONDecoder::decode_json("max_marker", max_marker, obj);
+    JSONDecoder::decode_json("syncstopped", syncstopped, obj);
+    JSONDecoder::decode_json("oldest_gen", oldest_gen, obj);
+    JSONDecoder::decode_json("latest_gen", latest_gen, obj);
+    JSONDecoder::decode_json("generations", generations, obj);
+  }
+};
+
+
+class BucketIndexShardsManager;
+
+// Fetch the remote zone's bucket-index log status for `bucket` over `conn`,
+// filling `info` (generations/markers) and the per-shard `markers`.
+// Returns 0 on success or a negative error code.
+int rgw_read_remote_bilog_info(const DoutPrefixProvider *dpp,
+                               RGWRESTConn* conn,
+                               const rgw_bucket& bucket,
+                               rgw_bucket_index_marker_info& info,
+                               BucketIndexShardsManager& markers,
+                               optional_yield y);
+
+// Drives bucket-sync status operations (init/read/run) for one destination
+// bucket across one or more sync sources.  Implements DoutPrefixProvider so
+// all of its operations log with a common prefix.  Construct via the
+// static construct() factory, which performs the fallible setup.
+class RGWBucketPipeSyncStatusManager : public DoutPrefixProvider {
+  rgw::sal::RadosStore* driver;
+
+  RGWDataSyncEnv sync_env;
+
+  // NOTE: the default member initializers below read `driver`; `driver` is
+  // declared first and set in the ctor init-list, so declaration order
+  // keeps this well-defined.
+  RGWCoroutinesManager cr_mgr{driver->ctx(),
+                              driver->getRados()->get_cr_registry()};
+
+  RGWHTTPManager http_manager{driver->ctx(), cr_mgr.get_completion_mgr()};
+
+  // empty optionals mean "all zones" / "same-name source bucket"
+  std::optional<rgw_zone_id> source_zone;
+  std::optional<rgw_bucket> source_bucket;
+
+  std::unique_ptr<RGWSyncErrorLogger> error_logger =
+    std::make_unique<RGWSyncErrorLogger>(driver, RGW_SYNC_ERROR_LOG_SHARD_PREFIX,
+                                        ERROR_LOGGER_SHARDS);
+  RGWSyncModuleInstanceRef sync_module;
+
+  rgw_bucket dest_bucket;
+
+  // One resolved sync source: its data-sync context, bucket info, and the
+  // pipe handler that matched it.
+  struct source {
+    RGWDataSyncCtx sc;
+    RGWBucketInfo info;
+    rgw_bucket dest;
+    RGWBucketSyncFlowManager::pipe_handler handler;
+    std::string zone_name;
+
+    source(RGWDataSyncEnv* env, const rgw_zone_id& zone, RGWRESTConn* conn,
+          const RGWBucketInfo& info, const rgw_bucket& dest,
+          const RGWBucketSyncFlowManager::pipe_handler& handler,
+          const std::string& zone_name)
+      : sc(env, conn, zone), info(info), dest(dest), handler(handler),
+       zone_name(zone_name) {}
+  };
+  std::vector<source> sources;
+
+  // Fallible second-phase setup, invoked by construct().
+  int do_init(const DoutPrefixProvider *dpp, std::ostream* ostr);
+  RGWBucketPipeSyncStatusManager(rgw::sal::RadosStore* driver,
+                                std::optional<rgw_zone_id> source_zone,
+                                std::optional<rgw_bucket> source_bucket,
+                                const rgw_bucket& dest_bucket)
+    : driver(driver), source_zone(source_zone), source_bucket(source_bucket),
+      dest_bucket(dest_bucket) {}
+
+  // Query a source for its bilog generation range and shard count.
+  int remote_info(const DoutPrefixProvider *dpp, source& s,
+                 uint64_t* oldest_gen, uint64_t* latest_gen,
+                 uint64_t* num_shards);
+public:
+  // Factory: build and initialize a manager, or return a negative error.
+  static tl::expected<std::unique_ptr<RGWBucketPipeSyncStatusManager>, int>
+  construct(const DoutPrefixProvider* dpp, rgw::sal::RadosStore* driver,
+           std::optional<rgw_zone_id> source_zone,
+           std::optional<rgw_bucket> source_bucket,
+           const rgw_bucket& dest_bucket, std::ostream *ostream);
+  ~RGWBucketPipeSyncStatusManager() = default;
+
+
+  // RADOS object names under which sync status is stored.
+  static std::string full_status_oid(const rgw_zone_id& source_zone,
+                                    const rgw_bucket& source_bucket,
+                                    const rgw_bucket& dest_bucket);
+  static std::string inc_status_oid(const rgw_zone_id& source_zone,
+                                   const rgw_bucket_sync_pair_info& bs,
+                                   uint64_t gen);
+  // specific source obj sync status, can be used by sync modules
+  static std::string obj_status_oid(const rgw_bucket_sync_pipe& sync_pipe,
+                                   const rgw_zone_id& source_zone, const rgw::sal::Object* obj); /* specific source obj sync status,
+                                                                                      can be used by sync modules */
+
+  // implements DoutPrefixProvider
+  CephContext *get_cct() const override;
+  unsigned get_subsys() const override;
+  std::ostream& gen_prefix(std::ostream& out) const override;
+
+  int init_sync_status(const DoutPrefixProvider *dpp);
+  tl::expected<std::map<int, rgw_bucket_shard_sync_info>, int> read_sync_status(
+    const DoutPrefixProvider *dpp);
+  int run(const DoutPrefixProvider *dpp);
+};
+
+/// read the full sync status with respect to a source bucket
+/// @return 0 on success, negative error code on failure
+int rgw_read_bucket_full_sync_status(const DoutPrefixProvider *dpp,
+                                     rgw::sal::RadosStore *driver,
+                                     const rgw_sync_bucket_pipe& pipe,
+                                     rgw_bucket_sync_status *status,
+                                     optional_yield y);
+
+/// read the incremental sync status of all bucket shards from the given source zone
+/// for bilog generation `gen`; `status` receives one entry per shard
+int rgw_read_bucket_inc_sync_status(const DoutPrefixProvider *dpp,
+                                    rgw::sal::RadosStore *driver,
+                                    const rgw_sync_bucket_pipe& pipe,
+                                    uint64_t gen,
+                                    std::vector<rgw_bucket_shard_sync_info> *status);
+
+// Default sync module: plain object replication; supports both writes and
+// data export.
+class RGWDefaultSyncModule : public RGWSyncModule {
+public:
+  RGWDefaultSyncModule() {}
+  bool supports_writes() override { return true; }
+  bool supports_data_export() override { return true; }
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
+
+// Archive sync module: accepts writes like the default module, but is a
+// terminal destination — its data is not exported onward to other zones.
+class RGWArchiveSyncModule : public RGWDefaultSyncModule {
+public:
+  RGWArchiveSyncModule() {}
+  bool supports_writes() override { return true; }
+  bool supports_data_export() override { return false; }
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_datalog.cc b/src/rgw/driver/rados/rgw_datalog.cc
new file mode 100644 (file)
index 0000000..3eeb820
--- /dev/null
@@ -0,0 +1,1065 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <vector>
+
+#include "common/debug.h"
+#include "common/containers.h"
+#include "common/errno.h"
+#include "common/error_code.h"
+
+#include "common/async/blocked_completion.h"
+#include "common/async/librados_completion.h"
+
+#include "cls/fifo/cls_fifo_types.h"
+#include "cls/log/cls_log_client.h"
+
+#include "cls_fifo_legacy.h"
+#include "rgw_bucket_layout.h"
+#include "rgw_datalog.h"
+#include "rgw_log_backing.h"
+#include "rgw_tools.h"
+
+#define dout_context g_ceph_context
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+namespace bs = boost::system;
+namespace lr = librados;
+
+using ceph::containers::tiny_vector;
+
+// Serialize a data-change entry to JSON; entity_type is emitted as a
+// human-readable string ("bucket" or "unknown").
+void rgw_data_change::dump(ceph::Formatter *f) const
+{
+  std::string type;
+  switch (entity_type) {
+    case ENTITY_TYPE_BUCKET:
+      type = "bucket";
+      break;
+    default:
+      type = "unknown";
+  }
+  encode_json("entity_type", type, f);
+  encode_json("key", key, f);
+  utime_t ut(timestamp);  // real_time -> utime_t for JSON formatting
+  encode_json("timestamp", ut, f);
+  encode_json("gen", gen, f);
+}
+
+// Inverse of dump(): parse a data-change entry from JSON.  Unrecognized
+// entity_type strings map to ENTITY_TYPE_UNKNOWN.
+void rgw_data_change::decode_json(JSONObj *obj) {
+  std::string s;
+  JSONDecoder::decode_json("entity_type", s, obj);
+  if (s == "bucket") {
+    entity_type = ENTITY_TYPE_BUCKET;
+  } else {
+    entity_type = ENTITY_TYPE_UNKNOWN;
+  }
+  JSONDecoder::decode_json("key", key, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("timestamp", ut, obj);
+  timestamp = ut.to_real_time();
+  JSONDecoder::decode_json("gen", gen, obj);
+}
+
+// Serialize a log entry (id + timestamp + nested change) to JSON.
+void rgw_data_change_log_entry::dump(Formatter *f) const
+{
+  encode_json("log_id", log_id, f);
+  utime_t ut(log_timestamp);
+  encode_json("log_timestamp", ut, f);
+  encode_json("entry", entry, f);
+}
+
+// Inverse of dump(): parse a log entry from JSON.
+void rgw_data_change_log_entry::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("log_id", log_id, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("log_timestamp", ut, obj);
+  log_timestamp = ut.to_real_time();
+  JSONDecoder::decode_json("entry", entry, obj);
+}
+
+// Serialize a data-notify entry (key + generation) to JSON.
+void rgw_data_notify_entry::dump(Formatter *f) const
+{
+  encode_json("key", key, f);
+  encode_json("gen", gen, f);
+}
+
+// Inverse of dump(): parse a data-notify entry from JSON.
+void rgw_data_notify_entry::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("key", key, obj);
+  JSONDecoder::decode_json("gen", gen, obj);
+}
+
+class RGWDataChangesOmap final : public RGWDataChangesBE {
+  using centries = std::list<cls_log_entry>;
+  std::vector<std::string> oids;
+
+public:
+  RGWDataChangesOmap(lr::IoCtx& ioctx,
+                    RGWDataChangesLog& datalog,
+                    uint64_t gen_id,
+                    int num_shards)
+    : RGWDataChangesBE(ioctx, datalog, gen_id) {
+    oids.reserve(num_shards);
+    for (auto i = 0; i < num_shards; ++i) {
+      oids.push_back(get_oid(i));
+    }
+  }
+  ~RGWDataChangesOmap() override = default;
+
+  void prepare(ceph::real_time ut, const std::string& key,
+              ceph::buffer::list&& entry, entries& out) override {
+    if (!std::holds_alternative<centries>(out)) {
+      ceph_assert(std::visit([](const auto& v) { return std::empty(v); }, out));
+      out = centries();
+    }
+
+    cls_log_entry e;
+    cls_log_add_prepare_entry(e, utime_t(ut), {}, key, entry);
+    std::get<centries>(out).push_back(std::move(e));
+  }
+  int push(const DoutPrefixProvider *dpp, int index, entries&& items) override {
+    lr::ObjectWriteOperation op;
+    cls_log_add(op, std::get<centries>(items), true);
+    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, null_yield);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": failed to push to " << oids[index] << cpp_strerror(-r)
+                << dendl;
+    }
+    return r;
+  }
+  int push(const DoutPrefixProvider *dpp, int index, ceph::real_time now,
+          const std::string& key,
+          ceph::buffer::list&& bl) override {
+    lr::ObjectWriteOperation op;
+    cls_log_add(op, utime_t(now), {}, key, bl);
+    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, null_yield);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": failed to push to " << oids[index]
+                << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  int list(const DoutPrefixProvider *dpp, int index, int max_entries,
+          std::vector<rgw_data_change_log_entry>& entries,
+          std::optional<std::string_view> marker,
+          std::string* out_marker, bool* truncated) override {
+    std::list<cls_log_entry> log_entries;
+    lr::ObjectReadOperation op;
+    cls_log_list(op, {}, {}, std::string(marker.value_or("")),
+                max_entries, log_entries, out_marker, truncated);
+    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, nullptr, null_yield);
+    if (r == -ENOENT) {
+      *truncated = false;
+      return 0;
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": failed to list " << oids[index]
+                << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    for (auto iter = log_entries.begin(); iter != log_entries.end(); ++iter) {
+      rgw_data_change_log_entry log_entry;
+      log_entry.log_id = iter->id;
+      auto rt = iter->timestamp.to_real_time();
+      log_entry.log_timestamp = rt;
+      auto liter = iter->data.cbegin();
+      try {
+       decode(log_entry.entry, liter);
+      } catch (ceph::buffer::error& err) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                  << ": failed to decode data changes log entry: "
+                  << err.what() << dendl;
+       return -EIO;
+      }
+      entries.push_back(log_entry);
+    }
+    return 0;
+  }
+  int get_info(const DoutPrefixProvider *dpp, int index, RGWDataChangesLogInfo *info) override {
+    cls_log_header header;
+    lr::ObjectReadOperation op;
+    cls_log_info(op, &header);
+    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, nullptr, null_yield);
+    if (r == -ENOENT) r = 0;
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": failed to get info from " << oids[index]
+                << cpp_strerror(-r) << dendl;
+    } else {
+      info->marker = header.max_marker;
+      info->last_update = header.max_time.to_real_time();
+    }
+    return r;
+  }
+  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker) override {
+    lr::ObjectWriteOperation op;
+    cls_log_trim(op, {}, {}, {}, std::string(marker));
+    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, null_yield);
+    if (r == -ENOENT) r = -ENODATA;
+    if (r < 0 && r != -ENODATA) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": failed to get info from " << oids[index]
+                << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+          lr::AioCompletion* c) override {
+    lr::ObjectWriteOperation op;
+    cls_log_trim(op, {}, {}, {}, std::string(marker));
+    auto r = ioctx.aio_operate(oids[index], c, &op, 0);
+    if (r == -ENOENT) r = -ENODATA;
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": failed to get info from " << oids[index]
+                << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  std::string_view max_marker() const override {
+    return "99999999";
+  }
+  int is_empty(const DoutPrefixProvider *dpp) override {
+    for (auto shard = 0u; shard < oids.size(); ++shard) {
+      std::list<cls_log_entry> log_entries;
+      lr::ObjectReadOperation op;
+      std::string out_marker;
+      bool truncated;
+      cls_log_list(op, {}, {}, {}, 1, log_entries, &out_marker, &truncated);
+      auto r = rgw_rados_operate(dpp, ioctx, oids[shard], &op, nullptr, null_yield);
+      if (r == -ENOENT) {
+       continue;
+      }
+      if (r < 0) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                  << ": failed to list " << oids[shard]
+                  << cpp_strerror(-r) << dendl;
+       return r;
+      }
+      if (!log_entries.empty()) {
+       return 0;
+      }
+    }
+    return 1;
+  }
+};
+
+// Datalog backend storing entries in per-shard cls_fifo queues.  FIFOs are
+// opened lazily (LazyFIFO), one per shard.
+class RGWDataChangesFIFO final : public RGWDataChangesBE {
+  using centries = std::vector<ceph::buffer::list>;
+  tiny_vector<LazyFIFO> fifos;
+
+public:
+  RGWDataChangesFIFO(lr::IoCtx& ioctx,
+                    RGWDataChangesLog& datalog,
+                    uint64_t gen_id, int shards)
+    : RGWDataChangesBE(ioctx, datalog, gen_id),
+      fifos(shards, [&ioctx, this](std::size_t i, auto emplacer) {
+       emplacer.emplace(ioctx, get_oid(i));
+      }) {}
+  ~RGWDataChangesFIFO() override = default;
+  // Append one pending entry to the batch in `out`; timestamp and key are
+  // unused since FIFO entries carry only the serialized change.
+  void prepare(ceph::real_time, const std::string&,
+              ceph::buffer::list&& entry, entries& out) override {
+    if (!std::holds_alternative<centries>(out)) {
+      ceph_assert(std::visit([](auto& v) { return std::empty(v); }, out));
+      out = centries();
+    }
+    std::get<centries>(out).push_back(std::move(entry));
+  }
+  // Write a prepared batch to the shard FIFO.
+  int push(const DoutPrefixProvider *dpp, int index, entries&& items) override {
+    auto r = fifos[index].push(dpp, std::get<centries>(items), null_yield);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": unable to push to FIFO: " << get_oid(index)
+                << ": " << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  // Write a single entry to the shard FIFO.
+  int push(const DoutPrefixProvider *dpp, int index, ceph::real_time,
+          const std::string&,
+          ceph::buffer::list&& bl) override {
+    auto r = fifos[index].push(dpp, std::move(bl), null_yield);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": unable to push to FIFO: " << get_oid(index)
+                << ": " << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  // List up to max_entries after `marker`; out-params are optional and
+  // only set when provided.
+  int list(const DoutPrefixProvider *dpp, int index, int max_entries,
+          std::vector<rgw_data_change_log_entry>& entries,
+          std::optional<std::string_view> marker,
+          std::string* out_marker, bool* truncated) override {
+    std::vector<rgw::cls::fifo::list_entry> log_entries;
+    bool more = false;
+    auto r = fifos[index].list(dpp, max_entries, marker, &log_entries, &more,
+                              null_yield);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": unable to list FIFO: " << get_oid(index)
+                << ": " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    for (const auto& entry : log_entries) {
+      rgw_data_change_log_entry log_entry;
+      log_entry.log_id = entry.marker;
+      log_entry.log_timestamp = entry.mtime;
+      auto liter = entry.data.cbegin();
+      try {
+       decode(log_entry.entry, liter);
+      } catch (const buffer::error& err) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                  << ": failed to decode data changes log entry: "
+                  << err.what() << dendl;
+       return -EIO;
+      }
+      entries.push_back(std::move(log_entry));
+    }
+    if (truncated)
+      *truncated = more;
+    if (out_marker && !log_entries.empty()) {
+      *out_marker = log_entries.back().marker;
+    }
+    return 0;
+  }
+  // Report the shard's latest marker (head part + offset) and the head
+  // part's max time; an empty FIFO reports an empty marker and zero time.
+  int get_info(const DoutPrefixProvider *dpp, int index, RGWDataChangesLogInfo *info) override {
+    auto& fifo = fifos[index];
+    auto r = fifo.read_meta(dpp, null_yield);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": unable to get FIFO metadata: " << get_oid(index)
+                << ": " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    rados::cls::fifo::info m;
+    fifo.meta(dpp, m, null_yield);
+    auto p = m.head_part_num;
+    if (p < 0) {
+      // no head part yet: FIFO has never been written
+      info->marker = "";
+      info->last_update = ceph::real_clock::zero();
+      return 0;
+    }
+    rgw::cls::fifo::part_info h;
+    r = fifo.get_part_info(dpp, p, &h, null_yield);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": unable to get part info: " << get_oid(index) << "/" << p
+                << ": " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    info->marker = rgw::cls::fifo::marker{p, h.last_ofs}.to_string();
+    info->last_update = h.max_time;
+    return 0;
+  }
+  // Trim entries up to `marker` synchronously.
+  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker) override {
+    auto r = fifos[index].trim(dpp, marker, false, null_yield);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                << ": unable to trim FIFO: " << get_oid(index)
+                << ": " << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  // Asynchronous trim.  A zero marker means nothing was ever pushed, so the
+  // completion is finished immediately with -ENODATA; otherwise the result
+  // is delivered through `c` and this call always returns 0.
+  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+          librados::AioCompletion* c) override {
+    int r = 0;
+    if (marker == rgw::cls::fifo::marker(0, 0).to_string()) {
+      rgw_complete_aio_completion(c, -ENODATA);
+    } else {
+      fifos[index].trim(dpp, marker, false, c, null_yield);
+    }
+    return r;
+  }
+  std::string_view max_marker() const override {
+    static const std::string mm =
+      rgw::cls::fifo::marker::max().to_string();
+    return std::string_view(mm);
+  }
+  // Returns 1 if every shard FIFO is empty, 0 if any entry exists,
+  // negative on error.
+  int is_empty(const DoutPrefixProvider *dpp) override {
+    std::vector<rgw::cls::fifo::list_entry> log_entries;
+    bool more = false;
+    for (auto shard = 0u; shard < fifos.size(); ++shard) {
+      auto r = fifos[shard].list(dpp, 1, {}, &log_entries, &more,
+                                null_yield);
+      if (r < 0) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                  << ": unable to list FIFO: " << get_oid(shard)
+                  << ": " << cpp_strerror(-r) << dendl;
+       return r;
+      }
+      if (!log_entries.empty()) {
+       return 0;
+      }
+    }
+    return 1;
+  }
+};
+
+// Construct the data-changes log.  Shard count and the in-memory change
+// status cache size come from configuration; backends are attached later
+// in start().
+RGWDataChangesLog::RGWDataChangesLog(CephContext* cct)
+  : cct(cct),
+    num_shards(cct->_conf->rgw_data_log_num_shards),
+    prefix(get_prefix()),
+    changes(cct->_conf->rgw_data_log_changes_size) {}
+
+// Instantiate a backend object (omap or FIFO) for each generation entry
+// discovered by logback_generations.  Called under initialization; logs
+// (but does not fail on) pruned or duplicate generations.
+bs::error_code DataLogBackends::handle_init(entries_t e) noexcept {
+  std::unique_lock l(m);
+
+  for (const auto& [gen_id, gen] : e) {
+    if (gen.pruned) {
+      // NOTE(review): wording says "empty generation" but the condition is
+      // gen.pruned — confirm intended message
+      lderr(datalog.cct)
+       << __PRETTY_FUNCTION__ << ":" << __LINE__
+       << ": ERROR: given empty generation: gen_id=" << gen_id << dendl;
+    }
+    if (count(gen_id) != 0) {
+      lderr(datalog.cct)
+       << __PRETTY_FUNCTION__ << ":" << __LINE__
+       << ": ERROR: generation already exists: gen_id=" << gen_id << dendl;
+    }
+    try {
+      switch (gen.type) {
+      case log_type::omap:
+       emplace(gen_id, new RGWDataChangesOmap(ioctx, datalog, gen_id, shards));
+       break;
+      case log_type::fifo:
+       emplace(gen_id, new RGWDataChangesFIFO(ioctx, datalog, gen_id, shards));
+       break;
+      default:
+       lderr(datalog.cct)
+         << __PRETTY_FUNCTION__ << ":" << __LINE__
+         << ": IMPOSSIBLE: invalid log type: gen_id=" << gen_id
+         << ", type" << gen.type << dendl;
+       return bs::error_code(EFAULT, bs::system_category());
+      }
+    } catch (const bs::system_error& err) {
+      // backend construction may throw (e.g. RADOS errors surfaced as
+      // system_error); propagate as an error code
+      lderr(datalog.cct)
+         << __PRETTY_FUNCTION__ << ":" << __LINE__
+         << ": error setting up backend: gen_id=" << gen_id
+         << ", err=" << err.what() << dendl;
+      return err.code();
+    }
+  }
+  return {};
+}
+// Newly-appearing generations are set up exactly like initial ones.
+bs::error_code DataLogBackends::handle_new_gens(entries_t e) noexcept {
+  return handle_init(std::move(e));
+}
+// Drop backend objects for generations at or below `new_tail` once they
+// have been emptied, refusing to ever remove the head generation.
+bs::error_code DataLogBackends::handle_empty_to(uint64_t new_tail) noexcept {
+  std::unique_lock l(m);
+  auto i = cbegin();
+  // NOTE(review): this early-out fires when the lowest generation is below
+  // new_tail, which looks like the case where trimming *would* apply —
+  // confirm the intended guard direction against logback_generations'
+  // contract for empty_to notifications.
+  if (i->first < new_tail) {
+    return {};
+  }
+  if (new_tail >= (cend() - 1)->first) {
+    lderr(datalog.cct)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << ": ERROR: attempt to trim head: new_tail=" << new_tail << dendl;
+    return bs::error_code(EFAULT, bs::system_category());
+  }
+  // erase all generations with id <= new_tail
+  erase(i, upper_bound(new_tail));
+  return {};
+}
+
+
+int RGWDataChangesLog::start(const DoutPrefixProvider *dpp, const RGWZone* _zone,
+                            const RGWZoneParams& zoneparams,
+                            librados::Rados* lr)
+{
+  zone = _zone;
+  ceph_assert(zone);
+  auto defbacking = to_log_type(
+    cct->_conf.get_val<std::string>("rgw_default_data_log_backing"));
+  // Should be guaranteed by `set_enum_allowed`
+  ceph_assert(defbacking);
+  auto log_pool = zoneparams.log_pool;
+  auto r = rgw_init_ioctx(dpp, lr, log_pool, ioctx, true, false);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+              << ": Failed to initialized ioctx, r=" << r
+              << ", pool=" << log_pool << dendl;
+    return -r;
+  }
+
+  auto besr = logback_generations::init<DataLogBackends>(
+    dpp, ioctx, metadata_log_oid(), [this](uint64_t gen_id, int shard) {
+      return get_oid(gen_id, shard);
+    },
+    num_shards, *defbacking, null_yield, *this);
+
+
+  if (!besr) {
+    lderr(cct) << __PRETTY_FUNCTION__
+              << ": Error initializing backends: "
+              << besr.error().message() << dendl;
+    return ceph::from_error_code(besr.error());
+  }
+
+  bes = std::move(*besr);
+  renew_thread = make_named_thread("rgw_dt_lg_renew",
+                                  &RGWDataChangesLog::renew_run, this);
+  return 0;
+}
+
+// Map a bucket shard to a datalog shard index: hash the bucket name,
+// offset by the shard id (clamped to >= 0), and reduce modulo num_shards.
+int RGWDataChangesLog::choose_oid(const rgw_bucket_shard& bs) {
+  const auto& bucket_name = bs.bucket.name;
+  const auto offset = (bs.shard_id > 0) ? bs.shard_id : 0;
+  const auto hashed = ceph_str_hash_linux(bucket_name.data(),
+                                         bucket_name.size());
+  return static_cast<int>((hashed + offset) % num_shards);
+}
+
+// Re-push a datalog entry for every bucket shard that completed a cycle
+// recently (cur_cycle), batched per datalog shard, then push each shard's
+// change-status expiration forward by the configured window.
+int RGWDataChangesLog::renew_entries(const DoutPrefixProvider *dpp)
+{
+  if (!zone->log_data)
+    return 0;
+
+  /* we can't keep the bucket name as part of the cls_log_entry, and we need
+   * it later, so we keep two lists under the map */
+  bc::flat_map<int, std::pair<std::vector<BucketGen>,
+                             RGWDataChangesBE::entries>> m;
+
+  // swap out the accumulated cycle set under the lock, then work on the
+  // private copy
+  std::unique_lock l(lock);
+  decltype(cur_cycle) entries;
+  entries.swap(cur_cycle);
+  l.unlock();
+
+  auto ut = real_clock::now();
+  auto be = bes->head();  // always renew into the head (newest) generation
+  for (const auto& [bs, gen] : entries) {
+    auto index = choose_oid(bs);
+
+    rgw_data_change change;
+    bufferlist bl;
+    change.entity_type = ENTITY_TYPE_BUCKET;
+    change.key = bs.get_key();
+    change.timestamp = ut;
+    change.gen = gen;
+    encode(change, bl);
+
+    m[index].first.push_back({bs, gen});
+    be->prepare(ut, change.key, std::move(bl), m[index].second);
+  }
+
+  for (auto& [index, p] : m) {
+    auto& [buckets, entries] = p;
+
+    auto now = real_clock::now();
+
+    auto ret = be->push(dpp, index, std::move(entries));
+    if (ret < 0) {
+      /* we don't really need to have a special handling for failed cases here,
+       * as this is just an optimization. */
+      ldpp_dout(dpp, -1) << "ERROR: svc.cls->timelog.add() returned " << ret << dendl;
+      return ret;
+    }
+
+    auto expiration = now;
+    expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window);
+    for (auto& [bs, gen] : buckets) {
+      update_renewed(bs, gen, expiration);
+    }
+  }
+
+  return 0;
+}
+
+// Look up (or lazily create) the cached change status for a bucket
+// shard/generation.  Caller must hold `lock`; `changes` is an LRU-style
+// cache keyed by (shard, gen).
+auto RGWDataChangesLog::_get_change(const rgw_bucket_shard& bs,
+                                   uint64_t gen)
+  -> ChangeStatusPtr
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+  ChangeStatusPtr status;
+  if (!changes.find({bs, gen}, status)) {
+    status = std::make_shared<ChangeStatus>();
+    changes.add({bs, gen}, status);
+  }
+  return status;
+}
+
+// Record that this bucket shard/generation saw activity this cycle, so the
+// renewal thread will re-push its datalog entry before the window expires.
+void RGWDataChangesLog::register_renew(const rgw_bucket_shard& bs,
+                                      const rgw::bucket_log_layout_generation& gen)
+{
+  std::scoped_lock guard{lock};
+  cur_cycle.emplace(bs, gen.gen);
+}
+
+void RGWDataChangesLog::update_renewed(const rgw_bucket_shard& bs,
+                                      uint64_t gen,
+                                      real_time expiration)
+{
+  std::unique_lock l{lock};
+  auto status = _get_change(bs, gen);
+  l.unlock();
+
+  ldout(cct, 20) << "RGWDataChangesLog::update_renewd() bucket_name="
+                << bs.bucket.name << " shard_id=" << bs.shard_id
+                << " expiration=" << expiration << dendl;
+
+  std::unique_lock sl(status->lock);
+  status->cur_expiration = expiration;
+}
+
+// Expose the shard mapping for external callers: identical to choose_oid()
+// on a shard handle for this bucket.
+int RGWDataChangesLog::get_log_shard_id(rgw_bucket& bucket, int shard_id) {
+  return choose_oid(rgw_bucket_shard{bucket, shard_id});
+}
+
+bool RGWDataChangesLog::filter_bucket(const DoutPrefixProvider *dpp, 
+                                      const rgw_bucket& bucket,
+                                     optional_yield y) const
+{
+  if (!bucket_filter) {
+    return true;
+  }
+
+  return bucket_filter(bucket, y, dpp);
+}
+
+// Object name for shard `i` of generation `gen_id`.  Generation 0 keeps
+// the legacy unqualified "<prefix>.<shard>" name; later generations embed
+// the generation id as "@G<gen>".
+std::string RGWDataChangesLog::get_oid(uint64_t gen_id, int i) const {
+  if (gen_id > 0) {
+    return fmt::format("{}@G{}.{}", prefix, gen_id, i);
+  }
+  return fmt::format("{}.{}", prefix, i);
+}
+
+// Record a change to a bucket shard in the datalog.  Coalesces bursts: if
+// an entry for this shard/gen was pushed within the configured window the
+// change is only registered for the renewal cycle; if another thread is
+// already pushing, this one waits on its completion instead of pushing
+// again.  Otherwise this thread becomes the pusher.
+int RGWDataChangesLog::add_entry(const DoutPrefixProvider *dpp,
+                                const RGWBucketInfo& bucket_info,
+                                const rgw::bucket_log_layout_generation& gen,
+                                int shard_id)
+{
+  auto& bucket = bucket_info.bucket;
+
+  if (!filter_bucket(dpp, bucket, null_yield)) {
+    return 0;
+  }
+
+  if (observer) {
+    observer->on_bucket_changed(bucket.get_key());
+  }
+
+  rgw_bucket_shard bs(bucket, shard_id);
+
+  int index = choose_oid(bs);
+
+  mark_modified(index, bs, gen.gen);
+
+  std::unique_lock l(lock);
+
+  auto status = _get_change(bs, gen.gen);
+  l.unlock();  // status is shared_ptr-owned; map lock no longer needed
+
+  auto now = real_clock::now();
+
+  std::unique_lock sl(status->lock);
+
+  ldpp_dout(dpp, 20) << "RGWDataChangesLog::add_entry() bucket.name=" << bucket.name
+                    << " shard_id=" << shard_id << " now=" << now
+                    << " cur_expiration=" << status->cur_expiration << dendl;
+
+  if (now < status->cur_expiration) {
+    /* no need to send, recently completed */
+    sl.unlock();
+    register_renew(bs, gen);
+    return 0;
+  }
+
+  RefCountedCond* cond;
+
+  if (status->pending) {
+    // another thread is pushing; wait for its result rather than pushing a
+    // duplicate entry
+    cond = status->cond;
+
+    ceph_assert(cond);
+
+    status->cond->get();
+    sl.unlock();
+
+    int ret = cond->wait();
+    cond->put();
+    if (!ret) {
+      register_renew(bs, gen);
+    }
+    return ret;
+  }
+
+  // become the pusher for this shard/gen
+  status->cond = new RefCountedCond;
+  status->pending = true;
+
+  ceph::real_time expiration;
+
+  int ret;
+
+  do {
+    status->cur_sent = now;
+
+    expiration = now;
+    expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window);
+
+    sl.unlock();
+
+    ceph::buffer::list bl;
+    rgw_data_change change;
+    change.entity_type = ENTITY_TYPE_BUCKET;
+    change.key = bs.get_key();
+    change.timestamp = now;
+    change.gen = gen.gen;
+    encode(change, bl);
+
+    ldpp_dout(dpp, 20) << "RGWDataChangesLog::add_entry() sending update with now=" << now << " cur_expiration=" << expiration << dendl;
+
+    auto be = bes->head();
+    ret = be->push(dpp, index, now, change.key, std::move(bl));
+
+    now = real_clock::now();
+
+    sl.lock();
+
+    // repeat if the push took longer than the window, so the recorded
+    // timestamp stays fresh
+  } while (!ret && real_clock::now() > expiration);
+
+  cond = status->cond;
+
+  status->pending = false;
+  /* time of when operation started, not completed */
+  status->cur_expiration = status->cur_sent;
+  status->cur_expiration += make_timespan(cct->_conf->rgw_data_log_window);
+  status->cond = nullptr;
+  sl.unlock();
+
+  // wake all waiters with the push result
+  cond->done(ret);
+  cond->put();
+
+  return ret;
+}
+
+// List entries for one shard across generations, starting at `marker`
+// (encoded as generation + per-generation cursor).  Walks generations in
+// order, prefixing each returned log id with its generation, until
+// max_entries are collected or generations run out.
+int DataLogBackends::list(const DoutPrefixProvider *dpp, int shard, int max_entries,
+                         std::vector<rgw_data_change_log_entry>& entries,
+                         std::string_view marker,
+                         std::string* out_marker,
+                         bool* truncated)
+{
+  const auto [start_id, start_cursor] = cursorgen(marker);
+  auto gen_id = start_id;
+  std::string out_cursor;
+  while (max_entries > 0) {
+    std::vector<rgw_data_change_log_entry> gentries;
+    std::unique_lock l(m);
+    auto i = lower_bound(gen_id);
+    if (i == end()) return 0;  // no more generations
+    auto be = i->second;
+    l.unlock();
+    gen_id = be->gen_id;
+    // the start cursor only applies within the marker's own generation
+    auto r = be->list(dpp, shard, max_entries, gentries,
+                     gen_id == start_id ? start_cursor : std::string{},
+                     &out_cursor, truncated);
+    if (r < 0)
+      return r;
+
+    if (out_marker && !out_cursor.empty()) {
+      *out_marker = gencursor(gen_id, out_cursor);
+    }
+    for (auto& g : gentries) {
+      g.log_id = gencursor(gen_id, g.log_id);
+    }
+    // guard against a batch larger than requested (or int overflow)
+    if (int s = gentries.size(); s < 0 || s > max_entries)
+      max_entries = 0;
+    else
+      max_entries -= gentries.size();
+
+    std::move(gentries.begin(), gentries.end(),
+             std::back_inserter(entries));
+    ++gen_id;
+  }
+  return 0;
+}
+
+// Shard-scoped listing: thin delegation to the backend collection.
+int RGWDataChangesLog::list_entries(const DoutPrefixProvider *dpp, int shard, int max_entries,
+                                   std::vector<rgw_data_change_log_entry>& entries,
+                                   std::string_view marker,
+                                   std::string* out_marker, bool* truncated)
+{
+  assert(shard < num_shards);
+  return bes->list(dpp, shard, max_entries, entries, marker, out_marker, truncated);
+}
+
+// Cross-shard listing: iterates shards starting at marker.shard, resetting
+// the per-shard cursor when advancing to the next shard. *ptruncated is set
+// when entries remain (more shards or a truncated shard).
+int RGWDataChangesLog::list_entries(const DoutPrefixProvider *dpp, int max_entries,
+                                   std::vector<rgw_data_change_log_entry>& entries,
+                                   LogMarker& marker, bool *ptruncated)
+{
+  bool truncated;
+  entries.clear();
+  for (; marker.shard < num_shards && int(entries.size()) < max_entries;
+       marker.shard++, marker.marker.clear()) {
+    int ret = list_entries(dpp, marker.shard, max_entries - entries.size(),
+                          entries, marker.marker, NULL, &truncated);
+    // A shard with no log object yet is not an error; move to the next one.
+    if (ret == -ENOENT) {
+      continue;
+    }
+    if (ret < 0) {
+      return ret;
+    }
+    if (!truncated) {
+      *ptruncated = false;
+      return 0;
+    }
+  }
+  *ptruncated = (marker.shard < num_shards);
+  return 0;
+}
+
+// Report shard info from the head (newest) generation only; the returned
+// marker is wrapped with that generation id so it round-trips through list().
+int RGWDataChangesLog::get_info(const DoutPrefixProvider *dpp, int shard_id, RGWDataChangesLogInfo *info)
+{
+  assert(shard_id < num_shards);
+  auto be = bes->head();
+  auto r = be->get_info(dpp, shard_id, info);
+  if (!info->marker.empty()) {
+    info->marker = gencursor(be->gen_id, info->marker);
+  }
+  return r;
+}
+
+// Synchronously trim `shard_id` up to `marker` across generations, from the
+// tail generation up to the generation encoded in the marker. Generations
+// below the target are trimmed completely (max_marker); the target generation
+// is trimmed to the marker's cursor.
+int DataLogBackends::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker)
+{
+  auto [target_gen, cursor] = cursorgen(marker);
+  std::unique_lock l(m);
+  const auto head_gen = (end() - 1)->second->gen_id;
+  const auto tail_gen = begin()->first;
+  if (target_gen < tail_gen) return 0;
+  auto r = 0;
+  // NOTE(review): if target_gen > head_gen, the increment expression
+  // upper_bound(be->gen_id)->second can dereference end() once be is the
+  // head generation — confirm callers never pass a marker beyond head.
+  for (auto be = lower_bound(0)->second;
+       be->gen_id <= target_gen && be->gen_id <= head_gen && r >= 0;
+       be = upper_bound(be->gen_id)->second) {
+    l.unlock();
+    auto c = be->gen_id == target_gen ? cursor : be->max_marker();
+    r = be->trim(dpp, shard_id, c);
+    // Normalize "already gone" to ENODATA; that is success for any
+    // generation older than the target.
+    if (r == -ENOENT)
+      r = -ENODATA;
+    if (r == -ENODATA && be->gen_id < target_gen)
+      r = 0;
+    if (be->gen_id == target_gen)
+      break;
+    l.lock();
+  };
+  return r;
+}
+
+// Synchronous trim: thin delegation to the backend collection.
+int RGWDataChangesLog::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker)
+{
+  assert(shard_id < num_shards);
+  return bes->trim_entries(dpp, shard_id, marker);
+}
+
+// Async continuation used by DataLogBackends::trim_entries(AioCompletion*):
+// each handle() invocation processes the result of trimming one generation,
+// then re-arms itself on the next generation until the target generation
+// (or head) is reached.
+class GenTrim : public rgw::cls::fifo::Completion<GenTrim> {
+public:
+  DataLogBackends* const bes;
+  const int shard_id;
+  const uint64_t target_gen;
+  const std::string cursor;
+  const uint64_t head_gen;
+  const uint64_t tail_gen;
+  boost::intrusive_ptr<RGWDataChangesBE> be;
+
+  GenTrim(const DoutPrefixProvider *dpp, DataLogBackends* bes, int shard_id, uint64_t target_gen,
+         std::string cursor, uint64_t head_gen, uint64_t tail_gen,
+         boost::intrusive_ptr<RGWDataChangesBE> be,
+         lr::AioCompletion* super)
+    : Completion(dpp, super), bes(bes), shard_id(shard_id), target_gen(target_gen),
+      cursor(std::move(cursor)), head_gen(head_gen), tail_gen(tail_gen),
+      be(std::move(be)) {}
+
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    auto gen_id = be->gen_id;
+    be.reset();
+    // Same error normalization as the synchronous path: ENOENT -> ENODATA,
+    // and ENODATA on a pre-target generation counts as success.
+    if (r == -ENOENT)
+      r = -ENODATA;
+    if (r == -ENODATA && gen_id < target_gen)
+      r = 0;
+    if (r < 0) {
+      complete(std::move(p), r);
+      return;
+    }
+
+    {
+      // Look up the next generation under the backends lock.
+      std::unique_lock l(bes->m);
+      auto i = bes->upper_bound(gen_id);
+      if (i == bes->end() || i->first > target_gen || i->first > head_gen) {
+       l.unlock();
+       complete(std::move(p), -ENODATA);
+       return;
+      }
+      be = i->second;
+    }
+    auto c = be->gen_id == target_gen ? cursor : be->max_marker();
+    be->trim(dpp, shard_id, c, call(std::move(p)));
+  }
+};
+
+// Asynchronous trim across generations: kicks off a GenTrim chain starting
+// at the tail generation; completion `c` fires when the chain finishes.
+void DataLogBackends::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
+                                  librados::AioCompletion* c)
+{
+  auto [target_gen, cursor] = cursorgen(marker);
+  std::unique_lock l(m);
+  const auto head_gen = (end() - 1)->second->gen_id;
+  const auto tail_gen = begin()->first;
+  if (target_gen < tail_gen) {
+    // Nothing at or before the requested generation remains.
+    l.unlock();
+    rgw_complete_aio_completion(c, -ENODATA);
+    return;
+  }
+  auto be = begin()->second;
+  l.unlock();
+  auto gt = std::make_unique<GenTrim>(dpp, this, shard_id, target_gen,
+                                     std::string(cursor), head_gen, tail_gen,
+                                     be, c);
+
+  auto cc = be->gen_id == target_gen ? cursor : be->max_marker();
+  be->trim(dpp, shard_id, cc,  GenTrim::call(std::move(gt)));
+}
+
+// Prune fully-empty log generations below the head. Scans the non-head
+// generations in order and records the highest contiguous empty one in
+// `through`, then advances the tail past it and removes the empty backends.
+int DataLogBackends::trim_generations(const DoutPrefixProvider *dpp, std::optional<uint64_t>& through) {
+  if (size() != 1) {
+    std::vector<mapped_type> candidates;
+    {
+      // Snapshot all generations except the head under the lock.
+      std::scoped_lock l(m);
+      auto e = cend() - 1;
+      for (auto i = cbegin(); i < e; ++i) {
+       candidates.push_back(i->second);
+      }
+    }
+
+    std::optional<uint64_t> highest;
+    for (auto& be : candidates) {
+      auto r = be->is_empty(dpp);
+      if (r < 0) {
+       return r;
+      } else if (r == 1) {
+       highest = be->gen_id;
+      } else {
+       // Stop at the first non-empty generation; later ones stay.
+       break;
+      }
+    }
+
+    through = highest;
+    if (!highest) {
+      return 0;
+    }
+    auto ec = empty_to(dpp, *highest, null_yield);
+    if (ec) {
+      return ceph::from_error_code(ec);
+    }
+  }
+
+  return ceph::from_error_code(remove_empty(dpp, null_yield));
+}
+
+
+// Asynchronous trim: delegates to the backend collection; completion `c`
+// carries the result.
+int RGWDataChangesLog::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
+                                   librados::AioCompletion* c)
+{
+  assert(shard_id < num_shards);
+  bes->trim_entries(dpp, shard_id, marker, c);
+  return 0;
+}
+
+// True once shutdown has begun (set by the destructor before joining the
+// renew thread).
+bool RGWDataChangesLog::going_down() const
+{
+  return down_flag;
+}
+
+// Raise the down flag first so renew_run() exits its loop, then wake and
+// join the renew thread.
+RGWDataChangesLog::~RGWDataChangesLog() {
+  down_flag = true;
+  if (renew_thread.joinable()) {
+    renew_stop();
+    renew_thread.join();
+  }
+}
+
+void RGWDataChangesLog::renew_run() noexcept {
+  static constexpr auto runs_per_prune = 150;
+  auto run = 0;
+  for (;;) {
+    const DoutPrefix dp(cct, dout_subsys, "rgw data changes log: ");
+    ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: start" << dendl;
+    int r = renew_entries(&dp);
+    if (r < 0) {
+      ldpp_dout(&dp, 0) << "ERROR: RGWDataChangesLog::renew_entries returned error r=" << r << dendl;
+    }
+
+    if (going_down())
+      break;
+
+    if (run == runs_per_prune) {
+      std::optional<uint64_t> through;
+      ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: pruning old generations" << dendl;
+      trim_generations(&dp, through);
+      if (r < 0) {
+       derr << "RGWDataChangesLog::ChangesRenewThread: failed pruning r="
+            << r << dendl;
+      } else if (through) {
+       ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: pruned generations "
+               << "through " << *through << "." << dendl;
+      } else {
+       ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: nothing to prune."
+               << dendl;
+      }
+      run = 0;
+    } else {
+      ++run;
+    }
+
+    int interval = cct->_conf->rgw_data_log_window * 3 / 4;
+    std::unique_lock locker{renew_lock};
+    renew_cond.wait_for(locker, std::chrono::seconds(interval));
+  }
+}
+
+// Wake the renew thread so it can observe down_flag (set by the caller,
+// i.e. the destructor) and exit.
+void RGWDataChangesLog::renew_stop()
+{
+  std::lock_guard l{renew_lock};
+  renew_cond.notify_all();
+}
+
+// Record that (bucket shard, gen) changed in `shard_id` for the notify
+// machinery. Uses a read-lock fast path to skip duplicates; the race between
+// the two locks is benign since inserting an existing entry is a no-op.
+void RGWDataChangesLog::mark_modified(int shard_id, const rgw_bucket_shard& bs, uint64_t gen)
+{
+  // Notification disabled entirely when the interval is zero.
+  if (!cct->_conf->rgw_data_notify_interval_msec) {
+    return;
+  }
+
+  auto key = bs.get_key();
+  {
+    std::shared_lock rl{modified_lock}; // read lock to check for existence
+    auto shard = modified_shards.find(shard_id);
+    if (shard != modified_shards.end() && shard->second.count({key, gen})) {
+      return;
+    }
+  }
+
+  std::unique_lock wl{modified_lock}; // write lock for insertion
+  modified_shards[shard_id].insert(rgw_data_notify_entry{key, gen});
+}
+
+// A marker that compares greater than any real cursor: max generation id
+// paired with a lexicographically-late cursor string.
+std::string RGWDataChangesLog::max_marker() const {
+  return gencursor(std::numeric_limits<uint64_t>::max(),
+                  "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
+}
+
+// Switch the log's backing format by starting a new head generation.
+int RGWDataChangesLog::change_format(const DoutPrefixProvider *dpp, log_type type, optional_yield y) {
+  return ceph::from_error_code(bes->new_backing(dpp, type, y));
+}
+
+// Prune empty generations: thin delegation to the backend collection.
+int RGWDataChangesLog::trim_generations(const DoutPrefixProvider *dpp, std::optional<uint64_t>& through) {
+  return bes->trim_generations(dpp, through);
+}
+
+// JSON-encode shard info; last_update is emitted in utime_t form.
+void RGWDataChangesLogInfo::dump(Formatter *f) const
+{
+  encode_json("marker", marker, f);
+  utime_t ut(last_update);
+  encode_json("last_update", ut, f);
+}
+
+// Inverse of dump(): parse utime_t then convert to real_time.
+void RGWDataChangesLogInfo::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("marker", marker, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("last_update", ut, obj);
+  last_update = ut.to_real_time();
+}
+
+
diff --git a/src/rgw/driver/rados/rgw_datalog.h b/src/rgw/driver/rados/rgw_datalog.h
new file mode 100644 (file)
index 0000000..0bc4837
--- /dev/null
@@ -0,0 +1,386 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_DATALOG_H
+#define CEPH_RGW_DATALOG_H
+
+#include <cstdint>
+#include <list>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <variant>
+#include <vector>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/smart_ptr/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include <fmt/format.h>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/function2.hpp"
+
+#include "include/rados/librados.hpp"
+
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/ceph_time.h"
+#include "common/Formatter.h"
+#include "common/lru_map.h"
+#include "common/RefCountedObj.h"
+
+#include "cls/log/cls_log_types.h"
+
+#include "rgw_basic_types.h"
+#include "rgw_log_backing.h"
+#include "rgw_sync_policy.h"
+#include "rgw_zone.h"
+#include "rgw_trim_bilog.h"
+
+namespace bc = boost::container;
+
+// Kind of entity a data-log entry refers to; encoded on the wire as uint8_t.
+enum DataLogEntityType {
+  ENTITY_TYPE_UNKNOWN = 0,
+  ENTITY_TYPE_BUCKET = 1,
+};
+
+// One data-log change record: which entity changed, when, and in which
+// log generation.
+struct rgw_data_change {
+  DataLogEntityType entity_type;
+  std::string key;            // entity key (bucket shard key for buckets)
+  ceph::real_time timestamp;
+  uint64_t gen = 0;           // log generation; 0 pre-dates generations
+
+  void encode(ceph::buffer::list& bl) const {
+    // require decoders to recognize v2 when gen>0
+    // NOTE(review): compat varies with payload (1 when gen==0, else 2);
+    // confirm all peers accept data-dependent compat before relying on it.
+    const uint8_t compat = (gen == 0) ? 1 : 2;
+    ENCODE_START(2, compat, bl);
+    auto t = std::uint8_t(entity_type);
+    encode(t, bl);
+    encode(key, bl);
+    encode(timestamp, bl);
+    encode(gen, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START(2, bl);
+     std::uint8_t t;
+     decode(t, bl);
+     entity_type = DataLogEntityType(t);
+     decode(key, bl);
+     decode(timestamp, bl);
+     // v1 encodings carry no generation; default to 0.
+     if (struct_v < 2) {
+       gen = 0;
+     } else {
+       decode(gen, bl);
+     }
+     DECODE_FINISH(bl);
+  }
+
+  void dump(ceph::Formatter* f) const;
+  void decode_json(JSONObj* obj);
+};
+WRITE_CLASS_ENCODER(rgw_data_change)
+
+// A data-log change wrapped with its position (log_id) and the time the
+// log recorded it.
+struct rgw_data_change_log_entry {
+  std::string log_id;
+  ceph::real_time log_timestamp;
+  rgw_data_change entry;
+
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(log_id, bl);
+    encode(log_timestamp, bl);
+    encode(entry, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator& bl) {
+     DECODE_START(1, bl);
+     decode(log_id, bl);
+     decode(log_timestamp, bl);
+     decode(entry, bl);
+     DECODE_FINISH(bl);
+  }
+
+  void dump(ceph::Formatter* f) const;
+  void decode_json(JSONObj* obj);
+};
+WRITE_CLASS_ENCODER(rgw_data_change_log_entry)
+
+// Per-shard log status: newest marker and when the shard was last updated.
+struct RGWDataChangesLogInfo {
+  std::string marker;
+  ceph::real_time last_update;
+
+  void dump(ceph::Formatter* f) const;
+  void decode_json(JSONObj* obj);
+};
+
+// Resumable position for cross-shard listing: current shard plus the
+// in-shard cursor.
+struct RGWDataChangesLogMarker {
+  int shard = 0;
+  std::string marker;
+
+  RGWDataChangesLogMarker() = default;
+};
+
+class RGWDataChangesLog;
+
+// Key + generation pair used in change notifications; ordered so it can
+// live in a flat_set (key first, then generation).
+struct rgw_data_notify_entry {
+  std::string key;
+  uint64_t gen = 0;
+
+  void dump(ceph::Formatter* f) const;
+  void decode_json(JSONObj* obj);
+
+  rgw_data_notify_entry& operator=(const rgw_data_notify_entry&) = default;
+
+  bool operator <(const rgw_data_notify_entry& d) const {
+    if (key < d.key) {
+      return true;
+    }
+    if (d.key < key) {
+      return false;
+    }
+    return gen < d.gen;
+  }
+  friend std::ostream& operator <<(std::ostream& m,
+                                  const rgw_data_notify_entry& e) {
+    return m << "[key: " << e.key << ", gen: " << e.gen << "]";
+  }
+};
+
+class RGWDataChangesBE;
+
+// Collection of per-generation log backends, keyed by generation id.
+// Privately inherits flat_map so map access stays internal; `m` guards the
+// map itself (individual backends do their own synchronization).
+class DataLogBackends final
+  : public logback_generations,
+    private bc::flat_map<uint64_t, boost::intrusive_ptr<RGWDataChangesBE>> {
+  friend class logback_generations;
+  friend class GenTrim;
+
+  std::mutex m;
+  RGWDataChangesLog& datalog;
+
+  DataLogBackends(librados::IoCtx& ioctx,
+                 std::string oid,
+                 fu2::unique_function<std::string(
+                   uint64_t, int) const>&& get_oid,
+                 int shards, RGWDataChangesLog& datalog) noexcept
+    : logback_generations(ioctx, oid, std::move(get_oid),
+                         shards), datalog(datalog) {}
+public:
+
+  // Newest (highest-generation) backend; writes always go here.
+  boost::intrusive_ptr<RGWDataChangesBE> head() {
+    std::unique_lock l(m);
+    auto i = end();
+    --i;
+    return i->second;
+  }
+  int list(const DoutPrefixProvider *dpp, int shard, int max_entries,
+          std::vector<rgw_data_change_log_entry>& entries,
+          std::string_view marker,
+          std::string* out_marker, bool* truncated);
+  int trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker);
+  void trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
+                   librados::AioCompletion* c);
+  // Install the initial (generation 0) backend.
+  void set_zero(RGWDataChangesBE* be) {
+    emplace(0, be);
+  }
+
+  bs::error_code handle_init(entries_t e) noexcept override;
+  bs::error_code handle_new_gens(entries_t e) noexcept override;
+  bs::error_code handle_empty_to(uint64_t new_tail) noexcept override;
+
+  int trim_generations(const DoutPrefixProvider *dpp, std::optional<uint64_t>& through);
+};
+
+// (bucket shard, generation) pair used as the key for in-flight change
+// status tracking.
+struct BucketGen {
+  rgw_bucket_shard shard;
+  uint64_t gen;
+
+  BucketGen(const rgw_bucket_shard& shard, uint64_t gen)
+    : shard(shard), gen(gen) {}
+
+  BucketGen(rgw_bucket_shard&& shard, uint64_t gen)
+    : shard(std::move(shard)), gen(gen) {}
+
+  BucketGen(const BucketGen&) = default;
+  BucketGen(BucketGen&&) = default;
+  BucketGen& operator =(const BucketGen&) = default;
+  BucketGen& operator =(BucketGen&&) = default;
+
+  ~BucketGen() = default;
+};
+
+// Equality: both shard and generation must match.
+inline bool operator ==(const BucketGen& l, const BucketGen& r) {
+  return (l.shard == r.shard) && (l.gen == r.gen);
+}
+
+// Lexicographic ordering: shard first, then generation.
+inline bool operator <(const BucketGen& l, const BucketGen& r) {
+  if (l.shard < r.shard) {
+    return true;
+  } else if (l.shard == r.shard) {
+    return l.gen < r.gen;
+  } else {
+    return false;
+  }
+}
+
+// The RGW data changes log: records which bucket shards changed so
+// multisite sync can find work. Owns the per-generation backends (bes),
+// a renew thread that refreshes pending entries, and the modified-shards
+// set consumed by the notify machinery.
+class RGWDataChangesLog {
+  friend DataLogBackends;
+  CephContext *cct;
+  librados::IoCtx ioctx;
+  rgw::BucketChangeObserver *observer = nullptr;
+  const RGWZone* zone;
+  std::unique_ptr<DataLogBackends> bes;
+
+  const int num_shards;
+  // NOTE(review): this ternary looks inverted — it returns the empty
+  // string when no prefix is configured and the literal "data_log"
+  // otherwise, discarding any configured rgw_data_log_obj_prefix.
+  // Likely intended: prefix.empty() ? "data_log" : prefix. Confirm intent.
+  std::string get_prefix() {
+    auto prefix = cct->_conf->rgw_data_log_obj_prefix;
+    return prefix.empty() ? prefix : "data_log";
+  }
+  std::string metadata_log_oid() {
+    return get_prefix() + "generations_metadata";
+  }
+  std::string prefix;
+
+  ceph::mutex lock = ceph::make_mutex("RGWDataChangesLog::lock");
+  ceph::shared_mutex modified_lock =
+    ceph::make_shared_mutex("RGWDataChangesLog::modified_lock");
+  // Shards touched since the last read_clear_modified(), for notification.
+  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>> modified_shards;
+
+  std::atomic<bool> down_flag = { false };
+
+  // Per-(bucket shard, gen) in-flight add_entry coordination state,
+  // protected by its own lock.
+  struct ChangeStatus {
+    std::shared_ptr<const rgw_sync_policy_info> sync_policy;
+    ceph::real_time cur_expiration;
+    ceph::real_time cur_sent;
+    bool pending = false;
+    RefCountedCond* cond = nullptr;
+    ceph::mutex lock = ceph::make_mutex("RGWDataChangesLog::ChangeStatus");
+  };
+
+  using ChangeStatusPtr = std::shared_ptr<ChangeStatus>;
+
+  lru_map<BucketGen, ChangeStatusPtr> changes;
+
+  // Entries added during the current renew window.
+  bc::flat_set<BucketGen> cur_cycle;
+
+  ChangeStatusPtr _get_change(const rgw_bucket_shard& bs, uint64_t gen);
+  void register_renew(const rgw_bucket_shard& bs,
+                     const rgw::bucket_log_layout_generation& gen);
+  void update_renewed(const rgw_bucket_shard& bs,
+                     uint64_t gen,
+                     ceph::real_time expiration);
+
+  ceph::mutex renew_lock = ceph::make_mutex("ChangesRenewThread::lock");
+  ceph::condition_variable renew_cond;
+  void renew_run() noexcept;
+  void renew_stop();
+  std::thread renew_thread;
+
+  std::function<bool(const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp)> bucket_filter;
+  bool going_down() const;
+  bool filter_bucket(const DoutPrefixProvider *dpp, const rgw_bucket& bucket, optional_yield y) const;
+  int renew_entries(const DoutPrefixProvider *dpp);
+
+public:
+
+  RGWDataChangesLog(CephContext* cct);
+  ~RGWDataChangesLog();
+
+  int start(const DoutPrefixProvider *dpp, const RGWZone* _zone, const RGWZoneParams& zoneparams,
+           librados::Rados* lr);
+  int choose_oid(const rgw_bucket_shard& bs);
+  int add_entry(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+               const rgw::bucket_log_layout_generation& gen, int shard_id);
+  int get_log_shard_id(rgw_bucket& bucket, int shard_id);
+  int list_entries(const DoutPrefixProvider *dpp, int shard, int max_entries,
+                  std::vector<rgw_data_change_log_entry>& entries,
+                  std::string_view marker,
+                  std::string* out_marker, bool* truncated);
+  int trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker);
+  int trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
+                  librados::AioCompletion* c); // :(
+  int get_info(const DoutPrefixProvider *dpp, int shard_id, RGWDataChangesLogInfo *info);
+
+  using LogMarker = RGWDataChangesLogMarker;
+
+  int list_entries(const DoutPrefixProvider *dpp, int max_entries,
+                  std::vector<rgw_data_change_log_entry>& entries,
+                  LogMarker& marker, bool* ptruncated);
+
+  void mark_modified(int shard_id, const rgw_bucket_shard& bs, uint64_t gen);
+  // Atomically take and reset the modified-shards set.
+  // (The clear() is redundant: swap already left modified_shards empty.)
+  auto read_clear_modified() {
+    std::unique_lock wl{modified_lock};
+    decltype(modified_shards) modified;
+    modified.swap(modified_shards);
+    modified_shards.clear();
+    return modified;
+  }
+
+  void set_observer(rgw::BucketChangeObserver *observer) {
+    this->observer = observer;
+  }
+
+  void set_bucket_filter(decltype(bucket_filter)&& f) {
+    bucket_filter = std::move(f);
+  }
+  // a marker that compares greater than any other
+  std::string max_marker() const;
+  std::string get_oid(uint64_t gen_id, int shard_id) const;
+
+
+  int change_format(const DoutPrefixProvider *dpp, log_type type, optional_yield y);
+  int trim_generations(const DoutPrefixProvider *dpp, std::optional<uint64_t>& through);
+};
+
+// Abstract per-generation log backend (e.g. cls_log omap vs FIFO).
+// Intrusively ref-counted so DataLogBackends and async completions can
+// share ownership.
+class RGWDataChangesBE : public boost::intrusive_ref_counter<RGWDataChangesBE> {
+protected:
+  librados::IoCtx& ioctx;
+  CephContext* const cct;
+  RGWDataChangesLog& datalog;
+
+  // Object name for one shard of this generation.
+  std::string get_oid(int shard_id) {
+    return datalog.get_oid(gen_id, shard_id);
+  }
+public:
+  // Backend-specific batch representation: cls_log entries or raw buffers.
+  using entries = std::variant<std::list<cls_log_entry>,
+                              std::vector<ceph::buffer::list>>;
+
+  const uint64_t gen_id;
+
+  RGWDataChangesBE(librados::IoCtx& ioctx,
+                  RGWDataChangesLog& datalog,
+                  uint64_t gen_id)
+    : ioctx(ioctx), cct(static_cast<CephContext*>(ioctx.cct())),
+      datalog(datalog), gen_id(gen_id) {}
+  virtual ~RGWDataChangesBE() = default;
+
+  virtual void prepare(ceph::real_time now,
+                      const std::string& key,
+                      ceph::buffer::list&& entry,
+                      entries& out) = 0;
+  virtual int push(const DoutPrefixProvider *dpp, int index, entries&& items) = 0;
+  virtual int push(const DoutPrefixProvider *dpp, int index, ceph::real_time now,
+                  const std::string& key,
+                  ceph::buffer::list&& bl) = 0;
+  virtual int list(const DoutPrefixProvider *dpp, int shard, int max_entries,
+                  std::vector<rgw_data_change_log_entry>& entries,
+                  std::optional<std::string_view> marker,
+                  std::string* out_marker, bool* truncated) = 0;
+  virtual int get_info(const DoutPrefixProvider *dpp, int index, RGWDataChangesLogInfo *info) = 0;
+  virtual int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker) = 0;
+  virtual int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+                  librados::AioCompletion* c) = 0;
+  virtual std::string_view max_marker() const = 0;
+  // 1 on empty, 0 on non-empty, negative on error.
+  virtual int is_empty(const DoutPrefixProvider *dpp) = 0;
+};
+
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_datalog_notify.cc b/src/rgw/driver/rados/rgw_datalog_notify.cc
new file mode 100644 (file)
index 0000000..12cdc53
--- /dev/null
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_datalog_notify.h"
+#include "rgw_datalog.h"
+
+// custom encoding for v1 notify API
+// Wrapper selecting the legacy v1 wire form (key-only, no generation).
+struct EntryEncoderV1 {
+  const rgw_data_notify_entry& entry;
+};
+// Wrapper selecting the legacy v1 wire form for a whole entry set.
+struct SetEncoderV1 {
+  const bc::flat_set<rgw_data_notify_entry>& entries;
+};
+
+// encode rgw_data_notify_entry as string
+// v1 drops the generation: the entry serializes as its bare key string.
+void encode_json(const char *name, const EntryEncoderV1& e, Formatter *f)
+{
+  f->dump_string(name, e.entry.key); // encode the key only
+}
+// encode set<rgw_data_notify_entry> as set<string>
+// Serialize the set as a JSON array of key strings (v1 form).
+void encode_json(const char *name, const SetEncoderV1& e, Formatter *f)
+{
+  f->open_array_section(name);
+  for (auto& entry : e.entries) {
+    encode_json("obj", EntryEncoderV1{entry}, f);
+  }
+  f->close_section();
+}
+// encode map<int, set<rgw_data_notify_entry>> as map<int, set<string>>
+// Serialize the shard map as an array of {key: shard_id, val: [keys...]}
+// objects — the v1 notify payload shape.
+void encode_json(const char *name, const rgw_data_notify_v1_encoder& e, Formatter *f)
+{
+  f->open_array_section(name);
+  for (auto& [key, val] : e.shards) {
+    f->open_object_section("entry");
+    encode_json("key", key, f);
+    encode_json("val", SetEncoderV1{val}, f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Decode-side counterparts of the v1 wrappers above.
+struct EntryDecoderV1 {
+  rgw_data_notify_entry& entry;
+};
+struct SetDecoderV1 {
+  bc::flat_set<rgw_data_notify_entry>& entries;
+};
+
+// decode string into rgw_data_notify_entry
+// v1 carries only the key; the generation defaults to 0.
+void decode_json_obj(EntryDecoderV1& d, JSONObj *obj)
+{
+  decode_json_obj(d.entry.key, obj);
+  d.entry.gen = 0;
+}
+// decode set<string> into set<rgw_data_notify_entry>
+// Parse a v1 array of key strings into a set of notify entries.
+void decode_json_obj(SetDecoderV1& d, JSONObj *obj)
+{
+  for (JSONObjIter o = obj->find_first(); !o.end(); ++o) {
+    rgw_data_notify_entry val;
+    auto decoder = EntryDecoderV1{val};
+    decode_json_obj(decoder, *o);
+    d.entries.insert(std::move(val));
+  }
+}
+// decode map<int, set<string>> into map<int, set<rgw_data_notify_entry>>
+// Parse the v1 shard array ({key, val} objects) into the shard map.
+void decode_json_obj(rgw_data_notify_v1_decoder& d, JSONObj *obj)
+{
+  for (JSONObjIter o = obj->find_first(); !o.end(); ++o) {
+    int shard_id = 0;
+    JSONDecoder::decode_json("key", shard_id, *o);
+    bc::flat_set<rgw_data_notify_entry> val;
+    SetDecoderV1 decoder{val};
+    JSONDecoder::decode_json("val", decoder, *o);
+    d.shards[shard_id] = std::move(val);
+  }
+}
diff --git a/src/rgw/driver/rados/rgw_datalog_notify.h b/src/rgw/driver/rados/rgw_datalog_notify.h
new file mode 100644 (file)
index 0000000..4cd1b3c
--- /dev/null
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "rgw_datalog.h"
+
+namespace bc = boost::container;
+
+namespace ceph { class Formatter; }
+class JSONObj;
+
+class RGWCoroutine;
+class RGWHTTPManager;
+class RGWRESTConn;
+
+struct rgw_data_notify_entry;
+
+// json encoder and decoder for notify v1 API
+struct rgw_data_notify_v1_encoder {
+  const bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>& shards;
+};
+void encode_json(const char *name, const rgw_data_notify_v1_encoder& e,
+                 ceph::Formatter *f);
+struct rgw_data_notify_v1_decoder {
+  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>& shards;
+};
+void decode_json_obj(rgw_data_notify_v1_decoder& d, JSONObj *obj);
diff --git a/src/rgw/driver/rados/rgw_etag_verifier.cc b/src/rgw/driver/rados/rgw_etag_verifier.cc
new file mode 100644 (file)
index 0000000..52f7c79
--- /dev/null
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_etag_verifier.h"
+#include "rgw_obj_manifest.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::putobj {
+
+// Build the appropriate ETag verifier for an object from its manifest:
+// atomic objects get ETagVerifier_Atomic; MPU objects get ETagVerifier_MPU
+// seeded with each part's starting offset (mapped back to uncompressed
+// offsets when the source was compressed). Returns -EIO if the manifest
+// cannot be decoded or part offsets cannot be reconstructed.
+int create_etag_verifier(const DoutPrefixProvider *dpp, 
+                         CephContext* cct, rgw::sal::DataProcessor* filter,
+                         const bufferlist& manifest_bl,
+                         const std::optional<RGWCompressionInfo>& compression,
+                         etag_verifier_ptr& verifier)
+{
+  RGWObjManifest manifest;
+
+  try {
+    auto miter = manifest_bl.cbegin();
+    decode(manifest, miter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
+    return -EIO;
+  }
+
+  RGWObjManifestRule rule;
+  bool found = manifest.get_rule(0, &rule);
+  if (!found) {
+    ldpp_dout(dpp, -1) << "ERROR: manifest->get_rule() could not find rule" << dendl;
+    return -EIO;
+  }
+
+  if (rule.start_part_num == 0) {
+    /* Atomic object */
+    verifier.emplace<ETagVerifier_Atomic>(cct, filter);
+    return 0;
+  }
+
+  uint64_t cur_part_ofs = UINT64_MAX;
+  std::vector<uint64_t> part_ofs;
+
+  /*
+   * We must store the offset of each part to calculate the ETAGs for each
+   * MPU part. These part ETags then become the input for the MPU object
+   * Etag.
+   */
+  for (auto mi = manifest.obj_begin(dpp); mi != manifest.obj_end(dpp); ++mi) {
+    // Record each distinct part offset once (manifest iterates per stripe).
+    if (cur_part_ofs == mi.get_part_ofs())
+      continue;
+    cur_part_ofs = mi.get_part_ofs();
+    ldpp_dout(dpp, 20) << "MPU Part offset:" << cur_part_ofs << dendl;
+    part_ofs.push_back(cur_part_ofs);
+  }
+
+  if (compression) {
+    // if the source object was compressed, the manifest is storing
+    // compressed part offsets. transform the compressed offsets back to
+    // their original offsets by finding the first block of each part
+    const auto& blocks = compression->blocks;
+    auto block = blocks.begin();
+    for (auto& ofs : part_ofs) {
+      // find the compression_block with new_ofs == ofs
+      constexpr auto less = [] (const compression_block& block, uint64_t ofs) {
+        return block.new_ofs < ofs;
+      };
+      block = std::lower_bound(block, blocks.end(), ofs, less);
+      if (block == blocks.end() || block->new_ofs != ofs) {
+        // NOTE(review): the log says "disabling" but -EIO is returned —
+        // confirm callers treat this as soft-disable rather than failure.
+        ldpp_dout(dpp, 4) << "no match for compressed offset " << ofs
+            << ", disabling etag verification" << dendl;
+        return -EIO;
+      }
+      ofs = block->old_ofs;
+      ldpp_dout(dpp, 20) << "MPU Part uncompressed offset:" << ofs << dendl;
+    }
+  }
+
+  verifier.emplace<ETagVerifier_MPU>(cct, std::move(part_ofs), filter);
+  return 0;
+}
+
+// Fold each incoming buffer into the running MD5, then pass the data on
+// unchanged to the next processor in the pipe.
+int ETagVerifier_Atomic::process(bufferlist&& in, uint64_t logical_offset)
+{
+  bufferlist out;
+  if (in.length() > 0)
+    hash.Update((const unsigned char *)in.c_str(), in.length());
+
+  return Pipe::process(std::move(in), logical_offset);
+}
+
+// Finalize the MD5 and cache its hex form as the object's ETag.
+// Idempotent: subsequent calls are no-ops.
+void ETagVerifier_Atomic::calculate_etag()
+{
+  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+
+  /* Return early if ETag has already been calculated */
+  if (!calculated_etag.empty())
+    return;
+
+  hash.Final(m);
+  buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
+  calculated_etag = calc_md5;
+  ldout(cct, 20) << "Single part object: " << " etag:" << calculated_etag
+          << dendl;
+}
+
+// Close out the current part: fold its raw MD5 digest into the MPU-level
+// hash, reset the per-part hash, and advance the part indices.
+void ETagVerifier_MPU::process_end_of_MPU_part()
+{
+  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char calc_md5_part[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+  std::string calculated_etag_part;
+
+  hash.Final(m);
+  mpu_etag_hash.Update((const unsigned char *)m, sizeof(m));
+  hash.Restart();
+
+  // Hex-format the part digest only if debug logging will actually emit it.
+  if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
+    buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5_part);
+    calculated_etag_part = calc_md5_part;
+    ldout(cct, 20) << "Part etag: " << calculated_etag_part << dendl;
+  }
+
+  cur_part_index++;
+  next_part_index++;
+}
+
+int ETagVerifier_MPU::process(bufferlist&& in, uint64_t logical_offset)
+{
+  uint64_t bl_end = in.length() + logical_offset;
+
+  /* Handle the last MPU part */
+  if (size_t(next_part_index) == part_ofs.size()) {
+    hash.Update((const unsigned char *)in.c_str(), in.length());
+    goto done;
+  }
+
+  /* Incoming bufferlist spans two MPU parts. Calculate separate ETags */
+  if (bl_end > part_ofs[next_part_index]) {
+
+    uint64_t part_one_len = part_ofs[next_part_index] - logical_offset;
+    hash.Update((const unsigned char *)in.c_str(), part_one_len);
+    process_end_of_MPU_part();
+
+    hash.Update((const unsigned char *)in.c_str() + part_one_len,
+      bl_end - part_ofs[cur_part_index]);
+    /*
+     * If we've moved to the last part of the MPU, avoid usage of
+     * parts_ofs[next_part_index] as it will lead to our-of-range access.
+     */
+    if (size_t(next_part_index) == part_ofs.size())
+      goto done;
+  } else {
+    hash.Update((const unsigned char *)in.c_str(), in.length());
+  }
+
+  /* Update the MPU Etag if the current part has ended */
+  if (logical_offset + in.length() + 1 == part_ofs[next_part_index])
+    process_end_of_MPU_part();
+
+done:
+  return Pipe::process(std::move(in), logical_offset);
+}
+
+// Finalize the MPU ETag: fold the last part's digest into the MPU hash,
+// hex-format the combined digest, and append "-<part count>" per the S3
+// multipart ETag convention. Idempotent.
+void ETagVerifier_MPU::calculate_etag()
+{
+  const uint32_t parts = part_ofs.size();
+  constexpr auto digits10 = std::numeric_limits<uint32_t>::digits10;
+  constexpr auto extra = 2 + digits10; // add "-%u\0" at the end
+
+  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE], mpu_m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + extra];
+
+  /* Return early if ETag has already been calculated */
+  if (!calculated_etag.empty())
+    return;
+
+  hash.Final(m);
+  mpu_etag_hash.Update((const unsigned char *)m, sizeof(m));
+
+  /* Refer RGWCompleteMultipart::execute() for ETag calculation for MPU object */
+  mpu_etag_hash.Final(mpu_m);
+  buf_to_hex(mpu_m, CEPH_CRYPTO_MD5_DIGESTSIZE, final_etag_str);
+  snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
+           sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+           "-%u", parts);
+
+  calculated_etag = final_etag_str;
+  ldout(cct, 20) << "MPU calculated ETag:" << calculated_etag << dendl;
+}
+
+} // namespace rgw::putobj
diff --git a/src/rgw/driver/rados/rgw_etag_verifier.h b/src/rgw/driver/rados/rgw_etag_verifier.h
new file mode 100644 (file)
index 0000000..56a679e
--- /dev/null
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * RGW Etag Verifier is an RGW filter which enables the objects copied using
+ * multisite sync to be verified using their ETag from source i.e. the MD5
+ * checksum of the object is computed at the destination and is verified to be
+ * identical to the ETag stored in the object HEAD at source cluster.
+ * 
+ * For MPU objects, a different filter named RGWMultipartEtagFilter is applied
+ * which re-computes ETag using RGWObjManifest. This computes the ETag using the
+ * same algorithm used at the source cluster i.e. MD5 sum of the individual ETag
+ * on the MPU parts.
+ */
+#ifndef CEPH_RGW_ETAG_VERIFIER_H
+#define CEPH_RGW_ETAG_VERIFIER_H
+
+#include "rgw_putobj.h"
+#include "rgw_op.h"
+#include "common/static_ptr.h"
+
+namespace rgw::putobj {
+
+class ETagVerifier : public rgw::putobj::Pipe
+{
+protected:
+  CephContext* cct;
+  MD5 hash;
+  std::string calculated_etag;
+
+public:
+  ETagVerifier(CephContext* cct_, rgw::sal::DataProcessor *next)
+    : Pipe(next), cct(cct_) {
+      // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+      hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+    }
+
+  virtual void calculate_etag() = 0;
+  std::string get_calculated_etag() { return calculated_etag;}
+
+}; /* ETagVerifier */
+
+class ETagVerifier_Atomic : public ETagVerifier
+{
+public:
+  ETagVerifier_Atomic(CephContext* cct_, rgw::sal::DataProcessor *next)
+    : ETagVerifier(cct_, next) {}
+
+  int process(bufferlist&& data, uint64_t logical_offset) override;
+  void calculate_etag() override;
+
+}; /* ETagVerifier_Atomic */
+
+class ETagVerifier_MPU : public ETagVerifier
+{
+  std::vector<uint64_t> part_ofs;
+  uint64_t cur_part_index{0}, next_part_index{1};
+  MD5 mpu_etag_hash;
+  void process_end_of_MPU_part();
+
+public:
+  ETagVerifier_MPU(CephContext* cct,
+                             std::vector<uint64_t> part_ofs,
+                             rgw::sal::DataProcessor *next)
+    : ETagVerifier(cct, next),
+      part_ofs(std::move(part_ofs))
+  {
+    // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+    hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  }
+
+  int process(bufferlist&& data, uint64_t logical_offset) override;
+  void calculate_etag() override;
+
+}; /* ETagVerifier_MPU */
+
+constexpr auto max_etag_verifier_size = std::max(
+    sizeof(ETagVerifier_Atomic),
+    sizeof(ETagVerifier_MPU)
+  );
+using etag_verifier_ptr = ceph::static_ptr<ETagVerifier, max_etag_verifier_size>;
+
+int create_etag_verifier(const DoutPrefixProvider *dpp, 
+                         CephContext* cct, rgw::sal::DataProcessor* next,
+                         const bufferlist& manifest_bl,
+                         const std::optional<RGWCompressionInfo>& compression,
+                         etag_verifier_ptr& verifier);
+
+} // namespace rgw::putobj
+
+#endif /* CEPH_RGW_ETAG_VERIFIER_H */
diff --git a/src/rgw/driver/rados/rgw_gc.cc b/src/rgw/driver/rados/rgw_gc.cc
new file mode 100644 (file)
index 0000000..bd16bde
--- /dev/null
@@ -0,0 +1,811 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_gc.h"
+
+#include "rgw_tools.h"
+#include "include/scope_guard.h"
+#include "include/rados/librados.hpp"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw_gc/cls_rgw_gc_client.h"
+#include "cls/refcount/cls_refcount_client.h"
+#include "cls/version/cls_version_client.h"
+#include "rgw_perf_counters.h"
+#include "cls/lock/cls_lock_client.h"
+#include "include/random.h"
+#include "rgw_gc_log.h"
+
+#include <list> // XXX
+#include <sstream>
+#include "xxhash.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace librados;
+
+static string gc_oid_prefix = "gc";
+static string gc_index_lock_name = "gc_process";
+
+void RGWGC::initialize(CephContext *_cct, RGWRados *_store) {
+  cct = _cct;
+  store = _store;
+
+  max_objs = min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max());
+
+  obj_names = new string[max_objs];
+
+  for (int i = 0; i < max_objs; i++) {
+    obj_names[i] = gc_oid_prefix;
+    char buf[32];
+    snprintf(buf, 32, ".%d", i);
+    obj_names[i].append(buf);
+
+    auto it = transitioned_objects_cache.begin() + i;
+    transitioned_objects_cache.insert(it, false);
+
+    //version = 0 -> not ready for transition
+    //version = 1 -> marked ready for transition
+    librados::ObjectWriteOperation op;
+    op.create(false);
+    const uint64_t queue_size = cct->_conf->rgw_gc_max_queue_size, num_deferred_entries = cct->_conf->rgw_gc_max_deferred;
+    gc_log_init2(op, queue_size, num_deferred_entries);
+    store->gc_operate(this, obj_names[i], &op);
+  }
+}
+
+void RGWGC::finalize()
+{
+  delete[] obj_names;
+}
+
+int RGWGC::tag_index(const string& tag)
+{
+  return rgw_shards_mod(XXH64(tag.c_str(), tag.size(), seed), max_objs);
+}
+
+std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWGC::send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag)
+{
+  ldpp_dout(this, 20) << "RGWGC::send_split_chain - tag is: " << tag << dendl;
+
+  if (cct->_conf->rgw_max_chunk_size) {
+    cls_rgw_obj_chain broken_chain;
+    ldpp_dout(this, 20) << "RGWGC::send_split_chain - rgw_max_chunk_size is: " << cct->_conf->rgw_max_chunk_size << dendl;
+
+    for (auto it = chain.objs.begin(); it != chain.objs.end(); it++) {
+      ldpp_dout(this, 20) << "RGWGC::send_split_chain - adding obj with name: " << it->key << dendl;
+      broken_chain.objs.emplace_back(*it);
+      cls_rgw_gc_obj_info info;
+      info.tag = tag;
+      info.chain = broken_chain;
+      cls_rgw_gc_set_entry_op op;
+      op.info = info;
+      size_t total_encoded_size = op.estimate_encoded_size();
+      ldpp_dout(this, 20) << "RGWGC::send_split_chain - total_encoded_size is: " << total_encoded_size << dendl;
+
+      if (total_encoded_size > cct->_conf->rgw_max_chunk_size) { // don't add to chain; send to gc
+        broken_chain.objs.pop_back();
+        --it;
+        ldpp_dout(this, 20) << "RGWGC::send_split_chain - more than, dont add to broken chain and send chain" << dendl;
+        auto ret = send_chain(broken_chain, tag);
+        if (ret < 0) {
+          broken_chain.objs.insert(broken_chain.objs.end(), it, chain.objs.end()); // add all the remainder objs to the list to be deleted inline
+          ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+          return {ret, {broken_chain}};
+        }
+        broken_chain.objs.clear();
+      }
+    }
+    if (!broken_chain.objs.empty()) { //when the chain is smaller than or equal to rgw_max_chunk_size
+      ldpp_dout(this, 20) << "RGWGC::send_split_chain - sending leftover objects" << dendl;
+      auto ret = send_chain(broken_chain, tag);
+      if (ret < 0) {
+        ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+        return {ret, {broken_chain}};
+      }
+    }
+  } else {
+    auto ret = send_chain(chain, tag);
+    if (ret < 0) {
+      ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+      return {ret, {std::move(chain)}};
+    }
+  }
+  return {0, {}};
+}
+
+int RGWGC::send_chain(const cls_rgw_obj_chain& chain, const string& tag)
+{
+  ObjectWriteOperation op;
+  cls_rgw_gc_obj_info info;
+  info.chain = chain;
+  info.tag = tag;
+  gc_log_enqueue2(op, cct->_conf->rgw_gc_obj_min_wait, info);
+
+  int i = tag_index(tag);
+
+  ldpp_dout(this, 20) << "RGWGC::send_chain - on object name: " << obj_names[i] << "tag is: " << tag << dendl;
+
+  auto ret = store->gc_operate(this, obj_names[i], &op);
+  if (ret != -ECANCELED && ret != -EPERM) {
+    return ret;
+  }
+  ObjectWriteOperation set_entry_op;
+  cls_rgw_gc_set_entry(set_entry_op, cct->_conf->rgw_gc_obj_min_wait, info);
+  return store->gc_operate(this, obj_names[i], &set_entry_op);
+}
+
+struct defer_chain_state {
+  librados::AioCompletion* completion = nullptr;
+  // TODO: hold a reference on the state in RGWGC to avoid use-after-free if
+  // RGWGC destructs before this completion fires
+  RGWGC* gc = nullptr;
+  cls_rgw_gc_obj_info info;
+
+  ~defer_chain_state() {
+    if (completion) {
+      completion->release();
+    }
+  }
+};
+
+static void async_defer_callback(librados::completion_t, void* arg)
+{
+  std::unique_ptr<defer_chain_state> state{static_cast<defer_chain_state*>(arg)};
+  if (state->completion->get_return_value() == -ECANCELED) {
+    state->gc->on_defer_canceled(state->info);
+  }
+}
+
+void RGWGC::on_defer_canceled(const cls_rgw_gc_obj_info& info)
+{
+  const std::string& tag = info.tag;
+  const int i = tag_index(tag);
+
+  // ECANCELED from cls_version_check() tells us that we've transitioned
+  transitioned_objects_cache[i] = true;
+
+  ObjectWriteOperation op;
+  cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info);
+  cls_rgw_gc_remove(op, {tag});
+
+  auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+  store->gc_aio_operate(obj_names[i], c, &op);
+  c->release();
+}
+
+int RGWGC::async_defer_chain(const string& tag, const cls_rgw_obj_chain& chain)
+{
+  const int i = tag_index(tag);
+  cls_rgw_gc_obj_info info;
+  info.chain = chain;
+  info.tag = tag;
+
+  // if we've transitioned this shard object, we can rely on the cls_rgw_gc queue
+  if (transitioned_objects_cache[i]) {
+    ObjectWriteOperation op;
+    cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info);
+
+    // this tag may still be present in omap, so remove it once the cls_rgw_gc
+    // enqueue succeeds
+    cls_rgw_gc_remove(op, {tag});
+
+    auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+    int ret = store->gc_aio_operate(obj_names[i], c, &op);
+    c->release();
+    return ret;
+  }
+
+  // if we haven't seen the transition yet, write the defer to omap with cls_rgw
+  ObjectWriteOperation op;
+
+  // assert that we haven't initialized cls_rgw_gc queue. this prevents us
+  // from writing new entries to omap after the transition
+  gc_log_defer1(op, cct->_conf->rgw_gc_obj_min_wait, info);
+
+  // prepare a callback to detect the transition via ECANCELED from cls_version_check()
+  auto state = std::make_unique<defer_chain_state>();
+  state->gc = this;
+  state->info.chain = chain;
+  state->info.tag = tag;
+  state->completion = librados::Rados::aio_create_completion(
+      state.get(), async_defer_callback);
+
+  int ret = store->gc_aio_operate(obj_names[i], state->completion, &op);
+  if (ret == 0) {
+    state.release(); // release ownership until async_defer_callback()
+  }
+  return ret;
+}
+
+int RGWGC::remove(int index, const std::vector<string>& tags, AioCompletion **pc)
+{
+  ObjectWriteOperation op;
+  cls_rgw_gc_remove(op, tags);
+
+  auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+  int ret = store->gc_aio_operate(obj_names[index], c, &op);
+  if (ret < 0) {
+    c->release();
+  } else {
+    *pc = c;
+  }
+  return ret;
+}
+
+int RGWGC::remove(int index, int num_entries)
+{
+  ObjectWriteOperation op;
+  cls_rgw_gc_queue_remove_entries(op, num_entries);
+
+  return store->gc_operate(this, obj_names[index], &op);
+}
+
+int RGWGC::list(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
+{
+  result.clear();
+  string next_marker;
+  bool check_queue = false;
+
+  for (; *index < max_objs && result.size() < max; (*index)++, marker.clear(), check_queue = false) {
+    std::list<cls_rgw_gc_obj_info> entries, queue_entries;
+    int ret = 0;
+
+    //processing_queue is set to true from previous iteration if the queue was under process and probably has more elements in it.
+    if (! transitioned_objects_cache[*index] && ! check_queue && ! processing_queue) {
+      ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, max - result.size(), expired_only, entries, truncated, next_marker);
+      if (ret != -ENOENT && ret < 0) {
+        return ret;
+      }
+      obj_version objv;
+      cls_version_read(store->gc_pool_ctx, obj_names[*index], &objv);
+      if (ret == -ENOENT || entries.size() == 0) {
+        if (objv.ver == 0) {
+          continue;
+        } else {
+          if (! expired_only) {
+            transitioned_objects_cache[*index] = true;
+            marker.clear();
+          } else {
+            std::list<cls_rgw_gc_obj_info> non_expired_entries;
+            ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, 1, false, non_expired_entries, truncated, next_marker);
+            if (non_expired_entries.size() == 0) {
+              transitioned_objects_cache[*index] = true;
+              marker.clear();
+            }
+          }
+        }
+      }
+      if ((objv.ver == 1) && (entries.size() < max - result.size())) {
+        check_queue = true;
+        marker.clear();
+      }
+    }
+    if (transitioned_objects_cache[*index] || check_queue || processing_queue) {
+      processing_queue = false;
+      ret = cls_rgw_gc_queue_list_entries(store->gc_pool_ctx, obj_names[*index], marker, (max - result.size()) - entries.size(), expired_only, queue_entries, truncated, next_marker);
+      if (ret < 0) {
+        return ret;
+      }
+    }
+    if (entries.size() == 0 && queue_entries.size() == 0)
+      continue;
+
+    std::list<cls_rgw_gc_obj_info>::iterator iter;
+    for (iter = entries.begin(); iter != entries.end(); ++iter) {
+      result.push_back(*iter);
+    }
+
+    for (iter = queue_entries.begin(); iter != queue_entries.end(); ++iter) {
+      result.push_back(*iter);
+    }
+
+    marker = next_marker;
+
+    if (*index == max_objs - 1) {
+      if (queue_entries.size() > 0 && *truncated) {
+        processing_queue = true;
+      } else {
+        processing_queue = false;
+      }
+      /* we cut short here, truncated will hold the correct value */
+      return 0;
+    }
+
+    if (result.size() == max) {
+      if (queue_entries.size() > 0 && *truncated) {
+        processing_queue = true;
+      } else {
+        processing_queue = false;
+        *index += 1; //move to next gc object
+      }
+
+      /* close approximation: it might be that the next objects don't hold
+       * anything, in which case truncated should have been false, but we can find
+       * that out on the next iteration
+       */
+      *truncated = true;
+      return 0;
+    }
+  }
+  *truncated = false;
+  processing_queue = false;
+
+  return 0;
+}
+
+class RGWGCIOManager {
+  const DoutPrefixProvider* dpp;
+  CephContext *cct;
+  RGWGC *gc;
+
+  struct IO {
+    enum Type {
+      UnknownIO = 0,
+      TailIO = 1,
+      IndexIO = 2,
+    } type{UnknownIO};
+    librados::AioCompletion *c{nullptr};
+    string oid;
+    int index{-1};
+    string tag;
+  };
+
+  deque<IO> ios;
+  vector<std::vector<string> > remove_tags;
+  /* tracks the number of remaining shadow objects for a given tag in order to
+   * only remove the tag once all shadow objects have themselves been removed
+   */
+  vector<map<string, size_t> > tag_io_size;
+
+#define MAX_AIO_DEFAULT 10
+  size_t max_aio{MAX_AIO_DEFAULT};
+
+public:
+  RGWGCIOManager(const DoutPrefixProvider* _dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp),
+                                                                                  cct(_cct),
+                                                                                  gc(_gc) {
+    max_aio = cct->_conf->rgw_gc_max_concurrent_io;
+    remove_tags.resize(min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max()));
+    tag_io_size.resize(min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max()));
+  }
+
+  ~RGWGCIOManager() {
+    for (auto io : ios) {
+      io.c->release();
+    }
+  }
+
+  int schedule_io(IoCtx *ioctx, const string& oid, ObjectWriteOperation *op,
+                 int index, const string& tag) {
+    while (ios.size() > max_aio) {
+      if (gc->going_down()) {
+        return 0;
+      }
+      auto ret = handle_next_completion();
+      //Return error if we are using queue, else ignore it
+      if (gc->transitioned_objects_cache[index] && ret < 0) {
+        return ret;
+      }
+    }
+
+    auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+    int ret = ioctx->aio_operate(oid, c, op);
+    if (ret < 0) {
+      return ret;
+    }
+    ios.push_back(IO{IO::TailIO, c, oid, index, tag});
+
+    return 0;
+  }
+
+  int handle_next_completion() {
+    ceph_assert(!ios.empty());
+    IO& io = ios.front();
+    io.c->wait_for_complete();
+    int ret = io.c->get_return_value();
+    io.c->release();
+
+    if (ret == -ENOENT) {
+      ret = 0;
+    }
+
+    if (io.type == IO::IndexIO && ! gc->transitioned_objects_cache[io.index]) {
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "WARNING: gc cleanup of tags on gc shard index=" <<
+         io.index << " returned error, ret=" << ret << dendl;
+      }
+      goto done;
+    }
+
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "WARNING: gc could not remove oid=" << io.oid <<
+       ", ret=" << ret << dendl;
+      goto done;
+    }
+
+    if (! gc->transitioned_objects_cache[io.index]) {
+      schedule_tag_removal(io.index, io.tag);
+    }
+
+  done:
+    ios.pop_front();
+    return ret;
+  }
+
+  /* This is a request to schedule a tag removal. It will be called once when
+   * there are no shadow objects. But it will also be called for every shadow
+   * object when there are any. Since we do not want the tag to be removed
+   * until all shadow objects have been successfully removed, the scheduling
+   * will not happen until the shadow object count goes down to zero
+   */
+  void schedule_tag_removal(int index, string tag) {
+    auto& ts = tag_io_size[index];
+    auto ts_it = ts.find(tag);
+    if (ts_it != ts.end()) {
+      auto& size = ts_it->second;
+      --size;
+      // wait for all shadow object deletions to complete before removing the tag
+      if (size != 0)
+        return;
+
+      ts.erase(ts_it);
+    }
+
+    auto& rt = remove_tags[index];
+
+    rt.push_back(tag);
+    if (rt.size() >= (size_t)cct->_conf->rgw_gc_max_trim_chunk) {
+      flush_remove_tags(index, rt);
+    }
+  }
+
+  void add_tag_io_size(int index, string tag, size_t size) {
+    auto& ts = tag_io_size[index];
+    ts.emplace(tag, size);
+  }
+
+  int drain_ios() {
+    int ret_val = 0;
+    while (!ios.empty()) {
+      if (gc->going_down()) {
+        return -EAGAIN;
+      }
+      auto ret = handle_next_completion();
+      if (ret < 0) {
+        ret_val = ret;
+      }
+    }
+    return ret_val;
+  }
+
+  void drain() {
+    drain_ios();
+    flush_remove_tags();
+    /* the tags draining might have generated more ios, drain those too */
+    drain_ios();
+  }
+
+  void flush_remove_tags(int index, vector<string>& rt) {
+    IO index_io;
+    index_io.type = IO::IndexIO;
+    index_io.index = index;
+
+    ldpp_dout(dpp, 20) << __func__ <<
+      " removing entries from gc log shard index=" << index << ", size=" <<
+      rt.size() << ", entries=" << rt << dendl;
+
+    auto rt_guard = make_scope_guard(
+      [&]
+       {
+         rt.clear();
+       }
+      );
+
+    int ret = gc->remove(index, rt, &index_io.c);
+    if (ret < 0) {
+      /* we already cleared list of tags, this prevents us from
+       * ballooning in case of a persistent problem
+       */
+      ldpp_dout(dpp, 0) << "WARNING: failed to remove tags on gc shard index=" <<
+       index << " ret=" << ret << dendl;
+      return;
+    }
+    if (perfcounter) {
+      /* log the count of tags retired for rate estimation */
+      perfcounter->inc(l_rgw_gc_retire, rt.size());
+    }
+    ios.push_back(index_io);
+  }
+
+  void flush_remove_tags() {
+    int index = 0;
+    for (auto& rt : remove_tags) {
+      if (! gc->transitioned_objects_cache[index]) {
+        flush_remove_tags(index, rt);
+      }
+      ++index;
+    }
+  }
+
+  int remove_queue_entries(int index, int num_entries) {
+    int ret = gc->remove(index, num_entries);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to remove queue entries on index=" <<
+           index << " ret=" << ret << dendl;
+      return ret;
+    }
+    if (perfcounter) {
+      /* log the count of tags retired for rate estimation */
+      perfcounter->inc(l_rgw_gc_retire, num_entries);
+    }
+    return 0;
+  }
+}; // class RGWGCIOManager
+
+int RGWGC::process(int index, int max_secs, bool expired_only,
+                   RGWGCIOManager& io_manager)
+{
+  ldpp_dout(this, 20) << "RGWGC::process entered with GC index_shard=" <<
+    index << ", max_secs=" << max_secs << ", expired_only=" <<
+    expired_only << dendl;
+
+  rados::cls::lock::Lock l(gc_index_lock_name);
+  utime_t end = ceph_clock_now();
+
+  /* max_secs should be greater than zero. We don't want a zero max_secs
+   * to be translated as no timeout, since we'd then need to break the
+   * lock and that would require a manual intervention. In this case
+   * we can just wait it out. */
+  if (max_secs <= 0)
+    return -EAGAIN;
+
+  end += max_secs;
+  utime_t time(max_secs, 0);
+  l.set_duration(time);
+
+  int ret = l.lock_exclusive(&store->gc_pool_ctx, obj_names[index]);
+  if (ret == -EBUSY) { /* already locked by another gc processor */
+    ldpp_dout(this, 10) << "RGWGC::process failed to acquire lock on " <<
+      obj_names[index] << dendl;
+    return 0;
+  }
+  if (ret < 0)
+    return ret;
+
+  string marker;
+  string next_marker;
+  bool truncated;
+  IoCtx *ctx = new IoCtx;
+  do {
+    int max = 100;
+    std::list<cls_rgw_gc_obj_info> entries;
+
+    int ret = 0;
+
+    if (! transitioned_objects_cache[index]) {
+      ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, max, expired_only, entries, &truncated, next_marker);
+      ldpp_dout(this, 20) <<
+      "RGWGC::process cls_rgw_gc_list returned with returned:" << ret <<
+      ", entries.size=" << entries.size() << ", truncated=" << truncated <<
+      ", next_marker='" << next_marker << "'" << dendl;
+      obj_version objv;
+      cls_version_read(store->gc_pool_ctx, obj_names[index], &objv);
+      if ((objv.ver == 1) && entries.size() == 0) {
+        std::list<cls_rgw_gc_obj_info> non_expired_entries;
+        ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, 1, false, non_expired_entries, &truncated, next_marker);
+        if (non_expired_entries.size() == 0) {
+          transitioned_objects_cache[index] = true;
+          marker.clear();
+          ldpp_dout(this, 20) << "RGWGC::process cls_rgw_gc_list returned NO non expired entries, so setting cache entry to TRUE" << dendl;
+        } else {
+          ret = 0;
+          goto done;
+        }
+      }
+      if ((objv.ver == 0) && (ret == -ENOENT || entries.size() == 0)) {
+        ret = 0;
+        goto done;
+      }
+    }
+
+    if (transitioned_objects_cache[index]) {
+      ret = cls_rgw_gc_queue_list_entries(store->gc_pool_ctx, obj_names[index], marker, max, expired_only, entries, &truncated, next_marker);
+      ldpp_dout(this, 20) <<
+      "RGWGC::process cls_rgw_gc_queue_list_entries returned with return value:" << ret <<
+      ", entries.size=" << entries.size() << ", truncated=" << truncated <<
+      ", next_marker='" << next_marker << "'" << dendl;
+      if (entries.size() == 0) {
+        ret = 0;
+        goto done;
+      }
+    }
+
+    if (ret < 0)
+      goto done;
+
+    marker = next_marker;
+
+    string last_pool;
+    std::list<cls_rgw_gc_obj_info>::iterator iter;
+    for (iter = entries.begin(); iter != entries.end(); ++iter) {
+      cls_rgw_gc_obj_info& info = *iter;
+
+      ldpp_dout(this, 20) << "RGWGC::process iterating over entry tag='" <<
+       info.tag << "', time=" << info.time << ", chain.objs.size()=" <<
+       info.chain.objs.size() << dendl;
+
+      std::list<cls_rgw_obj>::iterator liter;
+      cls_rgw_obj_chain& chain = info.chain;
+
+      utime_t now = ceph_clock_now();
+      if (now >= end) {
+        goto done;
+      }
+      if (! transitioned_objects_cache[index]) {
+        if (chain.objs.empty()) {
+          io_manager.schedule_tag_removal(index, info.tag);
+        } else {
+          io_manager.add_tag_io_size(index, info.tag, chain.objs.size());
+        }
+      }
+      if (! chain.objs.empty()) {
+       for (liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
+         cls_rgw_obj& obj = *liter;
+
+         if (obj.pool != last_pool) {
+           delete ctx;
+           ctx = new IoCtx;
+           ret = rgw_init_ioctx(this, store->get_rados_handle(), obj.pool, *ctx);
+           if (ret < 0) {
+        if (transitioned_objects_cache[index]) {
+          goto done;
+        }
+             last_pool = "";
+             ldpp_dout(this, 0) << "ERROR: failed to create ioctx pool=" <<
+               obj.pool << dendl;
+             continue;
+           }
+           last_pool = obj.pool;
+         }
+
+         ctx->locator_set_key(obj.loc);
+
+         const string& oid = obj.key.name; /* just stored raw oid there */
+
+         ldpp_dout(this, 5) << "RGWGC::process removing " << obj.pool <<
+           ":" << obj.key.name << dendl;
+         ObjectWriteOperation op;
+         cls_refcount_put(op, info.tag, true);
+
+         ret = io_manager.schedule_io(ctx, oid, &op, index, info.tag);
+         if (ret < 0) {
+           ldpp_dout(this, 0) <<
+             "WARNING: failed to schedule deletion for oid=" << oid << dendl;
+      if (transitioned_objects_cache[index]) {
+        //If deleting oid failed for any of them, we will not delete queue entries
+        goto done;
+      }
+         }
+         if (going_down()) {
+           // leave early, even if tag isn't removed, it's ok since it
+           // will be picked up next time around
+           goto done;
+         }
+       } // chains loop
+      } // else -- chains not empty
+    } // entries loop
+    if (transitioned_objects_cache[index] && entries.size() > 0) {
+      ret = io_manager.drain_ios();
+      if (ret < 0) {
+        goto done;
+      }
+      //Remove the entries from the queue
+      ldpp_dout(this, 5) << "RGWGC::process removing entries, marker: " << marker << dendl;
+      ret = io_manager.remove_queue_entries(index, entries.size());
+      if (ret < 0) {
+        ldpp_dout(this, 0) <<
+          "WARNING: failed to remove queue entries" << dendl;
+        goto done;
+      }
+    }
+  } while (truncated);
+
+done:
+  /* we don't drain here, because if we're going down we don't want to
+   * hold the system if backend is unresponsive
+   */
+  l.unlock(&store->gc_pool_ctx, obj_names[index]);
+  delete ctx;
+
+  return 0;
+}
+
+int RGWGC::process(bool expired_only)
+{
+  int max_secs = cct->_conf->rgw_gc_processor_max_time;
+
+  const int start = ceph::util::generate_random_number(0, max_objs - 1);
+
+  RGWGCIOManager io_manager(this, store->ctx(), this);
+
+  for (int i = 0; i < max_objs; i++) {
+    int index = (i + start) % max_objs;
+    int ret = process(index, max_secs, expired_only, io_manager);
+    if (ret < 0)
+      return ret;
+  }
+  if (!going_down()) {
+    io_manager.drain();
+  }
+
+  return 0;
+}
+
+bool RGWGC::going_down()
+{
+  return down_flag;
+}
+
+void RGWGC::start_processor()
+{
+  worker = new GCWorker(this, cct, this);
+  worker->create("rgw_gc");
+}
+
+void RGWGC::stop_processor()
+{
+  down_flag = true;
+  if (worker) {
+    worker->stop();
+    worker->join();
+  }
+  delete worker;
+  worker = NULL;
+}
+
+unsigned RGWGC::get_subsys() const
+{
+  return dout_subsys;
+}
+
+std::ostream& RGWGC::gen_prefix(std::ostream& out) const
+{
+  return out << "garbage collection: ";
+}
+
+void *RGWGC::GCWorker::entry() {
+  do {
+    utime_t start = ceph_clock_now();
+    ldpp_dout(dpp, 2) << "garbage collection: start" << dendl;
+    int r = gc->process(true);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: garbage collection process() returned error r=" << r << dendl;
+    }
+    ldpp_dout(dpp, 2) << "garbage collection: stop" << dendl;
+
+    if (gc->going_down())
+      break;
+
+    utime_t end = ceph_clock_now();
+    end -= start;
+    int secs = cct->_conf->rgw_gc_processor_period;
+
+    if (secs <= end.sec())
+      continue; // next round
+
+    secs -= end.sec();
+
+    std::unique_lock locker{lock};
+    cond.wait_for(locker, std::chrono::seconds(secs));
+  } while (!gc->going_down());
+
+  return NULL;
+}
+
+void RGWGC::GCWorker::stop()
+{
+  std::lock_guard l{lock};
+  cond.notify_all();
+}
diff --git a/src/rgw/driver/rados/rgw_gc.h b/src/rgw/driver/rados/rgw_gc.h
new file mode 100644 (file)
index 0000000..196f280
--- /dev/null
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_GC_H
+#define CEPH_RGW_GC_H
+
+
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+#include "rgw_common.h"
+#include "rgw_sal.h"
+#include "rgw_rados.h"
+#include "cls/rgw/cls_rgw_types.h"
+
+#include <atomic>
+
+class RGWGCIOManager;
+
+class RGWGC : public DoutPrefixProvider {
+  CephContext *cct;
+  RGWRados *store;
+  int max_objs;
+  std::string *obj_names;
+  std::atomic<bool> down_flag = { false };
+
+  static constexpr uint64_t seed = 8675309;
+
+  int tag_index(const std::string& tag);
+  int send_chain(const cls_rgw_obj_chain& chain, const std::string& tag);
+
+  class GCWorker : public Thread {
+    const DoutPrefixProvider *dpp;
+    CephContext *cct;
+    RGWGC *gc;
+    ceph::mutex lock = ceph::make_mutex("GCWorker");
+    ceph::condition_variable cond;
+
+  public:
+    GCWorker(const DoutPrefixProvider *_dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp), cct(_cct), gc(_gc) {}
+    void *entry() override;
+    void stop();
+  };
+
+  GCWorker *worker;
+public:
+  RGWGC() : cct(NULL), store(NULL), max_objs(0), obj_names(NULL), worker(NULL) {}
+  ~RGWGC() {
+    stop_processor();
+    finalize();
+  }
+  std::vector<bool> transitioned_objects_cache;
+  std::tuple<int, std::optional<cls_rgw_obj_chain>> send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag);
+
+  // asynchronously defer garbage collection on an object that's still being read
+  int async_defer_chain(const std::string& tag, const cls_rgw_obj_chain& info);
+
+  // callback for when async_defer_chain() fails with ECANCELED
+  void on_defer_canceled(const cls_rgw_gc_obj_info& info);
+
+  int remove(int index, const std::vector<std::string>& tags, librados::AioCompletion **pc);
+  int remove(int index, int num_entries);
+
+  void initialize(CephContext *_cct, RGWRados *_store);
+  void finalize();
+
+  int list(int *index, std::string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue);
+  void list_init(int *index) { *index = 0; }
+  int process(int index, int process_max_secs, bool expired_only,
+              RGWGCIOManager& io_manager);
+  int process(bool expired_only);
+
+  bool going_down();
+  void start_processor();
+  void stop_processor();
+
+  CephContext *get_cct() const override { return store->ctx(); }
+  unsigned get_subsys() const;
+
+  std::ostream& gen_prefix(std::ostream& out) const;
+
+};
+
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_gc_log.cc b/src/rgw/driver/rados/rgw_gc_log.cc
new file mode 100644 (file)
index 0000000..ad819ed
--- /dev/null
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_gc_log.h"
+
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw_gc/cls_rgw_gc_client.h"
+#include "cls/version/cls_version_client.h"
+
+
+// Upgrade a GC log object to the queue-based (format 2) layout.
+// The operation only applies while the object's version is still 0,
+// i.e. it has never been initialized; afterwards the version is bumped
+// to 1 so enqueue/defer ops can tell which format the object uses.
+void gc_log_init2(librados::ObjectWriteOperation& op,
+                  uint64_t max_size, uint64_t max_deferred)
+{
+  obj_version expected; // default-constructed: expected.ver == 0
+  cls_version_check(op, expected, VER_COND_EQ);
+  cls_rgw_gc_queue_init(op, max_size, max_deferred);
+  obj_version upgraded;
+  upgraded.ver = 1;
+  cls_version_set(op, upgraded);
+}
+
+// Enqueue a GC entry in the legacy omap-based (format 1) log.
+// Guarded by a version check: only valid while the object version is
+// still 0, i.e. gc_log_init2() has not upgraded it to format 2.
+void gc_log_enqueue1(librados::ObjectWriteOperation& op,
+                     uint32_t expiration, cls_rgw_gc_obj_info& info)
+{
+  obj_version expected; // default-constructed: expected.ver == 0
+  cls_version_check(op, expected, VER_COND_EQ);
+  cls_rgw_gc_set_entry(op, expiration, info);
+}
+
+// Enqueue a GC entry in the queue-based (format 2) log. Requires the
+// version bump performed by gc_log_init2(), so assert version == 1.
+void gc_log_enqueue2(librados::ObjectWriteOperation& op,
+                     uint32_t expiration, const cls_rgw_gc_obj_info& info)
+{
+  obj_version expected;
+  expected.ver = 1;
+  cls_version_check(op, expected, VER_COND_EQ);
+  cls_rgw_gc_queue_enqueue(op, expiration, info);
+}
+
+// Defer (push back the expiration of) a GC entry in the legacy omap
+// (format 1) log; only valid while the object version is still 0.
+void gc_log_defer1(librados::ObjectWriteOperation& op,
+                   uint32_t expiration, const cls_rgw_gc_obj_info& info)
+{
+  obj_version objv; // objv.ver = 0
+  cls_version_check(op, objv, VER_COND_EQ);
+  cls_rgw_gc_defer_entry(op, expiration, info.tag);
+}
+
+// Defer a GC entry in the queue-based (format 2) log; requires the
+// version bump done by gc_log_init2(). Also removes any leftover
+// omap entry with the same tag from before the format upgrade.
+void gc_log_defer2(librados::ObjectWriteOperation& op,
+                   uint32_t expiration, const cls_rgw_gc_obj_info& info)
+{
+  obj_version objv;
+  objv.ver = 1;
+  cls_version_check(op, objv, VER_COND_EQ);
+  cls_rgw_gc_queue_defer_entry(op, expiration, info);
+  // TODO: conditional on whether omap is known to be empty
+  cls_rgw_gc_remove(op, {info.tag});
+}
diff --git a/src/rgw/driver/rados/rgw_lc_tier.cc b/src/rgw/driver/rados/rgw_lc_tier.cc
new file mode 100644 (file)
index 0000000..0ad2169
--- /dev/null
@@ -0,0 +1,1336 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+#include <iostream>
+#include <map>
+
+#include "common/Formatter.h"
+#include <common/errno.h>
+#include "rgw_lc.h"
+#include "rgw_lc_tier.h"
+#include "rgw_string.h"
+#include "rgw_zone.h"
+#include "rgw_common.h"
+#include "rgw_rest.h"
+#include "svc_zone.h"
+
+#include <boost/algorithm/string/split.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Bookkeeping for one part of a multipart cloud transition.
+struct rgw_lc_multipart_part_info {
+  int part_num{0};    // 1-based part number within the upload
+  uint64_t ofs{0};    // byte offset of this part in the source object
+  uint64_t size{0};   // part length in bytes
+  std::string etag;   // etag the remote endpoint returned for this part
+};
+
+// Source-object properties propagated to the cloud endpoint when an
+// object is transitioned (stamped as x-amz-meta-rgwx-* attributes).
+struct rgw_lc_obj_properties {
+  ceph::real_time mtime;   // source object mtime
+  std::string etag;        // source object etag
+  uint64_t versioned_epoch{0};
+  // NOTE(review): reference member — the mapping (tier configuration
+  // owned by the caller) must outlive any instance of this struct.
+  std::map<std::string, RGWTierACLMapping>& target_acl_mappings;
+  std::string target_storage_class;  // storage class requested on the remote
+
+  rgw_lc_obj_properties(ceph::real_time _mtime, std::string _etag,
+      uint64_t _versioned_epoch, std::map<std::string,
+      RGWTierACLMapping>& _t_acl_mappings,
+      std::string _t_storage_class) :
+    mtime(_mtime), etag(_etag),
+    versioned_epoch(_versioned_epoch),
+    target_acl_mappings(_t_acl_mappings),
+    target_storage_class(_t_storage_class) {}
+};
+
+// Persisted state of an in-progress multipart cloud transition, stored
+// as a system object so the upload can be resumed or aborted later.
+struct rgw_lc_multipart_upload_info {
+  std::string upload_id;   // remote upload id from init-multipart
+  uint64_t obj_size;       // total source object size
+  ceph::real_time mtime;   // source mtime when the upload started
+  std::string etag;        // source etag when the upload started
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(upload_id, bl);
+    encode(obj_size, bl);
+    encode(mtime, bl);
+    encode(etag, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(upload_id, bl);
+    decode(obj_size, bl);
+    decode(mtime, bl);
+    decode(etag, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_lc_multipart_upload_info)
+
+// "-<instance>" suffix for a real (non-null, non-empty) version
+// instance; empty string otherwise.
+static inline string get_key_instance(const rgw_obj_key& key)
+{
+  // A missing or null instance contributes nothing to the remote name.
+  if (key.instance.empty() || key.have_null_instance()) {
+    return "";
+  }
+  return "-" + key.instance;
+}
+
+// Map an rgw_obj_key to the oid used on the remote endpoint: the key
+// name plus a "-<instance>" suffix for real (non-null) versions.
+// Reuses get_key_instance() so the suffix logic lives in one place
+// instead of being duplicated here.
+static inline string get_key_oid(const rgw_obj_key& key)
+{
+  return key.name + get_key_instance(key);
+}
+
+// Build the "<bucket>/<oid>" resource path used in REST requests to
+// the cloud endpoint.
+static inline string obj_to_aws_path(const rgw_obj& obj)
+{
+  return obj.bucket.name + "/" + get_key_oid(obj.key);
+}
+
+/* Load the persisted multipart transition state for an object from the
+ * system-object store. Returns 0 on success, -EIO if the status object
+ * is empty or undecodable, -1 if the backend is not RADOS, or another
+ * negative error code from the read.
+ */
+static int read_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
+    const rgw_raw_obj *status_obj, rgw_lc_multipart_upload_info *status)
+{
+  int ret = 0;
+  // Cloud-transition state is only persisted by the RADOS backend.
+  rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
+
+  if (!rados) {
+    ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
+    return -1;
+  }
+
+  auto& pool = status_obj->pool;
+  const auto oid = status_obj->oid;
+  auto sysobj = rados->svc()->sysobj;
+  bufferlist bl;
+
+  ret = rgw_get_system_obj(sysobj, pool, oid, bl, nullptr, nullptr,
+      null_yield, dpp);
+
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (bl.length() > 0) {
+    try {
+      auto p = bl.cbegin();
+      status->decode(p);
+    } catch (buffer::error& e) {
+      ldpp_dout(dpp, 10) << "failed to decode status obj: "
+        << e.what() << dendl;
+      return -EIO;
+    }
+  } else {
+    // An empty status object is treated as corrupt state.
+    return -EIO;
+  }
+
+  return 0;
+}
+
+static int put_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
+    const rgw_raw_obj *status_obj, rgw_lc_multipart_upload_info *status)
+{
+  int ret = 0;
+  rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
+
+  if (!rados) {
+    ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
+    return -1;
+  }
+
+  auto& pool = status_obj->pool;
+  const auto oid = status_obj->oid;
+  auto sysobj = rados->svc()->sysobj;
+  bufferlist bl;
+  status->encode(bl);
+
+  ret = rgw_put_system_obj(dpp, sysobj, pool, oid, bl, true, nullptr,
+      real_time{}, null_yield);
+
+  return ret;
+}
+
+static int delete_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
+    const rgw_raw_obj *status_obj)
+{
+  int ret = 0;
+  rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
+
+  if (!rados) {
+    ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
+    return -1;
+  }
+
+  auto& pool = status_obj->pool;
+  const auto oid = status_obj->oid;
+  auto sysobj = rados->svc()->sysobj;
+
+  ret = rgw_delete_system_obj(dpp, sysobj, pool, oid, nullptr, null_yield);
+
+  return ret;
+}
+
+// Attribute names (normalized header form) that are forwarded verbatim
+// to the cloud endpoint, in addition to X_AMZ_* metadata (see
+// RGWLCCloudStreamPut::keep_attr).
+static std::set<string> keep_headers = { "CONTENT_TYPE",
+                                         "CONTENT_ENCODING",
+                                         "CONTENT_DISPOSITION",
+                                         "CONTENT_LANGUAGE" };
+
+/*
+ * mapping between rgw object attrs and output http fields
+ *
+ static const struct rgw_http_attr base_rgw_to_http_attrs[] = {
+ { RGW_ATTR_CONTENT_LANG,      "Content-Language" },
+ { RGW_ATTR_EXPIRES,           "Expires" },
+ { RGW_ATTR_CACHE_CONTROL,     "Cache-Control" },
+ { RGW_ATTR_CONTENT_DISP,      "Content-Disposition" },
+ { RGW_ATTR_CONTENT_ENC,       "Content-Encoding" },
+ { RGW_ATTR_USER_MANIFEST,     "X-Object-Manifest" },
+ { RGW_ATTR_X_ROBOTS_TAG ,     "X-Robots-Tag" },
+ { RGW_ATTR_STORAGE_CLASS ,    "X-Amz-Storage-Class" },
+// RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION header depends on access mode:
+// S3 endpoint: x-amz-website-redirect-location
+// S3Website endpoint: Location
+{ RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION, "x-amz-website-redirect-location" },
+}; */
+
+static void init_headers(map<string, bufferlist>& attrs,
+    map<string, string>& headers)
+{
+  for (auto& kv : attrs) {
+    const char * name = kv.first.c_str();
+    const auto aiter = rgw_to_http_attrs.find(name);
+
+    if (aiter != std::end(rgw_to_http_attrs)) {
+      headers[aiter->second] = rgw_bl_str(kv.second);
+    } else if (strncmp(name, RGW_ATTR_META_PREFIX,
+          sizeof(RGW_ATTR_META_PREFIX)-1) == 0) {
+      name += sizeof(RGW_ATTR_META_PREFIX) - 1;
+      string sname(name);
+      string name_prefix = RGW_ATTR_META_PREFIX;
+      char full_name_buf[name_prefix.size() + sname.size() + 1];
+      snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s",
+          static_cast<int>(name_prefix.length()),
+          name_prefix.data(),
+          static_cast<int>(sname.length()),
+          sname.data());
+      headers[full_name_buf] = rgw_bl_str(kv.second);
+    } else if (strcmp(name,RGW_ATTR_CONTENT_TYPE) == 0) {
+      headers["CONTENT_TYPE"] = rgw_bl_str(kv.second);
+    }
+  }
+}
+
+/* Read object or just head from remote endpoint. For now initializes only headers,
+ * but can be extended to fetch etag, mtime etc if needed.
+ */
+static int cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head,
+                         std::map<std::string, std::string>& headers) {
+  RGWRESTConn::get_obj_params req_params;
+  RGWBucketInfo b;
+  std::string target_obj_name;
+  int ret = 0;
+  std::unique_ptr<rgw::sal::Bucket> dest_bucket;
+  std::unique_ptr<rgw::sal::Object> dest_obj;
+  rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+        tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+        tier_ctx.target_storage_class);
+  std::string etag;
+  RGWRESTStreamRWRequest *in_req;
+
+  b.bucket.name = tier_ctx.target_bucket_name;
+  target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+                    tier_ctx.obj->get_name();
+  if (!tier_ctx.o.is_current()) {
+    target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+  }
+
+  ret = tier_ctx.driver->get_bucket(nullptr, b, &dest_bucket);
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_bucket - " << tier_ctx.target_bucket_name << " , reterr = " << ret << dendl;
+    return ret;
+  }
+
+  dest_obj = dest_bucket->get_object(rgw_obj_key(target_obj_name));
+  if (!dest_obj) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_object path - " << target_obj_name << dendl;
+    return -1;
+  }
+  /* init input connection */
+  req_params.get_op = !head;
+  req_params.prepend_metadata = true;
+  req_params.rgwx_stat = true;
+  req_params.sync_manifest = true;
+  req_params.skip_decrypt = true;
+
+  ret = tier_ctx.conn.get_obj(tier_ctx.dpp, dest_obj.get(), req_params, true /* send */, &in_req);
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: " << __func__ << "(): conn.get_obj() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  /* fetch headers */
+  ret = tier_ctx.conn.complete_request(in_req, nullptr, nullptr, nullptr, nullptr, &headers, null_yield);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(tier_ctx.dpp, 20) << "ERROR: " << __func__ << "(): conn.complete_request() returned ret=" << ret << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+static bool is_already_tiered(const DoutPrefixProvider *dpp,
+                             std::map<std::string, std::string>& headers,
+                             ceph::real_time& mtime) {
+  char buf[32];
+  map<string, string> attrs = headers;
+
+  for (const auto& a : attrs) {
+    ldpp_dout(dpp, 20) << "GetCrf attr[" << a.first << "] = " << a.second <<dendl;
+  }
+  utime_t ut(mtime);
+  snprintf(buf, sizeof(buf), "%lld.%09lld",
+      (long long)ut.sec(),
+      (long long)ut.nsec());
+
+  string s = attrs["X_AMZ_META_RGWX_SOURCE_MTIME"];
+
+  if (s.empty())
+    s = attrs["x_amz_meta_rgwx_source_mtime"];
+
+  ldpp_dout(dpp, 20) << "is_already_tiered attrs[X_AMZ_META_RGWX_SOURCE_MTIME] = " << s <<dendl;
+  ldpp_dout(dpp, 20) << "is_already_tiered mtime buf = " << buf <<dendl;
+
+  if (!s.empty() && !strcmp(s.c_str(), buf)){
+    return 1;
+  }
+  return 0;
+}
+
+/* Read object locally & also initialize dest rest obj based on read attrs */
+class RGWLCStreamRead
+{
+  CephContext *cct;
+  const DoutPrefixProvider *dpp;
+  std::map<std::string, bufferlist> attrs;  // source xattrs, filled by init()
+  uint64_t obj_size;                        // full source object size
+  rgw::sal::Object *obj;                    // source object (not owned)
+  const real_time &mtime;                   // expected mtime; init() returns -ECANCELED on mismatch
+
+  bool multipart{false};       // reading a single part of a multipart transfer
+  uint64_t m_part_size{0};
+  off_t m_part_off{0};
+  off_t m_part_end{0};
+
+  std::unique_ptr<rgw::sal::Object::ReadOp> read_op;
+  off_t ofs{0};                // inclusive read range, set by set_range()
+  off_t end{0};
+  rgw_rest_obj rest_obj;       // REST-shaped view built by init_rest_obj()
+
+  int retcode{0};
+
+  public:
+  RGWLCStreamRead(CephContext *_cct, const DoutPrefixProvider *_dpp,
+      rgw::sal::Object *_obj, const real_time &_mtime) :
+    cct(_cct), dpp(_dpp), obj(_obj), mtime(_mtime),
+    read_op(obj->get_read_op()) {}
+
+  ~RGWLCStreamRead() {};
+  int set_range(off_t _ofs, off_t _end);
+  int get_range(off_t &_ofs, off_t &_end);
+  rgw_rest_obj& get_rest_obj();
+  void set_multipart(uint64_t part_size, off_t part_off, off_t part_end);
+  int init();
+  int init_rest_obj();
+  int read(off_t ofs, off_t end, RGWGetDataCB *out_cb);
+};
+
+/* Send PUT op to remote endpoint */
+class RGWLCCloudStreamPut
+{
+  const DoutPrefixProvider *dpp;
+  rgw_lc_obj_properties obj_properties;  // source metadata stamped on the remote copy
+  RGWRESTConn& conn;                     // connection to the cloud endpoint
+  rgw::sal::Object *dest_obj;            // destination object (not owned)
+  std::string etag;                      // etag parsed from the PUT response headers
+  RGWRESTStreamS3PutObj *out_req{nullptr};
+
+  // Set via set_multipart() when this PUT uploads one part of a
+  // multipart transfer rather than a whole object.
+  struct multipart_info {
+    bool is_multipart{false};
+    std::string upload_id;
+    int part_num{0};
+    uint64_t part_size;
+  } multipart;
+
+  int retcode;
+
+  public:
+  RGWLCCloudStreamPut(const DoutPrefixProvider *_dpp,
+      const rgw_lc_obj_properties&  _obj_properties,
+      RGWRESTConn& _conn,
+      rgw::sal::Object *_dest_obj) :
+    dpp(_dpp), obj_properties(_obj_properties), conn(_conn), dest_obj(_dest_obj) {
+    }
+  int init();
+  static bool keep_attr(const std::string& h);
+  static void init_send_attrs(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj,
+      const rgw_lc_obj_properties& obj_properties,
+      std::map<std::string, std::string>& attrs);
+  void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj);
+  void handle_headers(const std::map<std::string, std::string>& headers);
+  bool get_etag(std::string *petag);
+  void set_multipart(const std::string& upload_id, int part_num, uint64_t part_size);
+  int send();
+  RGWGetDataCB *get_cb();
+  int complete_request();
+};
+
+int RGWLCStreamRead::set_range(off_t start, off_t last) {
+  // Record the inclusive byte range for the upcoming read; always
+  // succeeds (int return kept for interface symmetry with get_range).
+  ofs = start;
+  end = last;
+  return 0;
+}
+
+int RGWLCStreamRead::get_range(off_t &start, off_t &last) {
+  // Report the inclusive range chosen by init()/set_range().
+  start = ofs;
+  last = end;
+  return 0;
+}
+
+rgw_rest_obj& RGWLCStreamRead::get_rest_obj() {
+  // Only meaningful after init() has populated it via init_rest_obj().
+  return rest_obj;
+}
+
+// Switch to per-part mode: init() will read only this part's slice
+// [part_off, part_end] instead of the whole object.
+void RGWLCStreamRead::set_multipart(uint64_t part_size, off_t part_off, off_t part_end) {
+  multipart = true;
+  m_part_size = part_size;
+  m_part_off = part_off;
+  m_part_end = part_end;
+}
+
+// Prepare the local read and capture attrs/size; must be called before
+// read(). Returns -ECANCELED if the object's mtime changed since the
+// lifecycle scan recorded it (i.e. we raced with a write).
+int RGWLCStreamRead::init() {
+  optional_yield y = null_yield;
+  real_time read_mtime;
+
+  read_op->params.lastmod = &read_mtime;
+
+  int ret = read_op->prepare(y, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: fail to prepare read_op, ret = " << ret << dendl;
+    return ret;
+  }
+
+  if (read_mtime != mtime) {
+    /* raced */
+    return -ECANCELED;
+  }
+
+  attrs = obj->get_attrs();
+  obj_size = obj->get_obj_size();
+
+  ret = init_rest_obj();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: fail to initialize rest_obj, ret = " << ret << dendl;
+    return ret;
+  }
+
+  // Whole object, or just this part's slice for multipart transfers.
+  if (!multipart) {
+    set_range(0, obj_size - 1);
+  } else {
+    set_range(m_part_off, m_part_end);
+  }
+  return 0;
+}
+
+// Build the REST-shaped view of the source object (key, length,
+// headers, ACLs) that send_ready() will translate into PUT headers.
+int RGWLCStreamRead::init_rest_obj() {
+  /* Initialize rgw_rest_obj. 
+   * Reference: do_decode_rest_obj
+   * Check how to copy headers content */ 
+  rest_obj.init(obj->get_key());
+
+  // Content length is the part size in per-part mode.
+  if (!multipart) {
+    rest_obj.content_len = obj_size;
+  } else {
+    rest_obj.content_len = m_part_size;
+  }
+
+  /* For mulitpart attrs are sent as part of InitMultipartCR itself */
+  if (multipart) {
+    return 0;
+  }
+
+  /*
+   * XXX: verify if its right way to copy attrs into rest obj
+   */
+  init_headers(attrs, rest_obj.attrs);
+
+  rest_obj.acls.set_ctx(cct);
+  const auto aiter = attrs.find(RGW_ATTR_ACL);
+  if (aiter != attrs.end()) {
+    bufferlist& bl = aiter->second;
+    auto bliter = bl.cbegin();
+    try {
+      rest_obj.acls.decode(bliter);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to decode policy off attrs" << dendl;
+      return -EIO;
+    }
+  } else {
+    // Missing ACL attr is tolerated; the remote copy gets no mapped grants.
+    ldpp_dout(dpp, 0) << "WARNING: acl attrs not provided" << dendl;
+  }
+  return 0;
+}
+
+int RGWLCStreamRead::read(off_t ofs, off_t end, RGWGetDataCB *out_cb) {
+  // Stream bytes [ofs, end] of the source object through the callback;
+  // blocks until done (null_yield).
+  return read_op->iterate(dpp, ofs, end, out_cb, null_yield);
+}
+
+// Open the outbound PUT request. For a multipart part the request is
+// addressed by uploadId + partNumber query parameters.
+int RGWLCCloudStreamPut::init() {
+  /* init output connection */
+  if (multipart.is_multipart) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%d", multipart.part_num);
+    rgw_http_param_pair params[] = { { "uploadId", multipart.upload_id.c_str() },
+                                     { "partNumber", buf },
+                                     { nullptr, nullptr } };
+    conn.put_obj_send_init(dest_obj, params, &out_req);
+  } else {
+    conn.put_obj_send_init(dest_obj, nullptr, &out_req);
+  }
+
+  return 0;
+}
+
+bool RGWLCCloudStreamPut::keep_attr(const string& h) {
+  // Forward the fixed whitelist of headers plus all X_AMZ_* metadata.
+  return keep_headers.count(h) > 0 ||
+      boost::algorithm::starts_with(h, "X_AMZ_");
+}
+
+// Translate the source object's REST view into the header set sent
+// with the cloud PUT: whitelisted headers, mapped ACL grants
+// (x-amz-grant-*), storage class, and rgwx-* provenance metadata.
+void RGWLCCloudStreamPut::init_send_attrs(const DoutPrefixProvider *dpp,
+    const rgw_rest_obj& rest_obj,
+    const rgw_lc_obj_properties& obj_properties,
+    std::map<string, string>& attrs) {
+
+  map<string, RGWTierACLMapping>& acl_mappings(obj_properties.target_acl_mappings);
+  const std::string& target_storage_class = obj_properties.target_storage_class;
+
+  attrs.clear();
+
+  // Pass through whitelisted headers and user metadata.
+  for (auto& hi : rest_obj.attrs) {
+    if (keep_attr(hi.first)) {
+      attrs.insert(hi);
+    }
+  }
+
+  const auto acl = rest_obj.acls.get_acl();
+
+  // permission-flags -> list of "type=grantee" strings
+  map<int, vector<string> > access_map;
+
+  // Translate local grantees to remote ones via the configured mapping;
+  // unmapped grantees are dropped.
+  if (!acl_mappings.empty()) {
+    for (auto& grant : acl.get_grant_map()) {
+      auto& orig_grantee = grant.first;
+      auto& perm = grant.second;
+
+      string grantee;
+
+      const auto& am = acl_mappings;
+
+      const auto iter = am.find(orig_grantee);
+      if (iter == am.end()) {
+        ldpp_dout(dpp, 20) << "acl_mappings: Could not find " << orig_grantee << " .. ignoring" << dendl;
+        continue;
+      }
+
+      grantee = iter->second.dest_id;
+
+      string type;
+
+      switch (iter->second.type) {
+        case ACL_TYPE_CANON_USER:
+          type = "id";
+          break;
+        case ACL_TYPE_EMAIL_USER:
+          type = "emailAddress";
+          break;
+        case ACL_TYPE_GROUP:
+          type = "uri";
+          break;
+        default:
+          continue;
+      }
+
+      string tv = type + "=" + grantee;
+
+      // FULL_CONTROL is kept as a single grant; otherwise split the
+      // flag bits into individual permission buckets.
+      int flags = perm.get_permission().get_permissions();
+      if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) {
+        access_map[flags].push_back(tv);
+        continue;
+      }
+
+      for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) {
+        if (flags & i) {
+          access_map[i].push_back(tv);
+        }
+      }
+    }
+  }
+
+  // Emit one x-amz-grant-<perm> header per permission bucket, with a
+  // comma-separated grantee list.
+  for (const auto& aiter : access_map) {
+    int grant_type = aiter.first;
+
+    string header_str("x-amz-grant-");
+
+    switch (grant_type) {
+      case RGW_PERM_READ:
+        header_str.append("read");
+        break;
+      case RGW_PERM_WRITE:
+        header_str.append("write");
+        break;
+      case RGW_PERM_READ_ACP:
+        header_str.append("read-acp");
+        break;
+      case RGW_PERM_WRITE_ACP:
+        header_str.append("write-acp");
+        break;
+      case RGW_PERM_FULL_CONTROL:
+        header_str.append("full-control");
+        break;
+    }
+
+    string s;
+
+    for (const auto& viter : aiter.second) {
+      if (!s.empty()) {
+        s.append(", ");
+      }
+      s.append(viter);
+    }
+
+    ldpp_dout(dpp, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl;
+
+    attrs[header_str] = s;
+  }
+
+  /* Copy target storage class */
+  if (!target_storage_class.empty()) {
+    attrs["x-amz-storage-class"] = target_storage_class;
+  } else {
+    attrs["x-amz-storage-class"] = "STANDARD";
+  }
+
+  /* New attribute to specify its transitioned from RGW */
+  attrs["x-amz-meta-rgwx-source"] = "rgw";
+
+  char buf[32];
+  snprintf(buf, sizeof(buf), "%llu", (long long)obj_properties.versioned_epoch);
+  attrs["x-amz-meta-rgwx-versioned-epoch"] = buf;
+
+  // Source mtime/etag/key provenance; is_already_tiered() compares
+  // against the source-mtime stamp on later runs.
+  utime_t ut(obj_properties.mtime);
+  snprintf(buf, sizeof(buf), "%lld.%09lld",
+      (long long)ut.sec(),
+      (long long)ut.nsec());
+
+  attrs["x-amz-meta-rgwx-source-mtime"] = buf;
+  attrs["x-amz-meta-rgwx-source-etag"] = obj_properties.etag;
+  attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name;
+  if (!rest_obj.key.instance.empty()) {
+    attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance;
+  }
+  for (const auto& a : attrs) {
+    ldpp_dout(dpp, 30) << "init_send_attrs attr[" << a.first << "] = " << a.second <<dendl;
+  }
+}
+
+// Finalize request headers before the body is sent. Parts of a
+// multipart upload carry no attrs — those went with init-multipart.
+void RGWLCCloudStreamPut::send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) {
+  auto r = static_cast<RGWRESTStreamS3PutObj *>(out_req);
+
+  std::map<std::string, std::string> new_attrs;
+  if (!multipart.is_multipart) {
+    init_send_attrs(dpp, rest_obj, obj_properties, new_attrs);
+  }
+
+  r->set_send_length(rest_obj.content_len);
+
+  // Default (empty) policy; mapped grants travel via new_attrs headers.
+  RGWAccessControlPolicy policy;
+
+  r->send_ready(dpp, conn.get_key(), new_attrs, policy);
+}
+
+void RGWLCCloudStreamPut::handle_headers(const map<string, string>& headers) {
+  // Capture the etag of the uploaded object/part from the response
+  // headers (map keys are unique, so a direct lookup is equivalent to
+  // scanning for the last match).
+  const auto it = headers.find("ETAG");
+  if (it != headers.end()) {
+    etag = it->second;
+  }
+}
+
+bool RGWLCCloudStreamPut::get_etag(string *petag) {
+  // Only meaningful after handle_headers()/complete_request() has run.
+  if (!etag.empty()) {
+    *petag = etag;
+    return true;
+  }
+  return false;
+}
+
+// Switch this PUT into part-upload mode; init() will then address the
+// request by uploadId + partNumber.
+void RGWLCCloudStreamPut::set_multipart(const string& upload_id, int part_num, uint64_t part_size) {
+  multipart.is_multipart = true;
+  multipart.upload_id = upload_id;
+  multipart.part_num = part_num;
+  multipart.part_size = part_size;
+}
+
+int RGWLCCloudStreamPut::send() {
+  // Kick off the asynchronous HTTP request; body data is fed through
+  // the callback returned by get_cb().
+  return RGWHTTP::send(out_req);
+}
+
+RGWGetDataCB *RGWLCCloudStreamPut::get_cb() {
+  // Data callback that feeds the outbound PUT request body.
+  return out_req->get_out_cb();
+}
+
+int RGWLCCloudStreamPut::complete_request() {
+  // Finish the PUT; on success the connection stores the response etag
+  // into `etag` and the mtime back into obj_properties.mtime.
+  return conn.complete_request(out_req, etag, &obj_properties.mtime, null_yield);
+}
+
+/* Read local copy and write to Cloud endpoint */
+static int cloud_tier_transfer_object(const DoutPrefixProvider* dpp,
+                            RGWLCStreamRead* readf, RGWLCCloudStreamPut* writef) {
+  std::string url;
+  bufferlist bl;
+  bool sent_attrs{false};
+  int ret{0};
+  off_t ofs;
+  off_t end;
+
+  ret = readf->init();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: fail to initialize in_crf, ret = " << ret << dendl;
+    return ret;
+  }
+  readf->get_range(ofs, end);
+  rgw_rest_obj& rest_obj = readf->get_rest_obj();
+  if (!sent_attrs) {
+    ret = writef->init();
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: fail to initialize out_crf, ret = " << ret << dendl;
+      return ret;
+    }
+
+    writef->send_ready(dpp, rest_obj);
+    ret = writef->send();
+    if (ret < 0) {
+      return ret;
+    }
+    sent_attrs = true;
+  }
+
+  ret = readf->read(ofs, end, writef->get_cb());
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: fail to read from in_crf, ret = " << ret << dendl;
+    return ret;
+  }
+
+  ret = writef->complete_request();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: fail to complete request, ret = " << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Transition an object to the cloud endpoint with a single (non-
+// multipart) PUT. Returns 0 on success or a negative error code.
+static int cloud_tier_plain_transfer(RGWLCCloudTierCtx& tier_ctx) {
+  int ret;
+  std::unique_ptr<rgw::sal::Bucket> dest_bucket;
+  std::unique_ptr<rgw::sal::Object> dest_obj;
+
+  rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+                        tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+                        tier_ctx.target_storage_class);
+  RGWBucketInfo b;
+  std::string target_obj_name;
+
+  // Remote name: "<src bucket>/<obj name>[-<instance>]" inside the
+  // configured target bucket.
+  b.bucket.name = tier_ctx.target_bucket_name;
+  target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+    tier_ctx.obj->get_name();
+  if (!tier_ctx.o.is_current()) {
+    target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+  }
+
+  ret = tier_ctx.driver->get_bucket(nullptr, b, &dest_bucket);
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_bucket - " << tier_ctx.target_bucket_name << " , ret = " << ret << dendl;
+    return ret;
+  }
+
+  dest_obj = dest_bucket->get_object(rgw_obj_key(target_obj_name));
+  if (!dest_obj) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_object path - " << target_obj_name << dendl;
+    return -1;
+  }
+
+  tier_ctx.obj->set_atomic();
+
+  /* Prepare Read from source */
+  /* TODO: Define readf, writef as stack variables. For some reason,
+   * when used as stack variables (esp., readf), the transition seems to
+   * be taking lot of time eventually erroring out at times.
+   */
+  std::shared_ptr<RGWLCStreamRead> readf;
+  readf.reset(new RGWLCStreamRead(tier_ctx.cct, tier_ctx.dpp,
+        tier_ctx.obj, tier_ctx.o.meta.mtime));
+
+  std::shared_ptr<RGWLCCloudStreamPut> writef;
+  writef.reset(new RGWLCCloudStreamPut(tier_ctx.dpp, obj_properties, tier_ctx.conn,
+               dest_obj.get()));
+
+  /* actual Read & Write */
+  ret = cloud_tier_transfer_object(tier_ctx.dpp, readf.get(), writef.get());
+
+  return ret;
+}
+
+// Upload one part of a multipart cloud transition and return its etag
+// via *petag. Returns 0 on success, -EIO if the remote response had no
+// etag, or another negative error code.
+static int cloud_tier_send_multipart_part(RGWLCCloudTierCtx& tier_ctx,
+                                const std::string& upload_id,
+                                const rgw_lc_multipart_part_info& part_info,
+                                std::string *petag) {
+  int ret;
+  std::unique_ptr<rgw::sal::Bucket> dest_bucket;
+  std::unique_ptr<rgw::sal::Object> dest_obj;
+
+  rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+                        tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+                        tier_ctx.target_storage_class);
+  RGWBucketInfo b;
+  std::string target_obj_name;
+  off_t end;
+
+  // Remote name: "<src bucket>/<obj name>[-<instance>]" inside the
+  // configured target bucket.
+  b.bucket.name = tier_ctx.target_bucket_name;
+  target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+    tier_ctx.obj->get_name();
+  if (!tier_ctx.o.is_current()) {
+    target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+  }
+
+  ret = tier_ctx.driver->get_bucket(nullptr, b, &dest_bucket);
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_bucket - " << tier_ctx.target_bucket_name << " , ret = " << ret << dendl;
+    return ret;
+  }
+
+  dest_obj = dest_bucket->get_object(rgw_obj_key(target_obj_name));
+  if (!dest_obj) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_object path - " << target_obj_name << dendl;
+    return -1;
+  }
+
+  tier_ctx.obj->set_atomic();
+
+  /* TODO: Define readf, writef as stack variables. For some reason,
+   * when used as stack variables (esp., readf), the transition seems to
+   * be taking lot of time eventually erroring out at times. */
+  std::shared_ptr<RGWLCStreamRead> readf;
+  readf.reset(new RGWLCStreamRead(tier_ctx.cct, tier_ctx.dpp,
+        tier_ctx.obj, tier_ctx.o.meta.mtime));
+
+  std::shared_ptr<RGWLCCloudStreamPut> writef;
+  writef.reset(new RGWLCCloudStreamPut(tier_ctx.dpp, obj_properties, tier_ctx.conn,
+               dest_obj.get()));
+
+  /* Prepare Read from source */
+  end = part_info.ofs + part_info.size - 1;
+  readf->set_multipart(part_info.size, part_info.ofs, end);
+
+  /* Prepare write */
+  writef->set_multipart(upload_id, part_info.part_num, part_info.size);
+
+  /* actual Read & Write */
+  ret = cloud_tier_transfer_object(tier_ctx.dpp, readf.get(), writef.get());
+  if (ret < 0) {
+    return ret;
+  }
+
+  // The part etag is required later for CompleteMultipartUpload.
+  if (!(writef->get_etag(petag))) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to get etag from PUT request" << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+static int cloud_tier_abort_multipart(const DoutPrefixProvider *dpp,
+      RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
+      const std::string& upload_id) {
+  int ret;
+  bufferlist out_bl;
+  bufferlist bl;
+  rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+
+  string resource = obj_to_aws_path(dest_obj);
+  ret = dest_conn.send_resource(dpp, "DELETE", resource, params, nullptr,
+      out_bl, &bl, nullptr, null_yield);
+
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (ret=" << ret << ")" << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Start a multipart upload on the cloud endpoint ("POST ...?uploads"),
+// parse the InitiateMultipartUploadResult XML, and return the remote
+// upload id via `upload_id`. Returns 0 on success, -EIO on bad XML.
+static int cloud_tier_init_multipart(const DoutPrefixProvider *dpp,
+      RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
+      uint64_t obj_size, std::map<std::string, std::string>& attrs,
+      std::string& upload_id) {
+  bufferlist out_bl;
+  bufferlist bl;
+
+  // Shape of the XML response body we expect from the endpoint.
+  struct InitMultipartResult {
+    std::string bucket;
+    std::string key;
+    std::string upload_id;
+
+    void decode_xml(XMLObj *obj) {
+      RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+      RGWXMLDecoder::decode_xml("Key", key, obj);
+      RGWXMLDecoder::decode_xml("UploadId", upload_id, obj);
+    }
+  } result;
+
+  int ret;
+  rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} };
+
+  string resource = obj_to_aws_path(dest_obj);
+
+  // Object attrs (metadata, ACL grants, storage class) are sent here,
+  // not with the individual part PUTs.
+  ret = dest_conn.send_resource(dpp, "POST", resource, params, &attrs,
+      out_bl, &bl, nullptr, null_yield);
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl;
+    return ret;
+  }
+  /*
+   * If one of the following fails we cannot abort upload, as we cannot
+   * extract the upload id. If one of these fail it's very likely that that's
+   * the least of our problem.
+   */
+  RGWXMLDecoder::XMLParser parser;
+  if (!parser.init()) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+    return -EIO;
+  }
+
+  if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+    string str(out_bl.c_str(), out_bl.length());
+    ldpp_dout(dpp, 5) << "ERROR: failed to parse xml initmultipart: " << str << dendl;
+    return -EIO;
+  }
+
+  try {
+    RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true);
+  } catch (RGWXMLDecoder::err& err) {
+    string str(out_bl.c_str(), out_bl.length());
+    ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+    return -EIO;
+  }
+
+  ldpp_dout(dpp, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl;
+
+  upload_id = result.upload_id;
+
+  return 0;
+}
+
+static int cloud_tier_complete_multipart(const DoutPrefixProvider *dpp,
+      RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
+      std::string& upload_id,
+      const std::map<int, rgw_lc_multipart_part_info>& parts) {
+  rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+
+  stringstream ss;
+  XMLFormatter formatter;
+  int ret;
+
+  bufferlist bl, out_bl;
+  string resource = obj_to_aws_path(dest_obj);
+
+  struct CompleteMultipartReq {
+    std::map<int, rgw_lc_multipart_part_info> parts;
+
+    explicit CompleteMultipartReq(const std::map<int, rgw_lc_multipart_part_info>& _parts) : parts(_parts) {}
+
+    void dump_xml(Formatter *f) const {
+      for (const auto& p : parts) {
+        f->open_object_section("Part");
+        encode_xml("PartNumber", p.first, f);
+        encode_xml("ETag", p.second.etag, f);
+        f->close_section();
+      };
+    }
+  } req_enc(parts);
+
+  struct CompleteMultipartResult {
+    std::string location;
+    std::string bucket;
+    std::string key;
+    std::string etag;
+
+    void decode_xml(XMLObj *obj) {
+      RGWXMLDecoder::decode_xml("Location", bucket, obj);
+      RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+      RGWXMLDecoder::decode_xml("Key", key, obj);
+      RGWXMLDecoder::decode_xml("ETag", etag, obj);
+    }
+  } result;
+
+  encode_xml("CompleteMultipartUpload", req_enc, &formatter);
+
+  formatter.flush(ss);
+  bl.append(ss.str());
+
+  ret = dest_conn.send_resource(dpp, "POST", resource, params, nullptr,
+      out_bl, &bl, nullptr, null_yield);
+
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload for dest object=" << dest_obj << dendl;
+    return ret;
+  }
+  /*
+   * If one of the following fails we cannot abort upload, as we cannot
+   * extract the upload id. If one of these fail it's very likely that that's
+   * the least of our problem.
+   */
+  RGWXMLDecoder::XMLParser parser;
+  if (!parser.init()) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+    return -EIO;
+  }
+
+  if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+    string str(out_bl.c_str(), out_bl.length());
+    ldpp_dout(dpp, 5) << "ERROR: failed to parse xml Completemultipart: " << str << dendl;
+    return -EIO;
+  }
+
+  try {
+    RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true);
+  } catch (RGWXMLDecoder::err& err) {
+    string str(out_bl.c_str(), out_bl.length());
+    ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+    return -EIO;
+  }
+
+  ldpp_dout(dpp, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl;
+
+  return ret;
+}
+
+/* Best-effort cleanup of a failed/stale multipart transition: abort the
+ * cloud-side upload and delete the local upload-status RADOS object.
+ * Errors from either step are logged and ignored; always returns 0. */
+static int cloud_tier_abort_multipart_upload(RGWLCCloudTierCtx& tier_ctx,
+      const rgw_obj& dest_obj, const rgw_raw_obj& status_obj,
+      const std::string& upload_id) {
+  int ret;
+
+  ret = cloud_tier_abort_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, upload_id);
+
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " ret=" << ret << dendl;
+    /* ignore error, best effort */
+  }
+  /* remove status obj */
+  ret = delete_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj);
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to remove sync status obj obj=" << status_obj << " ret=" << ret << dendl;
+    // ignore error, best effort 
+  }
+  return 0;
+}
+
+static int cloud_tier_multipart_transfer(RGWLCCloudTierCtx& tier_ctx) {
+  rgw_obj src_obj;
+  rgw_obj dest_obj;
+
+  uint64_t obj_size;
+  std::string src_etag;
+  rgw_rest_obj rest_obj;
+
+  rgw_lc_multipart_upload_info status;
+
+  std::map<std::string, std::string> new_attrs;
+
+  rgw_raw_obj status_obj;
+
+  RGWBucketInfo b;
+  std::string target_obj_name;
+  rgw_bucket target_bucket;
+
+  int ret;
+
+  rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+        tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+        tier_ctx.target_storage_class);
+
+  uint32_t part_size{0};
+  uint32_t num_parts{0};
+
+  int cur_part{0};
+  uint64_t cur_ofs{0};
+  std::map<int, rgw_lc_multipart_part_info> parts;
+
+  obj_size = tier_ctx.o.meta.size;
+
+  target_bucket.name = tier_ctx.target_bucket_name;
+
+  target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+    tier_ctx.obj->get_name();
+  if (!tier_ctx.o.is_current()) {
+    target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+  }
+  dest_obj.init(target_bucket, target_obj_name);
+
+  rgw_pool pool = static_cast<rgw::sal::RadosStore*>(tier_ctx.driver)->svc()->zone->get_zone_params().log_pool;
+  status_obj = rgw_raw_obj(pool, "lc_multipart_" + tier_ctx.obj->get_oid());
+
+  ret = read_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj, &status);
+
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to read sync status of object " << src_obj << " ret=" << ret << dendl;
+    return ret;
+  }
+
+  if (ret >= 0) {
+    // check here that mtime and size did not change 
+    if (status.mtime != obj_properties.mtime || status.obj_size != obj_size ||
+        status.etag != obj_properties.etag) {
+      cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
+      ret = -ENOENT;
+    }
+  }
+
+  if (ret == -ENOENT) { 
+    RGWLCStreamRead readf(tier_ctx.cct, tier_ctx.dpp, tier_ctx.obj, tier_ctx.o.meta.mtime);
+
+    readf.init();
+
+    rest_obj = readf.get_rest_obj();
+
+    RGWLCCloudStreamPut::init_send_attrs(tier_ctx.dpp, rest_obj, obj_properties, new_attrs);
+
+    ret = cloud_tier_init_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, obj_size, new_attrs, status.upload_id);
+    if (ret < 0) {
+      return ret;
+    }
+
+    status.obj_size = obj_size;
+    status.mtime = obj_properties.mtime;
+    status.etag = obj_properties.etag;
+
+    ret = put_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj, &status);
+
+    if (ret < 0) {
+      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to driver multipart upload state, ret=" << ret << dendl;
+      // continue with upload anyway 
+    }
+
+#define MULTIPART_MAX_PARTS 10000
+#define MULTIPART_MAX_PARTS 10000
+    uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS;
+    uint64_t min_conf_size = tier_ctx.multipart_min_part_size;
+
+    if (min_conf_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
+      min_conf_size = MULTIPART_MIN_POSSIBLE_PART_SIZE;
+    }
+
+    part_size = std::max(min_conf_size, min_part_size);
+    num_parts = (obj_size + part_size - 1) / part_size;
+    cur_part = 1;
+    cur_ofs = 0;
+  }
+
+  for (; (uint32_t)cur_part <= num_parts; ++cur_part) {
+    ldpp_dout(tier_ctx.dpp, 20) << "cur_part = "<< cur_part << ", info.ofs = " << cur_ofs << ", info.size = " << part_size << ", obj size = " << obj_size<< ", num_parts:" << num_parts << dendl;
+    rgw_lc_multipart_part_info& cur_part_info = parts[cur_part];
+    cur_part_info.part_num = cur_part;
+    cur_part_info.ofs = cur_ofs;
+    cur_part_info.size = std::min((uint64_t)part_size, obj_size - cur_ofs);
+
+    cur_ofs += cur_part_info.size;
+
+    ret = cloud_tier_send_multipart_part(tier_ctx,
+            status.upload_id,
+            cur_part_info,
+            &cur_part_info.etag);
+
+    if (ret < 0) {
+      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to send multipart part of obj=" << tier_ctx.obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << cur_part << " (error: " << cpp_strerror(-ret) << ")" << dendl;
+      cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
+      return ret;
+    }
+
+  }
+
+  ret = cloud_tier_complete_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, status.upload_id, parts);
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to complete multipart upload of obj=" << tier_ctx.obj << " (error: " << cpp_strerror(-ret) << ")" << dendl;
+    cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
+    return ret;
+  }
+
+  /* remove status obj */
+  ret = delete_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj);
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to abort multipart upload obj=" << tier_ctx.obj << " upload_id=" << status.upload_id << " part number " << cur_part << " (" << cpp_strerror(-ret) << ")" << dendl;
+    // ignore error, best effort 
+  }
+  return 0;
+}
+
+/* Check if object has already been transitioned */
+/* Issues a HEAD request for the cloud copy and compares the returned
+ * headers against the local mtime (via is_already_tiered). Sets
+ * already_tiered accordingly; returns the HEAD result (<0 on fetch error,
+ * in which case already_tiered is left untouched). */
+static int cloud_tier_check_object(RGWLCCloudTierCtx& tier_ctx, bool& already_tiered) {
+  int ret;
+  std::map<std::string, std::string> headers;
+
+  /* Fetch Head object */
+  ret = cloud_tier_get_object(tier_ctx, true, headers);
+
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to fetch HEAD from cloud for obj=" << tier_ctx.obj << " , ret = " << ret << dendl;
+    return ret;
+  }
+
+  already_tiered = is_already_tiered(tier_ctx.dpp, headers, tier_ctx.o.meta.mtime);
+
+  if (already_tiered) {
+    ldpp_dout(tier_ctx.dpp, 20) << "is_already_tiered true" << dendl;
+  } else {
+    ldpp_dout(tier_ctx.dpp, 20) << "is_already_tiered false..going with out_crf writing" << dendl;
+  }
+
+  return ret;
+}
+
+static int cloud_tier_create_bucket(RGWLCCloudTierCtx& tier_ctx) {
+  bufferlist out_bl;
+  int ret = 0;
+  pair<string, string> key(tier_ctx.storage_class, tier_ctx.target_bucket_name);
+  struct CreateBucketResult {
+    std::string code;
+
+    void decode_xml(XMLObj *obj) {
+      RGWXMLDecoder::decode_xml("Code", code, obj);
+    }
+  } result;
+
+  ldpp_dout(tier_ctx.dpp, 30) << "Cloud_tier_ctx: creating bucket:" << tier_ctx.target_bucket_name << dendl;
+  bufferlist bl;
+  string resource = tier_ctx.target_bucket_name;
+
+  ret = tier_ctx.conn.send_resource(tier_ctx.dpp, "PUT", resource, nullptr, nullptr,
+                                    out_bl, &bl, nullptr, null_yield);
+
+  if (ret < 0 ) {
+    ldpp_dout(tier_ctx.dpp, 0) << "create target bucket : " << tier_ctx.target_bucket_name << " returned ret:" << ret << dendl;
+  }
+  if (out_bl.length() > 0) {
+    RGWXMLDecoder::XMLParser parser;
+    if (!parser.init()) {
+      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize xml parser for parsing create_bucket response from server" << dendl;
+      return -EIO;
+    }
+
+    if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+      string str(out_bl.c_str(), out_bl.length());
+      ldpp_dout(tier_ctx.dpp, 5) << "ERROR: failed to parse xml createbucket: " << str << dendl;
+      return -EIO;
+    }
+
+    try {
+      RGWXMLDecoder::decode_xml("Error", result, &parser, true);
+    } catch (RGWXMLDecoder::err& err) {
+      string str(out_bl.c_str(), out_bl.length());
+      ldpp_dout(tier_ctx.dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+      return -EIO;
+    }
+
+    if (result.code != "BucketAlreadyOwnedByYou" && result.code != "BucketAlreadyExists") {
+      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: Creating target bucket failed with error: " << result.code << dendl;
+      return -EIO;
+    }
+  }
+
+  return 0;
+}
+
+/* Entry point: transfer one object to the cloud endpoint.
+ *
+ * Creates the target bucket on first use (memoized via cloud_targets),
+ * skips objects that another zone already tiered, then uploads either as a
+ * single PUT or as a multipart upload depending on size vs. the configured
+ * multipart_sync_threshold. Returns 0 on success (or if already tiered),
+ * negative errno on failure. */
+int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets) {
+  int ret = 0;
+
+  // check if target_path is already created
+  std::set<std::string>::iterator it;
+
+  it = cloud_targets.find(tier_ctx.target_bucket_name);
+  tier_ctx.target_bucket_created = (it != cloud_targets.end());
+
+  /* If run first time attempt to create the target bucket */
+  if (!tier_ctx.target_bucket_created) {
+    ret = cloud_tier_create_bucket(tier_ctx);
+
+    if (ret < 0) {
+      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to create target bucket on the cloud endpoint ret=" << ret << dendl;
+      return ret;
+    }
+    tier_ctx.target_bucket_created = true;
+    cloud_targets.insert(tier_ctx.target_bucket_name);
+  }
+
+  /* Since multiple zones may try to transition the same object to the cloud,
+   * verify if the object is already transitioned. And since its just a best
+   * effort, do not bail out in case of any errors.
+   */
+  bool already_tiered = false;
+  ret = cloud_tier_check_object(tier_ctx, already_tiered);
+
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to check object on the cloud endpoint ret=" << ret << dendl;
+  }
+
+  if (already_tiered) {
+    ldpp_dout(tier_ctx.dpp, 20) << "Object (" << tier_ctx.o.key << ") is already tiered" << dendl;
+    return 0;
+  }
+
+  uint64_t size = tier_ctx.o.meta.size;
+  uint64_t multipart_sync_threshold = tier_ctx.multipart_sync_threshold;
+
+  // never go below the S3 minimum part size, even if misconfigured
+  if (multipart_sync_threshold < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
+    multipart_sync_threshold = MULTIPART_MIN_POSSIBLE_PART_SIZE;
+  }
+
+  if (size < multipart_sync_threshold) {
+    ret = cloud_tier_plain_transfer(tier_ctx);
+  } else {
+    tier_ctx.is_multipart_upload = true;
+    ret = cloud_tier_multipart_transfer(tier_ctx);
+  } 
+
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to transition object ret=" << ret << dendl;
+  }
+
+  return ret;
+}
diff --git a/src/rgw/driver/rados/rgw_lc_tier.h b/src/rgw/driver/rados/rgw_lc_tier.h
new file mode 100644 (file)
index 0000000..1b21f26
--- /dev/null
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_LC_TIER_H
+#define CEPH_RGW_LC_TIER_H
+
+#include "rgw_lc.h"
+#include "rgw_rest_conn.h"
+#include "rgw_rados.h"
+#include "rgw_zone.h"
+#include "rgw_sal_rados.h"
+#include "rgw_cr_rest.h"
+
+#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
+/* Smallest part size S3 accepts for a (non-final) multipart part: 5 MiB */
+#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024)
+
+/* Context bundling everything needed to transition one object from a local
+ * RADOS-backed bucket to a remote cloud (S3-compatible) endpoint. */
+struct RGWLCCloudTierCtx {
+  CephContext *cct;
+  const DoutPrefixProvider *dpp;
+
+  /* Source */
+  rgw_bucket_dir_entry& o;        // bucket-index entry of the object being tiered
+  rgw::sal::Driver *driver;
+  RGWBucketInfo& bucket_info;
+  std::string storage_class;
+
+  rgw::sal::Object *obj;          // handle to the source object
+
+  /* Remote */
+  RGWRESTConn& conn;              // REST connection to the cloud endpoint
+  std::string target_bucket_name;
+  std::string target_storage_class;
+
+  std::map<std::string, RGWTierACLMapping> acl_mappings;
+  uint64_t multipart_min_part_size;
+  uint64_t multipart_sync_threshold;   // objects >= this size use multipart upload
+
+  bool is_multipart_upload{false};
+  // NOTE(review): defaults to true; callers reset it from their cache of
+  // created buckets before use — confirm no path relies on the default.
+  bool target_bucket_created{true};
+
+  RGWLCCloudTierCtx(CephContext* _cct, const DoutPrefixProvider *_dpp,
+      rgw_bucket_dir_entry& _o, rgw::sal::Driver *_driver,
+      RGWBucketInfo &_binfo, rgw::sal::Object *_obj,
+      RGWRESTConn& _conn, std::string& _bucket,
+      std::string& _storage_class) :
+    cct(_cct), dpp(_dpp), o(_o), driver(_driver), bucket_info(_binfo),
+    obj(_obj), conn(_conn), target_bucket_name(_bucket),
+    target_storage_class(_storage_class) {}
+};
+
+/* Transition object to cloud endpoint */
+int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets);
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_log_backing.cc b/src/rgw/driver/rados/rgw_log_backing.cc
new file mode 100644 (file)
index 0000000..7c9dafe
--- /dev/null
@@ -0,0 +1,708 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "cls/log/cls_log_client.h"
+#include "cls/version/cls_version_client.h"
+
+#include "rgw_log_backing.h"
+#include "rgw_tools.h"
+#include "cls_fifo_legacy.h"
+
+using namespace std::chrono_literals;
+namespace cb = ceph::buffer;
+
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+enum class shard_check { dne, omap, fifo, corrupt };
+/// Stream a shard_check for logging; unknown values print their raw integer.
+inline std::ostream& operator <<(std::ostream& m, const shard_check& t) {
+  const char* name = nullptr;
+  switch (t) {
+  case shard_check::dne:     name = "shard_check::dne";     break;
+  case shard_check::omap:    name = "shard_check::omap";    break;
+  case shard_check::fifo:    name = "shard_check::fifo";    break;
+  case shard_check::corrupt: name = "shard_check::corrupt"; break;
+  }
+  if (name) {
+    return m << name;
+  }
+  // Out-of-range value (e.g. decoded from a corrupt source)
+  return m << "shard_check::UNKNOWN=" << static_cast<uint32_t>(t);
+}
+
+namespace {
+/// Probe the backing type (FIFO vs OMAP) of one log shard object.
+/// Attempts a probing FIFO open: success => FIFO; -ENODATA => empty, treat
+/// as OMAP; -ENOENT => shard does not exist; -EPERM => OSD lacks FIFO
+/// support, so set the sticky fifo_unsupported flag and report OMAP; any
+/// other error => corrupt. Once fifo_unsupported is set, OMAP is returned
+/// without probing.
+shard_check
+probe_shard(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+           bool& fifo_unsupported, optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << " probing oid=" << oid
+                    << dendl;
+  if (!fifo_unsupported) {
+    std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
+    auto r = rgw::cls::fifo::FIFO::open(dpp, ioctx, oid,
+                                       &fifo, y,
+                                       std::nullopt, true);
+    switch (r) {
+    case 0:
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << ": oid=" << oid << " is FIFO"
+                        << dendl;
+      return shard_check::fifo;
+
+    case -ENODATA:
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << ": oid=" << oid << " is empty and therefore OMAP"
+                        << dendl;
+      return shard_check::omap;
+
+    case -ENOENT:
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << ": oid=" << oid << " does not exist"
+                        << dendl;
+      return shard_check::dne;
+
+    case -EPERM:
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << ": FIFO is unsupported, marking."
+                        << dendl;
+      fifo_unsupported = true;
+      return shard_check::omap;
+
+    default:
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                        << ": error probing: r=" << r
+                        << ", oid=" << oid << dendl;
+      return shard_check::corrupt;
+    }
+  } else {
+    // Since FIFO is unsupported, OMAP is the only alternative
+    return shard_check::omap;
+  }
+}
+
+/// Decide the backing type when none of the shards exist yet. If the
+/// configured default is FIFO and the OSD supports it, eagerly create the
+/// FIFO for the given (shard 0) oid so later opens succeed; if FIFO is
+/// unsupported, warn and fall back to OMAP.
+tl::expected<log_type, bs::error_code>
+handle_dne(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx,
+          log_type def,
+          std::string oid,
+          bool fifo_unsupported,
+          optional_yield y)
+{
+  if (def == log_type::fifo) {
+    if (fifo_unsupported) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " WARNING: FIFO set as default but not supported by OSD. "
+                << "Falling back to OMAP." << dendl;
+      return log_type::omap;
+    }
+    std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
+    auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid,
+                                         &fifo, y,
+                                         std::nullopt);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " error creating FIFO: r=" << r
+                << ", oid=" << oid << dendl;
+      return tl::unexpected(bs::error_code(-r, bs::system_category()));
+    }
+  }
+  return def;
+}
+} // anonymous namespace
+
+/// Determine the backing type (FIFO or OMAP) of a sharded log by probing
+/// every shard. All existing shards must agree; a mix (or a corrupt shard)
+/// is reported as EIO. If no shard exists, defer to handle_dne() which may
+/// create the default backing.
+tl::expected<log_type, bs::error_code>
+log_backing_type(const DoutPrefixProvider *dpp, 
+                 librados::IoCtx& ioctx,
+                log_type def,
+                int shards,
+                const fu2::unique_function<std::string(int) const>& get_oid,
+                optional_yield y)
+{
+  auto check = shard_check::dne;
+  bool fifo_unsupported = false;
+  for (int i = 0; i < shards; ++i) {
+    auto c = probe_shard(dpp, ioctx, get_oid(i), fifo_unsupported, y);
+    if (c == shard_check::corrupt)
+      return tl::unexpected(bs::error_code(EIO, bs::system_category()));
+    if (c == shard_check::dne) continue;
+    // first existing shard fixes the expected type
+    if (check == shard_check::dne) {
+      check = c;
+      continue;
+    }
+
+    if (check != c) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << " clashing types: check=" << check
+                << ", c=" << c << dendl;
+      return tl::unexpected(bs::error_code(EIO, bs::system_category()));
+    }
+  }
+  if (check == shard_check::corrupt) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << " should be unreachable!" << dendl;
+    return tl::unexpected(bs::error_code(EIO, bs::system_category()));
+  }
+
+  if (check == shard_check::dne)
+    return handle_dne(dpp, ioctx,
+                     def,
+                     get_oid(0),
+                     fifo_unsupported,
+                     y);
+
+  return (check == shard_check::fifo ? log_type::fifo : log_type::omap);
+}
+
+/// Remove every shard of a sharded log, handling both FIFO and OMAP
+/// backings. For FIFO shards the part objects are deleted first. With
+/// leave_zero, shard 0 itself is kept (emptied of data, omap and omap
+/// header) because sync rendezvouses with cls_lock state stored in its
+/// xattrs. Best-effort: the FIRST error encountered is remembered and
+/// returned, but removal continues for the remaining shards.
+bs::error_code log_remove(const DoutPrefixProvider *dpp, 
+                          librados::IoCtx& ioctx,
+                         int shards,
+                         const fu2::unique_function<std::string(int) const>& get_oid,
+                         bool leave_zero,
+                         optional_yield y)
+{
+  bs::error_code ec;
+  for (int i = 0; i < shards; ++i) {
+    auto oid = get_oid(i);
+    rados::cls::fifo::info info;
+    uint32_t part_header_size = 0, part_entry_overhead = 0;
+
+    auto r = rgw::cls::fifo::get_meta(dpp, ioctx, oid, std::nullopt, &info,
+                                     &part_header_size, &part_entry_overhead,
+                                     0, y, true);
+    if (r == -ENOENT) continue;
+    // FIFO shard: delete each part object in [tail, head]
+    if (r == 0 && info.head_part_num > -1) {
+      for (auto j = info.tail_part_num; j <= info.head_part_num; ++j) {
+       librados::ObjectWriteOperation op;
+       op.remove();
+       auto part_oid = info.part_oid(j);
+       auto subr = rgw_rados_operate(dpp, ioctx, part_oid, &op, null_yield);
+       if (subr < 0 && subr != -ENOENT) {
+         if (!ec)
+           ec = bs::error_code(-subr, bs::system_category());
+         ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << ": failed removing FIFO part: part_oid=" << part_oid
+                    << ", subr=" << subr << dendl;
+       }
+      }
+    }
+    // -ENODATA just means the shard is OMAP, not an error
+    if (r < 0 && r != -ENODATA) {
+      if (!ec)
+       ec = bs::error_code(-r, bs::system_category());
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": failed checking FIFO part: oid=" << oid
+                << ", r=" << r << dendl;
+    }
+    librados::ObjectWriteOperation op;
+    if (i == 0 && leave_zero) {
+      // Leave shard 0 in existence, but remove contents and
+      // omap. cls_lock stores things in the xattrs. And sync needs to
+      // rendezvous with locks on generation 0 shard 0.
+      op.omap_set_header({});
+      op.omap_clear();
+      op.truncate(0);
+    } else {
+      op.remove();
+    }
+    r = rgw_rados_operate(dpp, ioctx, oid, &op, null_yield);
+    if (r < 0 && r != -ENOENT) {
+      if (!ec)
+       ec = bs::error_code(-r, bs::system_category());
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": failed removing shard: oid=" << oid
+                << ", r=" << r << dendl;
+    }
+  }
+  return ec;
+}
+
+/// Tear down the watch established by watch(), if any. A failed unwatch is
+/// only logged — nothing more can be done in a destructor.
+logback_generations::~logback_generations() {
+  if (watchcookie > 0) {
+    auto cct = static_cast<CephContext*>(ioctx.cct());
+    auto r = ioctx.unwatch2(watchcookie);
+    if (r < 0) {
+      lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": failed unwatching oid=" << oid
+                << ", r=" << r << dendl;
+    }
+  }
+}
+
+bs::error_code logback_generations::setup(const DoutPrefixProvider *dpp,
+                                          log_type def,
+                                         optional_yield y) noexcept
+{
+  try {
+    // First, read.
+    auto cct = static_cast<CephContext*>(ioctx.cct());
+    auto res = read(dpp, y);
+    if (!res && res.error() != bs::errc::no_such_file_or_directory) {
+      return res.error();
+    }
+    if (res) {
+      std::unique_lock lock(m);
+      std::tie(entries_, version) = std::move(*res);
+    } else {
+      // Are we the first? Then create generation 0 and the generations
+      // metadata.
+      librados::ObjectWriteOperation op;
+      auto type = log_backing_type(dpp, ioctx, def, shards,
+                                  [this](int shard) {
+                                    return this->get_oid(0, shard);
+                                  }, y);
+      if (!type)
+       return type.error();
+
+      logback_generation l;
+      l.type = *type;
+
+      std::unique_lock lock(m);
+      version.ver = 1;
+      static constexpr auto TAG_LEN = 24;
+      version.tag.clear();
+      append_rand_alpha(cct, version.tag, version.tag, TAG_LEN);
+      op.create(true);
+      cls_version_set(op, version);
+      cb::list bl;
+      entries_.emplace(0, std::move(l));
+      encode(entries_, bl);
+      lock.unlock();
+
+      op.write_full(bl);
+      auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+      if (r < 0 && r != -EEXIST) {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << ": failed writing oid=" << oid
+                  << ", r=" << r << dendl;
+       bs::system_error(-r, bs::system_category());
+      }
+      // Did someone race us? Then re-read.
+      if (r != 0) {
+       res = read(dpp, y);
+       if (!res)
+         return res.error();
+       if (res->first.empty())
+         return bs::error_code(EIO, bs::system_category());
+       auto l = res->first.begin()->second;
+       // In the unlikely event that someone raced us, created
+       // generation zero, incremented, then erased generation zero,
+       // don't leave generation zero lying around.
+       if (l.gen_id != 0) {
+         auto ec = log_remove(dpp, ioctx, shards,
+                              [this](int shard) {
+                                return this->get_oid(0, shard);
+                              }, true, y);
+         if (ec) return ec;
+       }
+       std::unique_lock lock(m);
+       std::tie(entries_, version) = std::move(*res);
+      }
+    }
+    // Pass all non-empty generations to the handler
+    std::unique_lock lock(m);
+    auto i = lowest_nomempty(entries_);
+    entries_t e;
+    std::copy(i, entries_.cend(),
+             std::inserter(e, e.end()));
+    m.unlock();
+    auto ec = watch();
+    if (ec) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": failed to re-establish watch, unsafe to continue: oid="
+                << oid << ", ec=" << ec.message() << dendl;
+    }
+    return handle_init(std::move(e));
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+}
+
+/// Refresh cached generation state from RADOS. Validates that the freshly
+/// read map is consistent with the cached one (non-empty, has an active
+/// head, tail/head only move forward), installs it, then notifies the
+/// subclass: handle_empty_to() for newly emptied generations and
+/// handle_new_gens() for generations added past our previous head.
+bs::error_code logback_generations::update(const DoutPrefixProvider *dpp, optional_yield y) noexcept
+{
+  try {
+    auto res = read(dpp, y);
+    if (!res) {
+      return res.error();
+    }
+
+    std::unique_lock l(m);
+    auto& [es, v] = *res;
+    if (v == version) {
+      // Nothing to do!
+      return {};
+    }
+
+    // Check consistency and prepare update
+    if (es.empty()) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": INCONSISTENCY! Read empty update." << dendl;
+      return bs::error_code(EFAULT, bs::system_category());
+    }
+    auto cur_lowest = lowest_nomempty(entries_);
+    // Straight up can't happen
+    assert(cur_lowest != entries_.cend());
+    auto new_lowest = lowest_nomempty(es);
+    if (new_lowest == es.cend()) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": INCONSISTENCY! Read update with no active head." << dendl;
+      return bs::error_code(EFAULT, bs::system_category());
+    }
+    if (new_lowest->first < cur_lowest->first) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": INCONSISTENCY! Tail moved wrong way." << dendl;
+      return bs::error_code(EFAULT, bs::system_category());
+    }
+
+    // Highest generation that became empty since our last view, if any
+    std::optional<uint64_t> highest_empty;
+    if (new_lowest->first > cur_lowest->first && new_lowest != es.begin()) {
+      --new_lowest;
+      highest_empty = new_lowest->first;
+    }
+
+    entries_t new_entries;
+
+    if ((es.end() - 1)->first < (entries_.end() - 1)->first) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": INCONSISTENCY! Head moved wrong way." << dendl;
+      return bs::error_code(EFAULT, bs::system_category());
+    }
+
+    // Collect generations beyond our previous head for handle_new_gens()
+    if ((es.end() - 1)->first > (entries_.end() - 1)->first) {
+      auto ei = es.lower_bound((entries_.end() - 1)->first + 1);
+      std::copy(ei, es.end(), std::inserter(new_entries, new_entries.end()));
+    }
+
+    // Everything checks out!
+
+    version = v;
+    entries_ = es;
+    l.unlock();
+
+    // Callbacks run unlocked
+    if (highest_empty) {
+      auto ec = handle_empty_to(*highest_empty);
+      if (ec) return ec;
+    }
+
+    if (!new_entries.empty()) {
+      auto ec = handle_new_gens(std::move(new_entries));
+      if (ec) return ec;
+    }
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+  return {};
+}
+
+/// Read the generations object from RADOS: asserts (via cls_version) that
+/// the stored version is >= our cached one, then returns the decoded
+/// entries map together with the stored object version. Does not modify
+/// any member state.
+auto logback_generations::read(const DoutPrefixProvider *dpp, optional_yield y) noexcept ->
+  tl::expected<std::pair<entries_t, obj_version>, bs::error_code>
+{
+  try {
+    librados::ObjectReadOperation op;
+    // only take the lock long enough to snapshot our cached version
+    std::unique_lock l(m);
+    cls_version_check(op, version, VER_COND_GE);
+    l.unlock();
+    obj_version v2;
+    cls_version_read(op, &v2);
+    cb::list bl;
+    op.read(0, 0, &bl, nullptr);
+    auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+    if (r < 0) {
+      if (r == -ENOENT) {
+       ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << ": oid=" << oid
+                     << " not found" << dendl;
+      } else {
+       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                  << ": failed reading oid=" << oid
+                  << ", r=" << r << dendl;
+      }
+      return tl::unexpected(bs::error_code(-r, bs::system_category()));
+    }
+    auto bi = bl.cbegin();
+    entries_t e;
+    try {
+      decode(e, bi);
+    } catch (const cb::error& err) {
+      return tl::unexpected(err.code());
+    }
+    return std::pair{ std::move(e), std::move(v2) };
+  } catch (const std::bad_alloc&) {
+    return tl::unexpected(bs::error_code(ENOMEM, bs::system_category()));
+  }
+}
+
+/// Atomically write a new entries map, guarded by cls_version so a
+/// concurrent writer causes -ECANCELED. Takes ownership of the caller's
+/// lock on m (must be held on entry). On success, installs e and bumps the
+/// cached version; on -ECANCELED, refreshes state via update() and reports
+/// ECANCELED so the caller can retry.
+bs::error_code logback_generations::write(const DoutPrefixProvider *dpp, entries_t&& e,
+                                         std::unique_lock<std::mutex>&& l_,
+                                         optional_yield y) noexcept
+{
+  auto l = std::move(l_);
+  ceph_assert(l.mutex() == &m &&
+             l.owns_lock());
+  try {
+    librados::ObjectWriteOperation op;
+    cls_version_check(op, version, VER_COND_GE);
+    cb::list bl;
+    encode(e, bl);
+    op.write_full(bl);
+    cls_version_inc(op);
+    auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+    if (r == 0) {
+      // success: commit locally while still holding the lock
+      entries_ = std::move(e);
+      version.inc();
+      return {};
+    }
+    l.unlock();
+    if (r < 0 && r != -ECANCELED) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": failed reading oid=" << oid
+                << ", r=" << r << dendl;
+      return { -r, bs::system_category() };
+    }
+    if (r == -ECANCELED) {
+      // lost a race with another writer: resync, then tell the caller
+      auto ec = update(dpp, y);
+      if (ec) {
+       return ec;
+      } else {
+       return { ECANCELED, bs::system_category() };
+      }
+    }
+  } catch (const std::bad_alloc&) {
+    return { ENOMEM, bs::system_category() };
+  }
+  return {};
+}
+
+
+bs::error_code logback_generations::watch() noexcept {
+  try {
+    auto cct = static_cast<CephContext*>(ioctx.cct());
+    auto r = ioctx.watch2(oid, &watchcookie, this);
+    if (r < 0) {
+      lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": failed to set watch oid=" << oid
+                << ", r=" << r << dendl;
+      return { -r, bs::system_category() };
+    }
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+  return {};
+}
+
+bs::error_code logback_generations::new_backing(const DoutPrefixProvider *dpp, 
+                                                log_type type,
+                                               optional_yield y) noexcept {
+  static constexpr auto max_tries = 10;
+  try {
+    auto ec = update(dpp, y);
+    if (ec) return ec;
+    auto tries = 0;
+    entries_t new_entries;
+    do {
+      std::unique_lock l(m);
+      auto last = entries_.end() - 1;
+      if (last->second.type == type) {
+       // Nothing to be done
+       return {};
+      }
+      auto newgenid = last->first + 1;
+      logback_generation newgen;
+      newgen.gen_id = newgenid;
+      newgen.type = type;
+      new_entries.emplace(newgenid, newgen);
+      auto es = entries_;
+      es.emplace(newgenid, std::move(newgen));
+      ec = write(dpp, std::move(es), std::move(l), y);
+      ++tries;
+    } while (ec == bs::errc::operation_canceled &&
+            tries < max_tries);
+    if (tries >= max_tries) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": exhausted retry attempts." << dendl;
+      return ec;
+    }
+
+    if (ec) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": write failed with ec=" << ec.message() << dendl;
+      return ec;
+    }
+
+    cb::list bl, rbl;
+
+    auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": notify failed with r=" << r << dendl;
+      return { -r, bs::system_category() };
+    }
+    ec = handle_new_gens(new_entries);
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+  return {};
+}
+
+bs::error_code logback_generations::empty_to(const DoutPrefixProvider *dpp, 
+                                             uint64_t gen_id,
+                                            optional_yield y) noexcept {
+  static constexpr auto max_tries = 10;
+  try {
+    auto ec = update(dpp, y);
+    if (ec) return ec;
+    auto tries = 0;
+    uint64_t newtail = 0;
+    do {
+      std::unique_lock l(m);
+      {
+       auto last = entries_.end() - 1;
+       if (gen_id >= last->first) {
+         ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                    << ": Attempt to trim beyond the possible." << dendl;
+         return bs::error_code(EINVAL, bs::system_category());
+       }
+      }
+      auto es = entries_;
+      auto ei = es.upper_bound(gen_id);
+      if (ei == es.begin()) {
+       // Nothing to be done.
+       return {};
+      }
+      for (auto i = es.begin(); i < ei; ++i) {
+       newtail = i->first;
+       i->second.pruned = ceph::real_clock::now();
+      }
+      ec = write(dpp, std::move(es), std::move(l), y);
+      ++tries;
+    } while (ec == bs::errc::operation_canceled &&
+            tries < max_tries);
+    if (tries >= max_tries) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": exhausted retry attempts." << dendl;
+      return ec;
+    }
+
+    if (ec) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": write failed with ec=" << ec.message() << dendl;
+      return ec;
+    }
+
+    cb::list bl, rbl;
+
+    auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": notify failed with r=" << r << dendl;
+      return { -r, bs::system_category() };
+    }
+    ec = handle_empty_to(newtail);
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+  return {};
+}
+
+bs::error_code logback_generations::remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept {
+  static constexpr auto max_tries = 10;
+  try {
+    auto ec = update(dpp, y);
+    if (ec) return ec;
+    auto tries = 0;
+    entries_t new_entries;
+    std::unique_lock l(m);
+    ceph_assert(!entries_.empty());
+    {
+      auto i = lowest_nomempty(entries_);
+      if (i == entries_.begin()) {
+       return {};
+      }
+    }
+    entries_t es;
+    auto now = ceph::real_clock::now();
+    l.unlock();
+    do {
+      std::copy_if(entries_.cbegin(), entries_.cend(),
+                  std::inserter(es, es.end()),
+                  [now](const auto& e) {
+                    if (!e.second.pruned)
+                      return false;
+
+                    auto pruned = *e.second.pruned;
+                    return (now - pruned) >= 1h;
+                  });
+      auto es2 = entries_;
+      for (const auto& [gen_id, e] : es) {
+       ceph_assert(e.pruned);
+       auto ec = log_remove(dpp, ioctx, shards,
+                            [this, gen_id = gen_id](int shard) {
+                              return this->get_oid(gen_id, shard);
+                            }, (gen_id == 0), y);
+       if (ec) {
+         ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                            << ": Error pruning: gen_id=" << gen_id
+                            << " ec=" << ec.message() << dendl;
+       }
+       if (auto i = es2.find(gen_id); i != es2.end()) {
+         es2.erase(i);
+       }
+      }
+      l.lock();
+      es.clear();
+      ec = write(dpp, std::move(es2), std::move(l), y);
+      ++tries;
+    } while (ec == bs::errc::operation_canceled &&
+            tries < max_tries);
+    if (tries >= max_tries) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": exhausted retry attempts." << dendl;
+      return ec;
+    }
+
+    if (ec) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                << ": write failed with ec=" << ec.message() << dendl;
+      return ec;
+    }
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+  return {};
+}
+
+// Watch callback: another writer changed the generations object.
+// Refresh our cached view (unless the notify originated from us), then
+// always acknowledge the notification so the notifier is not stalled.
+void logback_generations::handle_notify(uint64_t notify_id,
+                                       uint64_t cookie,
+                                       uint64_t notifier_id,
+                                       bufferlist& bl)
+{
+  auto cct = static_cast<CephContext*>(ioctx.cct());
+  const DoutPrefix dp(cct, dout_subsys, "logback generations handle_notify: ");
+  if (notifier_id != my_id) {
+    auto ec = update(&dp, null_yield);
+    if (ec) {
+      // A stale generation map could silently misroute log writes, and a
+      // watch callback has no caller to return an error to, so aborting
+      // is the only safe option.
+      lderr(cct)
+       << __PRETTY_FUNCTION__ << ":" << __LINE__
+       << ": update failed, no one to report to and no safe way to continue."
+       << dendl;
+      abort();
+    }
+  }
+  cb::list rbl;
+  ioctx.notify_ack(oid, notify_id, watchcookie, rbl);
+}
+
+void logback_generations::handle_error(uint64_t cookie, int err) {
+  auto cct = static_cast<CephContext*>(ioctx.cct());
+  auto r = ioctx.unwatch2(watchcookie);
+  if (r < 0) {
+    lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << ": failed to set unwatch oid=" << oid
+              << ", r=" << r << dendl;
+  }
+
+  auto ec = watch();
+  if (ec) {
+    lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+              << ": failed to re-establish watch, unsafe to continue: oid="
+              << oid << ", ec=" << ec.message() << dendl;
+  }
+}
diff --git a/src/rgw/driver/rados/rgw_log_backing.h b/src/rgw/driver/rados/rgw_log_backing.h
new file mode 100644 (file)
index 0000000..3fa67d7
--- /dev/null
@@ -0,0 +1,399 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_LOGBACKING_H
+#define CEPH_RGW_LOGBACKING_H
+
+#include <optional>
+#include <iostream>
+#include <string>
+#include <string_view>
+
+#include <strings.h>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/system/error_code.hpp>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include <fmt/format.h>
+
+#include "include/rados/librados.hpp"
+#include "include/encoding.h"
+#include "include/expected.hpp"
+#include "include/function2.hpp"
+
+#include "cls/version/cls_version_types.h"
+
+#include "common/async/yield_context.h"
+#include "common/Formatter.h"
+#include "common/strtol.h"
+
+namespace bc = boost::container;
+namespace bs = boost::system;
+
+#include "cls_fifo_legacy.h"
+
+/// Type of log backing, stored in the mark used in the quick check,
+/// and passed to checking functions.
+enum class log_type {
+  omap = 0,  // log shards backed by omap objects (legacy)
+  fifo = 1   // log shards backed by cls_fifo
+};
+
+// Serialize a log_type as a single byte; must mirror decode() below.
+inline void encode(const log_type& type, ceph::buffer::list& bl) {
+  const auto raw = static_cast<uint8_t>(type);
+  encode(raw, bl);
+}
+
+// Deserialize the single byte written by encode() above.
+inline void decode(log_type& type, bufferlist::const_iterator& bl) {
+  uint8_t raw = 0;
+  decode(raw, bl);
+  type = static_cast<log_type>(raw);
+}
+
+// Parse a backing-type name, case-insensitively.
+//
+// Returns std::nullopt for unrecognized input. The comparison is bounded
+// by the input length, so unambiguous prefixes ("om", "fif") are accepted
+// as well.
+inline std::optional<log_type> to_log_type(std::string_view s) {
+  if (s.empty()) {
+    // strncasecmp with a length of zero compares equal to anything, which
+    // would otherwise make the empty string parse as log_type::omap.
+    return std::nullopt;
+  }
+  if (strncasecmp(s.data(), "omap", s.length()) == 0) {
+    return log_type::omap;
+  } else if (strncasecmp(s.data(), "fifo", s.length()) == 0) {
+    return log_type::fifo;
+  } else {
+    return std::nullopt;
+  }
+}
+// Render the enumerator name; out-of-range values fall through to a
+// diagnostic form carrying the raw integer.
+inline std::ostream& operator <<(std::ostream& m, const log_type& t) {
+  if (t == log_type::omap) {
+    return m << "log_type::omap";
+  }
+  if (t == log_type::fifo) {
+    return m << "log_type::fifo";
+  }
+  return m << "log_type::UNKNOWN=" << static_cast<uint32_t>(t);
+}
+
+/// Look over the shards in a log and determine the type.
+///
+/// `def` is the type to report when the log cannot be classified from its
+/// shards; errors surface through the unexpected branch of the return.
+tl::expected<log_type, bs::error_code>
+log_backing_type(const DoutPrefixProvider *dpp, 
+                 librados::IoCtx& ioctx,
+                log_type def,
+                int shards, //< Total number of shards
+                /// A function taking a shard number and
+                /// returning an oid.
+                const fu2::unique_function<std::string(int) const>& get_oid,
+                optional_yield y);
+
+/// Remove all log shards and associated parts of fifos.
+///
+/// NOTE(review): the call in logback_generations::remove_empty passes a
+/// leading `const DoutPrefixProvider *dpp` argument that this declaration
+/// lacks — confirm whether this dpp-less overload is still defined or used
+/// anywhere.
+bs::error_code log_remove(librados::IoCtx& ioctx,
+                         int shards, //< Total number of shards
+                         /// A function taking a shard number and
+                         /// returning an oid.
+                         const fu2::unique_function<std::string(int) const>& get_oid,
+                         bool leave_zero,
+                         optional_yield y);
+
+
+/// One generation of a sharded log: its id, backing type, and (once all
+/// entries have been trimmed) the time it was marked empty.
+struct logback_generation {
+  uint64_t gen_id = 0;            // monotonically increasing generation id
+  log_type type;                  // backing store for this generation
+  std::optional<ceph::real_time> pruned;  // set once the generation is empty
+
+  // Versioned encoding (v1); keep encode/decode in lock-step.
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(gen_id, bl);
+    encode(type, bl);
+    encode(pruned, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(gen_id, bl);
+    decode(type, bl);
+    decode(pruned, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(logback_generation)
+// Debug format: [<gen_id>,<type>,PRUNED|NOT PRUNED]
+inline std::ostream& operator <<(std::ostream& m, const logback_generation& g) {
+  const char* pruned_str = g.pruned ? "PRUNED" : "NOT PRUNED";
+  m << "[" << g.gen_id << "," << g.type << "," << pruned_str << "]";
+  return m;
+}
+
+/// Maintains the set of log generations for a sharded log, kept consistent
+/// across radosgw instances through a RADOS object guarded by watch/notify
+/// plus cls_version optimistic concurrency. Subclasses react to changes via
+/// the handle_* callbacks.
+class logback_generations : public librados::WatchCtx2 {
+public:
+  using entries_t = bc::flat_map<uint64_t, logback_generation>;
+
+protected:
+  librados::IoCtx& ioctx;
+  logback_generations(librados::IoCtx& ioctx,
+                     std::string oid,
+                     fu2::unique_function<std::string(
+                       uint64_t, int) const>&& get_oid,
+                     int shards) noexcept
+    : ioctx(ioctx), oid(oid), get_oid(std::move(get_oid)),
+      shards(shards) {}
+
+    // Used by handle_notify() to ignore notifications we sent ourselves.
+    // NSDMI is safe: ioctx is declared (and thus initialized) first.
+    uint64_t my_id = ioctx.get_instance_id();
+
+private:
+  const std::string oid;     // control object holding the generation map
+  const fu2::unique_function<std::string(uint64_t, int) const> get_oid;
+
+protected:
+  const int shards;          // shard count per generation
+
+private:
+
+  uint64_t watchcookie = 0;  // cookie from watch2(), used for unwatch/ack
+
+  obj_version version;       // cls_version of our cached map
+  std::mutex m;              // guards version and entries_
+  entries_t entries_;        // cached copy of the on-disk generation map
+
+  tl::expected<std::pair<entries_t, obj_version>, bs::error_code>
+  read(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
+  // Consumes the lock; returns operation_canceled on version conflict.
+  bs::error_code write(const DoutPrefixProvider *dpp, entries_t&& e, std::unique_lock<std::mutex>&& l_,
+                      optional_yield y) noexcept;
+  bs::error_code setup(const DoutPrefixProvider *dpp, log_type def, optional_yield y) noexcept;
+
+  bs::error_code watch() noexcept;
+
+  // First generation that has not been marked pruned. Callers are expected
+  // to hold m when passing entries_.
+  auto lowest_nomempty(const entries_t& es) {
+    return std::find_if(es.begin(), es.end(),
+                       [](const auto& e) {
+                         return !e.second.pruned;
+                       });
+  }
+
+public:
+
+  /// For the use of watch/notify.
+
+  void handle_notify(uint64_t notify_id,
+                    uint64_t cookie,
+                    uint64_t notifier_id,
+                    bufferlist& bl) override final;
+
+  void handle_error(uint64_t cookie, int err) override final;
+
+  /// Public interface
+
+  virtual ~logback_generations();
+
+  /// Factory: construct a T, run setup() (which loads or initializes the
+  /// generation map and establishes the watch), and return it on success.
+  template<typename T, typename... Args>
+  static tl::expected<std::unique_ptr<T>, bs::error_code>
+  init(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx_, std::string oid_,
+       fu2::unique_function<std::string(uint64_t, int) const>&& get_oid_,
+       int shards_, log_type def, optional_yield y,
+       Args&& ...args) noexcept {
+    try {
+      T* lgp = new T(ioctx_, std::move(oid_),
+                    std::move(get_oid_),
+                    shards_, std::forward<Args>(args)...);
+      std::unique_ptr<T> lg(lgp);
+      lgp = nullptr;
+      auto ec = lg->setup(dpp, def, y);
+      if (ec)
+       return tl::unexpected(ec);
+      // Obnoxiousness for C++ Compiler in Bionic Beaver
+      return tl::expected<std::unique_ptr<T>, bs::error_code>(std::move(lg));
+    } catch (const std::bad_alloc&) {
+      return tl::unexpected(bs::error_code(ENOMEM, bs::system_category()));
+    }
+  }
+
+  /// Re-read the generation map from RADOS and fire callbacks for changes.
+  bs::error_code update(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
+
+  // NOTE(review): returns a copy of entries_ without taking m — racy if
+  // called concurrently with update()/handle_notify(); confirm callers
+  // only use this from a safe context.
+  entries_t entries() const {
+    return entries_;
+  }
+
+  /// Append a new generation of the given backing type.
+  bs::error_code new_backing(const DoutPrefixProvider *dpp, log_type type, optional_yield y) noexcept;
+
+  /// Mark generations up to and including gen_id as empty.
+  bs::error_code empty_to(const DoutPrefixProvider *dpp, uint64_t gen_id, optional_yield y) noexcept;
+
+  /// Delete generations that have been empty for long enough.
+  bs::error_code remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
+
+  // Callbacks, to be defined by descendant.
+
+  /// Handle initialization on startup
+  ///
+  /// @param e All non-empty generations
+  virtual bs::error_code handle_init(entries_t e) noexcept = 0;
+
+  /// Handle new generations.
+  ///
+  /// @param e Map of generations added since last update
+  virtual bs::error_code handle_new_gens(entries_t e) noexcept = 0;
+
+  /// Handle generations being marked empty
+  ///
+  /// @param new_tail Lowest non-empty generation
+  virtual bs::error_code handle_empty_to(uint64_t new_tail) noexcept = 0;
+};
+
+// Prefix a per-generation cursor with its generation id. Generation 0
+// cursors are passed through unprefixed for backward compatibility.
+inline std::string gencursor(uint64_t gen_id, std::string_view cursor) {
+  if (gen_id > 0) {
+    return fmt::format("G{:0>20}@{}", gen_id, cursor);
+  }
+  return std::string(cursor);
+}
+
+// Split a cursor produced by gencursor() back into (generation, cursor).
+//
+// Input without a well-formed "G<gen>@" prefix is treated as a plain
+// generation-0 cursor and returned unchanged.
+inline std::pair<uint64_t, std::string_view>
+cursorgen(std::string_view cursor_) {
+  if (cursor_.empty()) {
+    return { 0, "" };
+  }
+  std::string_view cursor = cursor_;
+  if (cursor[0] != 'G') {
+    return { 0, cursor };
+  }
+  cursor.remove_prefix(1);
+  auto gen_id = ceph::consume<uint64_t>(cursor);
+  // Guard the empty case ("G<digits>" with nothing after the digits):
+  // indexing an empty string_view is undefined behavior.
+  if (!gen_id || cursor.empty() || cursor[0] != '@') {
+    return { 0, cursor_ };
+  }
+  cursor.remove_prefix(1);
+  return { *gen_id, cursor };
+}
+
+/// Thin wrapper around rgw::cls::fifo::FIFO that defers creating/opening
+/// the FIFO until first use. All public methods first run lazy_init() and
+/// forward its error if the FIFO cannot be opened.
+class LazyFIFO {
+  librados::IoCtx& ioctx;
+  std::string oid;
+  std::mutex m;   // serializes lazy_init(); fifo is written under it
+  std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
+
+  // Open (or create) the FIFO on first use; idempotent and thread-safe.
+  // Returns 0 on success or FIFO::create's error code.
+  int lazy_init(const DoutPrefixProvider *dpp, optional_yield y) {
+    std::unique_lock l(m);
+    if (fifo) return 0;
+    auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid, &fifo, y);
+    if (r) {
+      fifo.reset();
+    }
+    return r;
+  }
+
+public:
+
+  LazyFIFO(librados::IoCtx& ioctx, std::string oid)
+    : ioctx(ioctx), oid(std::move(oid)) {}
+
+  int read_meta(const DoutPrefixProvider *dpp, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->read_meta(dpp, y);
+  }
+
+  // Copies the cached FIFO metadata into info.
+  int meta(const DoutPrefixProvider *dpp, rados::cls::fifo::info& info, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    info = fifo->meta();
+    return 0;
+  }
+
+  int get_part_layout_info(const DoutPrefixProvider *dpp, 
+                           std::uint32_t& part_header_size,
+                          std::uint32_t& part_entry_overhead,
+                          optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    std::tie(part_header_size, part_entry_overhead)
+      = fifo->get_part_layout_info();
+    return 0;
+  }
+
+  // Synchronous single-entry push.
+  int push(const DoutPrefixProvider *dpp, 
+           const ceph::buffer::list& bl,
+          optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->push(dpp, bl, y);
+  }
+
+  // Asynchronous single-entry push; completion reports the result.
+  int push(const DoutPrefixProvider *dpp, 
+           ceph::buffer::list& bl,
+          librados::AioCompletion* c,
+          optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->push(dpp, bl, c);
+    return 0;
+  }
+
+  // Synchronous batched push.
+  int push(const DoutPrefixProvider *dpp, 
+           const std::vector<ceph::buffer::list>& data_bufs,
+          optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->push(dpp, data_bufs, y);
+  }
+
+  // Asynchronous batched push.
+  int push(const DoutPrefixProvider *dpp, 
+            const std::vector<ceph::buffer::list>& data_bufs,
+           librados::AioCompletion* c,
+           optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->push(dpp, data_bufs, c);
+    return 0;
+  }
+
+  int list(const DoutPrefixProvider *dpp, 
+           int max_entries, std::optional<std::string_view> markstr,
+          std::vector<rgw::cls::fifo::list_entry>* out,
+          bool* more, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->list(dpp, max_entries, markstr, out, more, y);
+  }
+
+  int list(const DoutPrefixProvider *dpp, int max_entries, std::optional<std::string_view> markstr,
+          std::vector<rgw::cls::fifo::list_entry>* out, bool* more,
+          librados::AioCompletion* c, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->list(dpp, max_entries, markstr, out, more, c);
+    return 0;
+  }
+
+  int trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->trim(dpp, markstr, exclusive, y);
+  }
+
+  int trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, librados::AioCompletion* c,
+          optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->trim(dpp, markstr, exclusive, c);
+    return 0;
+  }
+
+  int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, rados::cls::fifo::part_header* header,
+                   optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->get_part_info(dpp, part_num, header, y);
+  }
+
+  int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, rados::cls::fifo::part_header* header,
+                   librados::AioCompletion* c, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->get_part_info(part_num, header, c);
+    return 0;
+  }
+
+  int get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function<
+                     void(int r, rados::cls::fifo::part_header&&)>&& f,
+                   librados::AioCompletion* c,
+                   optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->get_head_info(dpp, std::move(f), c);
+    return 0;
+  }
+};
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_metadata.cc b/src/rgw/driver/rados/rgw_metadata.cc
new file mode 100644 (file)
index 0000000..e3e4931
--- /dev/null
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_metadata.h"
+
+#include "rgw_zone.h"
+#include "rgw_mdlog.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_cls.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+const std::string RGWMetadataLogHistory::oid = "meta.history";
+
+struct obj_version;
+
+// Hash the key onto [0, max_shards); the shard number is appended to the
+// prefix to form the object name and, optionally, returned via shard_id.
+void rgw_shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
+{
+  const uint32_t hash = ceph_str_hash_linux(key.c_str(), key.size());
+  const unsigned shard = hash % max_shards;
+  if (shard_id) {
+    *shard_id = shard;
+  }
+  char buf[16];
+  snprintf(buf, sizeof(buf), "%u", shard);
+  name = prefix + buf;
+}
+
+// As above, but folds the section hash into the key hash so that the same
+// key in different sections (likely) maps to different shards.
+void rgw_shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
+{
+  uint32_t hash = ceph_str_hash_linux(key.c_str(), key.size());
+  hash ^= ceph_str_hash_linux(section.c_str(), section.size());
+  char buf[16];
+  snprintf(buf, sizeof(buf), "%u", (unsigned)(hash % max_shards));
+  name = prefix + buf;
+}
+
+// Direct form: the caller already knows the shard number.
+void rgw_shard_name(const string& prefix, unsigned shard_id, string& name)
+{
+  char buf[16];
+  snprintf(buf, sizeof(buf), "%u", shard_id);
+  name = prefix + buf;
+}
+
+int RGWMetadataLog::add_entry(const DoutPrefixProvider *dpp, const string& hash_key, const string& section, const string& key, bufferlist& bl) {
+  if (!svc.zone->need_to_log_metadata())
+    return 0;
+
+  string oid;
+  int shard_id;
+
+  rgw_shard_name(prefix, cct->_conf->rgw_md_log_max_shards, hash_key, oid, &shard_id);
+  mark_modified(shard_id);
+  real_time now = real_clock::now();
+  return svc.cls->timelog.add(dpp, oid, now, section, key, bl, null_yield);
+}
+
+// Compute the shard number for hash_key; the generated oid is discarded.
+int RGWMetadataLog::get_shard_id(const string& hash_key, int *shard_id)
+{
+  string unused_oid;
+  rgw_shard_name(prefix, cct->_conf->rgw_md_log_max_shards, hash_key, unused_oid, shard_id);
+  return 0;
+}
+
+// Asynchronously append a batch of pre-built entries to a specific shard;
+// the completion reports the outcome.
+int RGWMetadataLog::store_entries_in_shard(const DoutPrefixProvider *dpp, list<cls_log_entry>& entries, int shard_id, librados::AioCompletion *completion)
+{
+  mark_modified(shard_id);
+  string shard_oid;
+  rgw_shard_name(prefix, shard_id, shard_oid);
+  return svc.cls->timelog.add(dpp, shard_oid, entries, completion, false, null_yield);
+}
+
+void RGWMetadataLog::init_list_entries(int shard_id, const real_time& from_time, const real_time& end_time, 
+                                       const string& marker, void **handle)
+{
+  LogListCtx *ctx = new LogListCtx();
+
+  ctx->cur_shard = shard_id;
+  ctx->from_time = from_time;
+  ctx->end_time  = end_time;
+  ctx->marker    = marker;
+
+  get_shard_oid(ctx->cur_shard, ctx->cur_oid);
+
+  *handle = (void *)ctx;
+}
+
+// Release the iteration state allocated by init_list_entries().
+void RGWMetadataLog::complete_list_entries(void *handle) {
+  delete static_cast<LogListCtx *>(handle);
+}
+
+// List up to max_entries entries from the shard captured in handle,
+// advancing the stored marker. A missing shard object (-ENOENT) is
+// reported as an empty, non-truncated listing rather than an error.
+int RGWMetadataLog::list_entries(const DoutPrefixProvider *dpp, void *handle,
+                                int max_entries,
+                                list<cls_log_entry>& entries,
+                                string *last_marker,
+                                bool *truncated) {
+  LogListCtx *ctx = static_cast<LogListCtx *>(handle);
+
+  if (!max_entries) {
+    *truncated = false;
+    return 0;
+  }
+
+  std::string next_marker;
+  int ret = svc.cls->timelog.list(dpp, ctx->cur_oid, ctx->from_time, ctx->end_time,
+                                  max_entries, entries, ctx->marker,
+                                  &next_marker, truncated, null_yield);
+  if ((ret < 0) && (ret != -ENOENT))
+    return ret;
+
+  // Persist the resume point in the context and hand it to the caller.
+  ctx->marker = std::move(next_marker);
+  if (last_marker) {
+    *last_marker = ctx->marker;
+  }
+
+  if (ret == -ENOENT)
+    *truncated = false;
+
+  return 0;
+}
+
+// Fetch the max marker / last-update time of one shard. A missing shard
+// object (-ENOENT) yields a default (empty) header rather than an error.
+int RGWMetadataLog::get_info(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfo *info)
+{
+  string shard_oid;
+  get_shard_oid(shard_id, shard_oid);
+
+  cls_log_header header;
+  const int ret = svc.cls->timelog.info(dpp, shard_oid, &header, null_yield);
+  if (ret < 0 && ret != -ENOENT)
+    return ret;
+
+  info->marker = header.max_marker;
+  info->last_update = header.max_time.to_real_time();
+  return 0;
+}
+
+// librados completion trampoline: forwards the result to the
+// RGWMetadataLogInfoCompletion and drops the reference taken by
+// get_info_async().
+static void _mdlog_info_completion(librados::completion_t cb, void *arg)
+{
+  auto infoc = static_cast<RGWMetadataLogInfoCompletion *>(arg);
+  infoc->finish(cb);
+  infoc->put(); // drop the ref from get_info_async()
+}
+
+// Wire the librados AIO completion to the static trampoline above, passing
+// `this` as the callback argument; `cb` is invoked with the final result.
+RGWMetadataLogInfoCompletion::RGWMetadataLogInfoCompletion(info_callback_t cb)
+  : completion(librados::Rados::aio_create_completion((void *)this,
+                                                      _mdlog_info_completion)),
+    callback(cb)
+{
+}
+
+// Release the underlying librados AioCompletion.
+RGWMetadataLogInfoCompletion::~RGWMetadataLogInfoCompletion()
+{
+  completion->release();
+}
+
+// Asynchronous variant of get_info(): queues the header read and returns.
+// A reference on `completion` is taken here and released by the
+// _mdlog_info_completion trampoline when the operation finishes.
+int RGWMetadataLog::get_info_async(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfoCompletion *completion)
+{
+  string oid;
+  get_shard_oid(shard_id, oid);
+
+  completion->get(); // hold a ref until the completion fires
+
+  return svc.cls->timelog.info_async(dpp, completion->get_io_obj(), oid,
+                                     &completion->get_header(),
+                                     completion->get_completion());
+}
+
+int RGWMetadataLog::trim(const DoutPrefixProvider *dpp, int shard_id, const real_time& from_time, const real_time& end_time,
+                         const string& start_marker, const string& end_marker)
+{
+  string oid;
+  get_shard_oid(shard_id, oid);
+
+  return svc.cls->timelog.trim(dpp, oid, from_time, end_time, start_marker,
+                               end_marker, nullptr, null_yield);
+}
+  
+// Take the cls exclusive lock on one shard for `duration`.
+int RGWMetadataLog::lock_exclusive(const DoutPrefixProvider *dpp, int shard_id, timespan duration, string& zone_id, string& owner_id) {
+  string shard_oid;
+  get_shard_oid(shard_id, shard_oid);
+  return svc.cls->lock.lock_exclusive(dpp, svc.zone->get_zone_params().log_pool, shard_oid, duration, zone_id, owner_id);
+}
+
+// Release the cls exclusive lock taken by lock_exclusive().
+int RGWMetadataLog::unlock(const DoutPrefixProvider *dpp, int shard_id, string& zone_id, string& owner_id) {
+  string shard_oid;
+  get_shard_oid(shard_id, shard_oid);
+  return svc.cls->lock.unlock(dpp, svc.zone->get_zone_params().log_pool, shard_oid, zone_id, owner_id);
+}
+
+// Record that a shard received new entries. Fast path: a read lock to see
+// whether the shard is already marked; only on a miss is the write lock
+// taken. The unlocked window between the check and the insert is benign —
+// set insertion is idempotent.
+void RGWMetadataLog::mark_modified(int shard_id)
+{
+  lock.get_read();
+  if (modified_shards.find(shard_id) != modified_shards.end()) {
+    lock.unlock();
+    return;
+  }
+  lock.unlock();
+
+  std::unique_lock wl{lock};
+  modified_shards.insert(shard_id);
+}
+
+// Hand the set of modified shards to the caller and reset our own set.
+// The clear() after swap() is not redundant: it discards whatever the
+// caller's set previously contained, which swap() moved into
+// modified_shards.
+void RGWMetadataLog::read_clear_modified(set<int> &modified)
+{
+  std::unique_lock wl{lock};
+  modified.swap(modified_shards);
+  modified_shards.clear();
+}
+
+// JSON serialization; last_update is emitted in utime_t form to match
+// decode_json() below.
+void RGWMetadataLogInfo::dump(Formatter *f) const
+{
+  encode_json("marker", marker, f);
+  utime_t ut(last_update);
+  encode_json("last_update", ut, f);
+}
+
+// Inverse of dump(): parse the utime_t form back into a real_time.
+void RGWMetadataLogInfo::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("marker", marker, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("last_update", ut, obj);
+  last_update = ut.to_real_time();
+}
+
diff --git a/src/rgw/driver/rados/rgw_metadata.h b/src/rgw/driver/rados/rgw_metadata.h
new file mode 100644 (file)
index 0000000..7228370
--- /dev/null
@@ -0,0 +1,300 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_METADATA_H
+#define CEPH_RGW_METADATA_H
+
+#include <string>
+#include <utility>
+#include <boost/optional.hpp>
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_period_history.h"
+#include "rgw_mdlog_types.h"
+#include "cls/version/cls_version_types.h"
+#include "cls/log/cls_log_types.h"
+#include "common/RefCountedObj.h"
+#include "common/ceph_time.h"
+#include "services/svc_meta_be.h"
+#include "rgw_sal_fwd.h"
+
+
+class RGWCoroutine;
+class JSONObj;
+struct RGWObjVersionTracker;
+
+struct obj_version;
+
+
+/// Base class for a single piece of metadata as handled by the metadata
+/// machinery: carries its object version, mtime, and (optionally) a
+/// borrowed pointer to its xattrs.
+class RGWMetadataObject {
+protected:
+  obj_version objv;          // version for optimistic concurrency
+  ceph::real_time mtime;     // modification time of the metadata
+  // Non-owning pointer to the attribute map; lifetime is the caller's
+  // responsibility.
+  std::map<std::string, bufferlist> *pattrs{nullptr};
+  
+public:
+  RGWMetadataObject() {}
+  RGWMetadataObject(const obj_version& v,
+                   real_time m) : objv(v), mtime(m) {}
+  virtual ~RGWMetadataObject() {}
+  obj_version& get_version();
+  real_time& get_mtime() { return mtime; }
+  void set_pattrs(std::map<std::string, bufferlist> *_pattrs) {
+    pattrs = _pattrs;
+  }
+  std::map<std::string, bufferlist> *get_pattrs() {
+    return pattrs;
+  }
+
+  /// Dump a subclass-specific JSON representation; no-op by default.
+  virtual void dump(Formatter *f) const {}
+};
+
+class RGWMetadataManager;
+
+/// Abstract per-section metadata handler. Each handler owns one metadata
+/// "type" (section) and implements CRUD, mutation logging, and key listing
+/// for it; handlers register with RGWMetadataManager via attach().
+class RGWMetadataHandler {
+  friend class RGWMetadataManager;
+
+protected:
+  CephContext *cct;
+
+public:
+  RGWMetadataHandler() {}
+  virtual ~RGWMetadataHandler();
+  /// Section name this handler serves (e.g. the metadata type key).
+  virtual std::string get_type() = 0;
+
+  void base_init(CephContext *_cct) {
+    cct = _cct;
+  }
+
+  /// Build a typed metadata object from its JSON form plus version/mtime.
+  virtual RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) = 0;
+
+  virtual int get(std::string& entry, RGWMetadataObject **obj, optional_yield, const DoutPrefixProvider *dpp) = 0;
+  virtual int put(std::string& entry,
+                  RGWMetadataObject *obj,
+                  RGWObjVersionTracker& objv_tracker,
+                  optional_yield, 
+                  const DoutPrefixProvider *dpp,
+                  RGWMDLogSyncType type,
+                  bool from_remote_zone) = 0;
+  virtual int remove(std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp) = 0;
+
+  /// Run f() while recording op_type in the metadata log for this entry.
+  virtual int mutate(const std::string& entry,
+                    const ceph::real_time& mtime,
+                    RGWObjVersionTracker *objv_tracker,
+                     optional_yield y,
+                     const DoutPrefixProvider *dpp,
+                    RGWMDLogStatus op_type,
+                    std::function<int()> f) = 0;
+
+  // Paged key enumeration: init -> next* -> complete.
+  virtual int list_keys_init(const DoutPrefixProvider *dpp, const std::string& marker, void **phandle) = 0;
+  virtual int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated) = 0;
+  virtual void list_keys_complete(void *handle) = 0;
+
+  virtual std::string get_marker(void *handle) = 0;
+
+  // Default: unsharded sections always map to shard 0.
+  virtual int get_shard_id(const std::string& entry, int *shard_id) {
+    *shard_id = 0;
+    return 0;
+  }
+  virtual int attach(RGWMetadataManager *manager);
+};
+
+/// Metadata handler implemented on top of a generic metadata backend
+/// (RGWSI_MetaBackend). Subclasses supply the typed do_get/do_put/do_remove
+/// operations; this class wires them through the backend handler and adds
+/// the sync-mode version arbitration in check_versions().
+class RGWMetadataHandler_GenericMetaBE : public RGWMetadataHandler {
+  friend class RGWSI_MetaBackend;
+  friend class RGWMetadataManager;
+  friend class Put;
+
+public:
+  class Put;
+
+protected:
+  RGWSI_MetaBackend_Handler *be_handler;
+
+  virtual int do_get(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) = 0;
+  virtual int do_put(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWMetadataObject *obj,
+                     RGWObjVersionTracker& objv_tracker, optional_yield y,
+                     const DoutPrefixProvider *dpp, RGWMDLogSyncType type, 
+                     bool from_remote_zone) = 0;
+  virtual int do_put_operate(Put *put_op, const DoutPrefixProvider *dpp);
+  virtual int do_remove(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) = 0;
+
+public:
+  RGWMetadataHandler_GenericMetaBE() {}
+
+  void base_init(CephContext *_cct,
+            RGWSI_MetaBackend_Handler *_be_handler) {
+    RGWMetadataHandler::base_init(_cct);
+    be_handler = _be_handler;
+  }
+
+  RGWSI_MetaBackend_Handler *get_be_handler() {
+    return be_handler;
+  }
+
+  /// State object for one put operation; subclasses override the
+  /// put_pre/put/put_post hooks invoked by do_put_operate().
+  class Put {
+  protected:
+    RGWMetadataHandler_GenericMetaBE *handler;
+    RGWSI_MetaBackend_Handler::Op *op;
+    std::string& entry;
+    RGWMetadataObject *obj;
+    RGWObjVersionTracker& objv_tracker;
+    RGWMDLogSyncType apply_type;
+    optional_yield y;
+    bool from_remote_zone{false};
+
+    // Fetch the current on-disk object for version comparison.
+    int get(RGWMetadataObject **obj, const DoutPrefixProvider *dpp) {
+      return handler->do_get(op, entry, obj, y, dpp);
+    }
+  public:
+    Put(RGWMetadataHandler_GenericMetaBE *_handler, RGWSI_MetaBackend_Handler::Op *_op,
+        std::string& _entry, RGWMetadataObject *_obj,
+        RGWObjVersionTracker& _objv_tracker, optional_yield _y,
+        RGWMDLogSyncType _type, bool from_remote_zone);
+
+    virtual ~Put() {}
+
+    virtual int put_pre(const DoutPrefixProvider *dpp) {
+      return 0;
+    }
+    virtual int put(const DoutPrefixProvider *dpp) {
+      return 0;
+    }
+    virtual int put_post(const DoutPrefixProvider *dpp) {
+      return 0;
+    }
+    virtual int finalize() {
+      return 0;
+    }
+  };
+
+  int get(std::string& entry, RGWMetadataObject **obj, optional_yield, const DoutPrefixProvider *dpp) override;
+  int put(std::string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp, RGWMDLogSyncType type, bool from_remote_zone) override;
+  int remove(std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp) override;
+
+  int mutate(const std::string& entry,
+            const ceph::real_time& mtime,
+            RGWObjVersionTracker *objv_tracker,
+             optional_yield y,
+             const DoutPrefixProvider *dpp,
+            RGWMDLogStatus op_type,
+            std::function<int()> f) override;
+
+  int get_shard_id(const std::string& entry, int *shard_id) override;
+
+  int list_keys_init(const DoutPrefixProvider *dpp, const std::string& marker, void **phandle) override;
+  int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated) override;
+  void list_keys_complete(void *handle) override;
+
+  std::string get_marker(void *handle) override;
+
+  /**
+   * Compare an incoming versus on-disk tag/version+mtime combo against
+   * the sync mode to see if the new one should replace the on-disk one.
+   *
+   * @return true if the update should proceed, false otherwise.
+   */
+  static bool check_versions(bool exists,
+                             const obj_version& ondisk, const real_time& ondisk_time,
+                             const obj_version& incoming, const real_time& incoming_time,
+                             RGWMDLogSyncType sync_mode) {
+    switch (sync_mode) {
+    case APPLY_UPDATES:
+      // Only accept a strictly newer version of the same object lineage
+      // (matching tag).
+      if ((ondisk.tag != incoming.tag) ||
+         (ondisk.ver >= incoming.ver))
+       return false;
+      break;
+    case APPLY_NEWER:
+      if (ondisk_time >= incoming_time)
+       return false;
+      break;
+    case APPLY_EXCLUSIVE:
+      // Only create; never overwrite an existing entry.
+      if (exists)
+        return false;
+      break;
+    case APPLY_ALWAYS: //deliberate fall-thru -- we always apply!
+    default: break;
+    }
+    return true;
+  }
+};
+
+class RGWMetadataTopHandler;
+
+class RGWMetadataManager {
+  friend class RGWMetadataHandler;
+
+  CephContext *cct;
+  RGWSI_Meta *meta_svc;
+  std::map<std::string, RGWMetadataHandler *> handlers;
+  std::unique_ptr<RGWMetadataTopHandler> md_top_handler;
+
+  int find_handler(const std::string& metadata_key, RGWMetadataHandler **handler, std::string& entry);
+  int register_handler(RGWMetadataHandler *handler);
+
+public:
+  RGWMetadataManager(RGWSI_Meta *_meta_svc);
+  ~RGWMetadataManager();
+
+  RGWMetadataHandler *get_handler(const std::string& type);
+
+  int get(std::string& metadata_key, Formatter *f, optional_yield y, const DoutPrefixProvider *dpp);
+  int put(std::string& metadata_key, bufferlist& bl, optional_yield y,
+          const DoutPrefixProvider *dpp,
+          RGWMDLogSyncType sync_mode,
+          bool from_remote_zone,
+          obj_version *existing_version = NULL);
+  int remove(std::string& metadata_key, optional_yield y, const DoutPrefixProvider *dpp);
+
+  int mutate(const std::string& metadata_key,
+            const ceph::real_time& mtime,
+            RGWObjVersionTracker *objv_tracker,
+             optional_yield y,
+             const DoutPrefixProvider *dpp,
+            RGWMDLogStatus op_type,
+            std::function<int()> f);
+
+  int list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, void **phandle);
+  int list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void **phandle);
+  int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated);
+  void list_keys_complete(void *handle);
+
+  std::string get_marker(void *handle);
+
+  void dump_log_entry(cls_log_entry& entry, Formatter *f);
+
+  void get_sections(std::list<std::string>& sections);
+
+  void parse_metadata_key(const std::string& metadata_key, std::string& type, std::string& entry);
+
+  int get_shard_id(const std::string& section, const std::string& key, int *shard_id);
+};
+
+class RGWMetadataHandlerPut_SObj : public RGWMetadataHandler_GenericMetaBE::Put
+{
+protected:
+  std::unique_ptr<RGWMetadataObject> oo;
+  RGWMetadataObject *old_obj{nullptr};
+  bool exists{false};
+
+public:
+  RGWMetadataHandlerPut_SObj(RGWMetadataHandler_GenericMetaBE *handler, RGWSI_MetaBackend_Handler::Op *op,
+                             std::string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
+                            optional_yield y,
+                             RGWMDLogSyncType type, bool from_remote_zone);
+  ~RGWMetadataHandlerPut_SObj();
+
+  int put_pre(const DoutPrefixProvider *dpp) override;
+  int put(const DoutPrefixProvider *dpp) override;
+  virtual int put_check(const DoutPrefixProvider *dpp) {
+    return 0;
+  }
+  virtual int put_checked(const DoutPrefixProvider *dpp);
+  virtual void encode_obj(bufferlist *bl) {}
+};
+
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& key, std::string& name, int *shard_id);
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name);
+void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name);
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_notify.cc b/src/rgw/driver/rados/rgw_notify.cc
new file mode 100644 (file)
index 0000000..253a3bc
--- /dev/null
@@ -0,0 +1,1009 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#include "rgw_notify.h"
+#include "cls/2pc_queue/cls_2pc_queue_client.h"
+#include "cls/lock/cls_lock_client.h"
+#include <memory>
+#include <boost/algorithm/hex.hpp>
+#include <boost/context/protected_fixedsize_stack.hpp>
+#include <spawn/spawn.hpp>
+#include "rgw_sal_rados.h"
+#include "rgw_pubsub.h"
+#include "rgw_pubsub_push.h"
+#include "rgw_perf_counters.h"
+#include "common/dout.h"
+#include <chrono>
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::notify {
+
+struct event_entry_t {
+  rgw_pubsub_s3_event event;
+  std::string push_endpoint;
+  std::string push_endpoint_args;
+  std::string arn_topic;
+  
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(event, bl);
+    encode(push_endpoint, bl);
+    encode(push_endpoint_args, bl);
+    encode(arn_topic, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(event, bl);
+    decode(push_endpoint, bl);
+    decode(push_endpoint_args, bl);
+    decode(arn_topic, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(event_entry_t)
+
+using queues_t = std::set<std::string>;
+
+// use mmap/mprotect to allocate 128k coroutine stacks
+auto make_stack_allocator() {
+  return boost::context::protected_fixedsize_stack{128*1024};
+}
+
+class Manager : public DoutPrefixProvider {
+  const size_t max_queue_size;
+  const uint32_t queues_update_period_ms;
+  const uint32_t queues_update_retry_ms;
+  const uint32_t queue_idle_sleep_us;
+  const utime_t failover_time;
+  CephContext* const cct;
+  librados::IoCtx& rados_ioctx;
+  static constexpr auto COOKIE_LEN = 16;
+  const std::string lock_cookie;
+  boost::asio::io_context io_context;
+  boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work_guard;
+  const uint32_t worker_count;
+  std::vector<std::thread> workers;
+  const uint32_t stale_reservations_period_s;
+  const uint32_t reservations_cleanup_period_s;
+  const std::string Q_LIST_OBJECT_NAME = "queues_list_object";
+
+  CephContext *get_cct() const override { return cct; }
+  unsigned get_subsys() const override { return dout_subsys; }
+  std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw notify: "; }
+
+  // read the list of queues from the queue list object
+  int read_queue_list(queues_t& queues, optional_yield y) {
+    constexpr auto max_chunk = 1024U;
+    std::string start_after;
+    bool more = true;
+    int rval;
+    while (more) {
+      librados::ObjectReadOperation op;
+      queues_t queues_chunk;
+      op.omap_get_keys2(start_after, max_chunk, &queues_chunk, &more, &rval);
+      const auto ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, nullptr, y);
+      if (ret == -ENOENT) {
+        // queue list object was not created - nothing to do
+        return 0;
+      }
+      if (ret < 0) {
+        // TODO: do we need to check on rval as well as ret?
+        ldpp_dout(this, 1) << "ERROR: failed to read queue list. error: " << ret << dendl;
+        return ret;
+      }
+      queues.merge(queues_chunk);
+    }
+    return 0;
+  }
+
+  // set m1 to be the minimum between m1 and m2
+  static int set_min_marker(std::string& m1, const std::string m2) {
+    cls_queue_marker mr1;
+    cls_queue_marker mr2;
+    if (mr1.from_str(m1.c_str()) < 0 || mr2.from_str(m2.c_str()) < 0) {
+      return -EINVAL;
+    }
+    if (mr2.gen <= mr1.gen && mr2.offset < mr1.offset) {
+      m1 = m2;
+    }
+    return 0;
+  }
+
+  using Clock = ceph::coarse_mono_clock;
+  using Executor = boost::asio::io_context::executor_type;
+  using Timer = boost::asio::basic_waitable_timer<Clock,
+        boost::asio::wait_traits<Clock>, Executor>;
+
+  class tokens_waiter {
+    const std::chrono::hours infinite_duration;
+    size_t pending_tokens;
+    Timer timer;
+    struct token {
+      tokens_waiter& waiter;
+      token(tokens_waiter& _waiter) : waiter(_waiter) {
+        ++waiter.pending_tokens;
+      }
+      
+      ~token() {
+        --waiter.pending_tokens;
+        if (waiter.pending_tokens == 0) {
+          waiter.timer.cancel();
+        }   
+      }   
+    };
+  
+  public:
+
+    tokens_waiter(boost::asio::io_context& io_context) :
+      infinite_duration(1000),
+      pending_tokens(0),
+      timer(io_context) {}  
+    void async_wait(yield_context yield) {
+      if (pending_tokens == 0) {
+        return;
+      }
+      timer.expires_from_now(infinite_duration);
+      boost::system::error_code ec; 
+      timer.async_wait(yield[ec]);
+      ceph_assert(ec == boost::system::errc::operation_canceled);
+    }   
+    token make_token() {    
+      return token(*this);
+    }   
+  };
+
+  // processing of a specific entry
+  // return whether processing was successful (true) or not (false)
+  bool process_entry(const cls_queue_entry& entry, yield_context yield) {
+    event_entry_t event_entry;
+    auto iter = entry.data.cbegin();
+    try {
+      decode(event_entry, iter);
+    } catch (buffer::error& err) {
+      ldpp_dout(this, 5) << "WARNING: failed to decode entry. error: " << err.what() << dendl;
+      return false;
+    }
+    try {
+      // TODO move endpoint creation to queue level
+      const auto push_endpoint = RGWPubSubEndpoint::create(event_entry.push_endpoint, event_entry.arn_topic,
+          RGWHTTPArgs(event_entry.push_endpoint_args, this), 
+          cct);
+      ldpp_dout(this, 20) << "INFO: push endpoint created: " << event_entry.push_endpoint <<
+        " for entry: " << entry.marker << dendl;
+      const auto ret = push_endpoint->send_to_completion_async(cct, event_entry.event, optional_yield(io_context, yield));
+      if (ret < 0) {
+        ldpp_dout(this, 5) << "WARNING: push entry: " << entry.marker << " to endpoint: " << event_entry.push_endpoint 
+          << " failed. error: " << ret << " (will retry)" << dendl;
+        return false;
+      } else {
+        ldpp_dout(this, 20) << "INFO: push entry: " << entry.marker << " to endpoint: " << event_entry.push_endpoint 
+          << " ok" <<  dendl;
+        if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok);
+        return true;
+      }
+    } catch (const RGWPubSubEndpoint::configuration_error& e) {
+      ldpp_dout(this, 5) << "WARNING: failed to create push endpoint: " 
+          << event_entry.push_endpoint << " for entry: " << entry.marker << ". error: " << e.what() << " (will retry) " << dendl;
+      return false;
+    }
+  }
+
+  // clean stale reservation from queue
+  void cleanup_queue(const std::string& queue_name, yield_context yield) {
+    while (true) {
+      ldpp_dout(this, 20) << "INFO: trying to perform stale reservation cleanup for queue: " << queue_name << dendl;
+      const auto now = ceph::coarse_real_time::clock::now();
+      const auto stale_time = now - std::chrono::seconds(stale_reservations_period_s);
+      librados::ObjectWriteOperation op;
+      op.assert_exists();
+      rados::cls::lock::assert_locked(&op, queue_name+"_lock", 
+        ClsLockType::EXCLUSIVE,
+        lock_cookie, 
+        "" /*no tag*/);
+      cls_2pc_queue_expire_reservations(op, stale_time);
+      // check ownership and do reservation cleanup in one batch
+      auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield));
+      if (ret == -ENOENT) {
+        // queue was deleted
+        ldpp_dout(this, 5) << "INFO: queue: " 
+          << queue_name << ". was removed. cleanup will stop" << dendl;
+        return;
+      }
+      if (ret == -EBUSY) {
+        ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
+        return;
+      }
+      if (ret < 0) {
+        ldpp_dout(this, 5) << "WARNING: failed to cleanup stale reservation from queue and/or lock queue: " << queue_name
+          << ". error: " << ret << dendl;
+      }
+      Timer timer(io_context);
+      timer.expires_from_now(std::chrono::seconds(reservations_cleanup_period_s));
+      boost::system::error_code ec;
+           timer.async_wait(yield[ec]);
+    }
+  }
+
+  // processing of a specific queue
+  void process_queue(const std::string& queue_name, yield_context yield) {
+    constexpr auto max_elements = 1024;
+    auto is_idle = false;
+    const std::string start_marker;
+
+    // start the cleanup coroutine for the queue
+    spawn::spawn(io_context, [this, queue_name](yield_context yield) {
+            cleanup_queue(queue_name, yield);
+            }, make_stack_allocator());
+    
+    while (true) {
+      // if queue was empty the last time, sleep for idle timeout
+      if (is_idle) {
+        Timer timer(io_context);
+        timer.expires_from_now(std::chrono::microseconds(queue_idle_sleep_us));
+        boost::system::error_code ec;
+             timer.async_wait(yield[ec]);
+      }
+
+      // get list of entries in the queue
+      is_idle = true;
+      bool truncated = false;
+      std::string end_marker;
+      std::vector<cls_queue_entry> entries;
+      auto total_entries = 0U;
+      {
+        librados::ObjectReadOperation op;
+        op.assert_exists();
+        bufferlist obl;
+        int rval;
+        rados::cls::lock::assert_locked(&op, queue_name+"_lock", 
+          ClsLockType::EXCLUSIVE,
+          lock_cookie, 
+          "" /*no tag*/);
+        cls_2pc_queue_list_entries(op, start_marker, max_elements, &obl, &rval);
+        // check ownership and list entries in one batch
+        auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, nullptr, optional_yield(io_context, yield));
+        if (ret == -ENOENT) {
+          // queue was deleted
+          ldpp_dout(this, 5) << "INFO: queue: " 
+            << queue_name << ". was removed. processing will stop" << dendl;
+          return;
+        }
+        if (ret == -EBUSY) {
+          ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
+          return;
+        }
+        if (ret < 0) {
+          ldpp_dout(this, 5) << "WARNING: failed to get list of entries in queue and/or lock queue: " 
+            << queue_name << ". error: " << ret << " (will retry)" << dendl;
+          continue;
+        }
+        ret = cls_2pc_queue_list_entries_result(obl, entries, &truncated, end_marker);
+        if (ret < 0) {
+          ldpp_dout(this, 5) << "WARNING: failed to parse list of entries in queue: " 
+            << queue_name << ". error: " << ret << " (will retry)" << dendl;
+          continue;
+        }
+      }
+      total_entries = entries.size();
+      if (total_entries == 0) {
+        // nothing in the queue
+        continue;
+      }
+      // log when queue is not idle
+      ldpp_dout(this, 20) << "INFO: found: " << total_entries << " entries in: " << queue_name <<
+        ". end marker is: " << end_marker << dendl;
+      
+      is_idle = false;
+      auto has_error = false;
+      auto remove_entries = false;
+      auto entry_idx = 1U;
+      tokens_waiter waiter(io_context);
+      for (auto& entry : entries) {
+        if (has_error) {
+          // bail out on first error
+          break;
+        }
+        // TODO pass entry pointer instead of by-value
+        spawn::spawn(yield, [this, &queue_name, entry_idx, total_entries, &end_marker, &remove_entries, &has_error, &waiter, entry](yield_context yield) {
+            const auto token = waiter.make_token();
+            if (process_entry(entry, yield)) {
+              ldpp_dout(this, 20) << "INFO: processing of entry: " << 
+                entry.marker << " (" << entry_idx << "/" << total_entries << ") from: " << queue_name << " ok" << dendl;
+              remove_entries = true;
+            }  else {
+              if (set_min_marker(end_marker, entry.marker) < 0) {
+                ldpp_dout(this, 1) << "ERROR: cannot determin minimum between malformed markers: " << end_marker << ", " << entry.marker << dendl;
+              } else {
+                ldpp_dout(this, 20) << "INFO: new end marker for removal: " << end_marker << " from: " << queue_name << dendl;
+              }
+              has_error = true;
+              ldpp_dout(this, 20) << "INFO: processing of entry: " << 
+                entry.marker << " (" << entry_idx << "/" << total_entries << ") from: " << queue_name << " failed" << dendl;
+            } 
+        }, make_stack_allocator());
+        ++entry_idx;
+      }
+
+      // wait for all pending work to finish
+      waiter.async_wait(yield);
+
+      // delete all published entries from queue
+      if (remove_entries) {
+        librados::ObjectWriteOperation op;
+        op.assert_exists();
+        rados::cls::lock::assert_locked(&op, queue_name+"_lock", 
+          ClsLockType::EXCLUSIVE,
+          lock_cookie, 
+          "" /*no tag*/);
+        cls_2pc_queue_remove_entries(op, end_marker); 
+        // check ownership and deleted entries in one batch
+        const auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield)); 
+        if (ret == -ENOENT) {
+          // queue was deleted
+          ldpp_dout(this, 5) << "INFO: queue: " 
+            << queue_name << ". was removed. processing will stop" << dendl;
+          return;
+        }
+        if (ret == -EBUSY) {
+          ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
+          return;
+        }
+        if (ret < 0) {
+          ldpp_dout(this, 1) << "ERROR: failed to remove entries and/or lock queue up to: " << end_marker <<  " from queue: " 
+            << queue_name << ". error: " << ret << dendl;
+        } else {
+          ldpp_dout(this, 20) << "INFO: removed entries up to: " << end_marker <<  " from queue: " 
+          << queue_name << dendl;
+        }
+      }
+    }
+  }
+
+  // list of owned queues
+  using owned_queues_t = std::unordered_set<std::string>;
+
+  // process all queues
+  // find which of the queues is owned by this daemon and process it
+  void process_queues(yield_context yield) {
+    auto has_error = false;
+    owned_queues_t owned_queues;
+
+    // add randomness to the duration between queue checking
+    // to make sure that different daemons are not synced
+    std::random_device seed;
+    std::mt19937 rnd_gen(seed());
+    const auto min_jitter = 100; // ms
+    const auto max_jitter = 500; // ms
+    std::uniform_int_distribution<> duration_jitter(min_jitter, max_jitter);
+
+    std::vector<std::string> queue_gc;
+    std::mutex queue_gc_lock;
+    while (true) {
+      Timer timer(io_context);
+      const auto duration = (has_error ? 
+        std::chrono::milliseconds(queues_update_retry_ms) : std::chrono::milliseconds(queues_update_period_ms)) + 
+        std::chrono::milliseconds(duration_jitter(rnd_gen));
+      timer.expires_from_now(duration);
+      const auto tp = ceph::coarse_real_time::clock::to_time_t(ceph::coarse_real_time::clock::now() + duration);
+      ldpp_dout(this, 20) << "INFO: next queues processing will happen at: " << std::ctime(&tp)  << dendl;
+      boost::system::error_code ec;
+      timer.async_wait(yield[ec]);
+
+      queues_t queues;
+      auto ret = read_queue_list(queues, optional_yield(io_context, yield));
+      if (ret < 0) {
+        has_error = true;
+        continue;
+      }
+
+      for (const auto& queue_name : queues) {
+        // try to lock the queue to check if it is owned by this rgw
+        // or if ownership needs to be taken
+        librados::ObjectWriteOperation op;
+        op.assert_exists();
+        rados::cls::lock::lock(&op, queue_name+"_lock", 
+              ClsLockType::EXCLUSIVE,
+              lock_cookie, 
+              "" /*no tag*/,
+              "" /*no description*/,
+              failover_time,
+              LOCK_FLAG_MAY_RENEW);
+
+        ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield));
+        if (ret == -EBUSY) {
+          // lock is already taken by another RGW
+          ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " owned (locked) by another daemon" << dendl;
+          // if queue was owned by this RGW, processing should be stopped, queue would be deleted from list afterwards
+          continue;
+        }
+        if (ret == -ENOENT) {
+          // queue is deleted - processing will stop the next time we try to read from the queue
+          ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " should not be locked - already deleted" << dendl;
+          continue;
+        }
+        if (ret < 0) {
+          // failed to lock for another reason, continue to process other queues
+          ldpp_dout(this, 1) << "ERROR: failed to lock queue: " << queue_name << ". error: " << ret << dendl;
+          has_error = true;
+          continue;
+        }
+        // add queue to list of owned queues
+        if (owned_queues.insert(queue_name).second) {
+          ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " now owned (locked) by this daemon" << dendl;
+          // start processing this queue
+          spawn::spawn(io_context, [this, &queue_gc, &queue_gc_lock, queue_name](yield_context yield) {
+            process_queue(queue_name, yield);
+            // if queue processing ended, it means that the queue was removed or not owned anymore
+            // mark it for deletion
+            std::lock_guard lock_guard(queue_gc_lock);
+            queue_gc.push_back(queue_name);
+            ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " marked for removal" << dendl;
+          }, make_stack_allocator());
+        } else {
+          ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " ownership (lock) renewed" << dendl;
+        }
+      }
+      // erase all queues that were deleted
+      {
+        std::lock_guard lock_guard(queue_gc_lock);
+        std::for_each(queue_gc.begin(), queue_gc.end(), [this, &owned_queues](const std::string& queue_name) {
+          owned_queues.erase(queue_name);
+          ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " removed" << dendl;
+        });
+        queue_gc.clear();
+      }
+    }
+  }
+
+public:
+
+  ~Manager() {
+    work_guard.reset();
+    io_context.stop();
+    std::for_each(workers.begin(), workers.end(), [] (auto& worker) { worker.join(); });
+  }
+
+  // ctor: start all threads
+  Manager(CephContext* _cct, uint32_t _max_queue_size, uint32_t _queues_update_period_ms, 
+          uint32_t _queues_update_retry_ms, uint32_t _queue_idle_sleep_us, u_int32_t failover_time_ms, 
+          uint32_t _stale_reservations_period_s, uint32_t _reservations_cleanup_period_s,
+          uint32_t _worker_count, rgw::sal::RadosStore* store) :
+    max_queue_size(_max_queue_size),
+    queues_update_period_ms(_queues_update_period_ms),
+    queues_update_retry_ms(_queues_update_retry_ms),
+    queue_idle_sleep_us(_queue_idle_sleep_us),
+    failover_time(std::chrono::milliseconds(failover_time_ms)),
+    cct(_cct),
+    rados_ioctx(store->getRados()->get_notif_pool_ctx()),
+    lock_cookie(gen_rand_alphanumeric(cct, COOKIE_LEN)),
+    work_guard(boost::asio::make_work_guard(io_context)),
+    worker_count(_worker_count),
+    stale_reservations_period_s(_stale_reservations_period_s),
+    reservations_cleanup_period_s(_reservations_cleanup_period_s)
+    {
+      spawn::spawn(io_context, [this] (yield_context yield) {
+            process_queues(yield);
+          }, make_stack_allocator());
+
+      // start the worker threads to do the actual queue processing
+      const std::string WORKER_THREAD_NAME = "notif-worker";
+      for (auto worker_id = 0U; worker_id < worker_count; ++worker_id) {
+        workers.emplace_back([this]() {
+          try {
+            io_context.run(); 
+          } catch (const std::exception& err) {
+            ldpp_dout(this, 10) << "Notification worker failed with error: " << err.what() << dendl;
+            throw(err);
+          }
+        });
+        const auto rc = ceph_pthread_setname(workers.back().native_handle(), 
+          (WORKER_THREAD_NAME+std::to_string(worker_id)).c_str());
+        ceph_assert(rc == 0);
+      }
+      ldpp_dout(this, 10) << "Started notification manager with: " << worker_count << " workers" << dendl;
+    }
+
+  int add_persistent_topic(const std::string& topic_name, optional_yield y) {
+    if (topic_name == Q_LIST_OBJECT_NAME) {
+      ldpp_dout(this, 1) << "ERROR: topic name cannot be: " << Q_LIST_OBJECT_NAME << " (conflict with queue list object name)" << dendl;
+      return -EINVAL;
+    }
+    librados::ObjectWriteOperation op;
+    op.create(true);
+    cls_2pc_queue_init(op, topic_name, max_queue_size);
+    auto ret = rgw_rados_operate(this, rados_ioctx, topic_name, &op, y);
+    if (ret == -EEXIST) {
+      // queue already exists - nothing to do
+      ldpp_dout(this, 20) << "INFO: queue for topic: " << topic_name << " already exists. nothing to do" << dendl;
+      return 0;
+    }
+    if (ret < 0) {
+      // failed to create queue
+      ldpp_dout(this, 1) << "ERROR: failed to create queue for topic: " << topic_name << ". error: " << ret << dendl;
+      return ret;
+    }
+   
+    bufferlist empty_bl;
+    std::map<std::string, bufferlist> new_topic{{topic_name, empty_bl}};
+    op.omap_set(new_topic);
+    ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, y);
+    if (ret < 0) {
+      ldpp_dout(this, 1) << "ERROR: failed to add queue: " << topic_name << " to queue list. error: " << ret << dendl;
+      return ret;
+    } 
+    ldpp_dout(this, 20) << "INFO: queue: " << topic_name << " added to queue list"  << dendl;
+    return 0;
+  }
+  
+  int remove_persistent_topic(const std::string& topic_name, optional_yield y) {
+    librados::ObjectWriteOperation op;
+    op.remove();
+    auto ret = rgw_rados_operate(this, rados_ioctx, topic_name, &op, y);
+    if (ret == -ENOENT) {
+      // queue already removed - nothing to do
+      ldpp_dout(this, 20) << "INFO: queue for topic: " << topic_name << " already removed. nothing to do" << dendl;
+      return 0;
+    }
+    if (ret < 0) {
+      // failed to remove queue
+      ldpp_dout(this, 1) << "ERROR: failed to remove queue for topic: " << topic_name << ". error: " << ret << dendl;
+      return ret;
+    }
+  
+    std::set<std::string> topic_to_remove{{topic_name}};
+    op.omap_rm_keys(topic_to_remove);
+    ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, y);
+    if (ret < 0) {
+      ldpp_dout(this, 1) << "ERROR: failed to remove queue: " << topic_name << " from queue list. error: " << ret << dendl;
+      return ret;
+    } 
+    ldpp_dout(this, 20) << "INFO: queue: " << topic_name << " removed from queue list"  << dendl;
+    return 0;
+  }
+};
+
+// singleton manager
+// note that the manager itself is not a singleton, and multiple instances may co-exist
+// TODO make the pointer atomic in allocation and deallocation to avoid race conditions
+static Manager* s_manager = nullptr;
+
+constexpr size_t MAX_QUEUE_SIZE = 128*1000*1000; // 128MB
+constexpr uint32_t Q_LIST_UPDATE_MSEC = 1000*30;     // check queue list every 30 seconds
+constexpr uint32_t Q_LIST_RETRY_MSEC = 1000;         // retry every second if queue list update failed
+constexpr uint32_t IDLE_TIMEOUT_USEC = 100*1000;     // idle sleep 100ms
+constexpr uint32_t FAILOVER_TIME_MSEC = 3*Q_LIST_UPDATE_MSEC; // FAILOVER TIME 3x renew time
+constexpr uint32_t WORKER_COUNT = 1;                 // 1 worker thread
+constexpr uint32_t STALE_RESERVATIONS_PERIOD_S = 120;   // cleanup reservations that are more than 2 minutes old
+constexpr uint32_t RESERVATIONS_CLEANUP_PERIOD_S = 30; // reservation cleanup every 30 seconds
+
+bool init(CephContext* cct, rgw::sal::RadosStore* store, const DoutPrefixProvider *dpp) {
+  if (s_manager) {
+    return false;
+  }
+  // TODO: take conf from CephContext
+  s_manager = new Manager(cct, MAX_QUEUE_SIZE, 
+      Q_LIST_UPDATE_MSEC, Q_LIST_RETRY_MSEC, 
+      IDLE_TIMEOUT_USEC, FAILOVER_TIME_MSEC, 
+      STALE_RESERVATIONS_PERIOD_S, RESERVATIONS_CLEANUP_PERIOD_S,
+      WORKER_COUNT,
+      store);
+  return true;
+}
+
+void shutdown() {
+  delete s_manager;
+  s_manager = nullptr;
+}
+
+int add_persistent_topic(const std::string& topic_name, optional_yield y) {
+  if (!s_manager) {
+    return -EAGAIN;
+  }
+  return s_manager->add_persistent_topic(topic_name, y);
+}
+
+int remove_persistent_topic(const std::string& topic_name, optional_yield y) {
+  if (!s_manager) {
+    return -EAGAIN;
+  }
+  return s_manager->remove_persistent_topic(topic_name, y);
+}
+
+rgw::sal::Object* get_object_with_atttributes(
+  const reservation_t& res, rgw::sal::Object* obj) {
+  // in case of copy obj, the tags and metadata are taken from source
+  const auto src_obj = res.src_object ? res.src_object : obj;
+  if (src_obj->get_attrs().empty()) {
+    if (!src_obj->get_bucket()) {
+      src_obj->set_bucket(res.bucket);
+    }
+    const auto ret = src_obj->get_obj_attrs(res.yield, res.dpp);
+    if (ret < 0) {
+      ldpp_dout(res.dpp, 20) << "failed to get attributes from object: " << 
+        src_obj->get_key() << ". ret = " << ret << dendl;
+      return nullptr;
+    }
+  }
+  return src_obj;
+}
+
+static inline void metadata_from_attributes(
+  reservation_t& res, rgw::sal::Object* obj) {
+  auto& metadata = res.x_meta_map;
+  const auto src_obj = get_object_with_atttributes(res, obj);
+  if (!src_obj) {
+    return;
+  }
+  res.metadata_fetched_from_attributes = true;
+  for (auto& attr : src_obj->get_attrs()) {
+    if (boost::algorithm::starts_with(attr.first, RGW_ATTR_META_PREFIX)) {
+      std::string_view key(attr.first);
+      key.remove_prefix(sizeof(RGW_ATTR_PREFIX)-1);
+      // we want to pass a null terminated version
+      // of the bufferlist, hence "to_str().c_str()"
+      metadata.emplace(key, attr.second.to_str().c_str());
+    }
+  }
+}
+
+static inline void tags_from_attributes(
+  const reservation_t& res, rgw::sal::Object* obj, KeyMultiValueMap& tags) {
+  const auto src_obj = get_object_with_atttributes(res, obj);
+  if (!src_obj) {
+    return;
+  }
+  const auto& attrs = src_obj->get_attrs();
+  const auto attr_iter = attrs.find(RGW_ATTR_TAGS);
+  if (attr_iter != attrs.end()) {
+    auto bliter = attr_iter->second.cbegin();
+    RGWObjTags obj_tags;
+    try {
+      ::decode(obj_tags, bliter);
+    } catch(buffer::error&) {
+      // not able to decode tags
+      return;
+    }
+    tags = std::move(obj_tags.get_tags());
+  }
+}
+
+// populate event from request
+// Fills every field of the S3 event record from the reservation and the
+// object that triggered it: identity, bucket/object descriptors, a
+// hex-encoded timestamp sequencer, and the metadata/tags (fetched from
+// the object's attributes when not already cached on the reservation).
+// configurationId and opaque_data are left for the caller to fill from
+// the notification/topic configuration.
+static inline void populate_event(reservation_t& res,
+        rgw::sal::Object* obj,
+        uint64_t size,
+        const ceph::real_time& mtime, 
+        const std::string& etag, 
+        const std::string& version, 
+        EventType event_type,
+        rgw_pubsub_s3_event& event) {
+  event.eventTime = mtime;
+  event.eventName = to_event_string(event_type);
+  event.userIdentity = res.user_id;    // user that triggered the change
+  event.x_amz_request_id = res.req_id; // request ID of the original change
+  event.x_amz_id_2 = res.store->getRados()->host_id; // RGW on which the change was made
+  // configurationId is filled from notification configuration
+  event.bucket_name = res.bucket->get_name();
+  // empty owner string when the bucket has no owner set
+  event.bucket_ownerIdentity = res.bucket->get_owner() ? res.bucket->get_owner()->get_id().id : "";
+  const auto region = res.store->get_zone()->get_zonegroup().get_api_name();
+  rgw::ARN bucket_arn(res.bucket->get_key());
+  bucket_arn.region = region; 
+  event.bucket_arn = to_string(bucket_arn);
+  // explicit object name (e.g. multipart meta object) wins over obj's name
+  event.object_key = res.object_name ? *res.object_name : obj->get_name();
+  event.object_size = size;
+  event.object_etag = etag;
+  event.object_versionId = version;
+  event.awsRegion = region;
+  // use timestamp as per key sequence id (hex encoded)
+  const utime_t ts(real_clock::now());
+  boost::algorithm::hex((const char*)&ts, (const char*)&ts + sizeof(utime_t), 
+          std::back_inserter(event.object_sequencer));
+  set_event_id(event.id, etag, ts);
+  event.bucket_id = res.bucket->get_bucket_id();
+  // pass meta data
+  if (!res.metadata_fetched_from_attributes) {
+    // either no metadata exist or no metadata filter was used
+    metadata_from_attributes(res, obj);
+  }
+  event.x_meta_map = res.x_meta_map;
+  // pass tags
+  if (!res.tagset ||
+      (*res.tagset).get_tags().empty()) {
+    // try to fetch the tags from the attributes
+    tags_from_attributes(res, obj, event.tags);
+  } else {
+    event.tags = (*res.tagset).get_tags();
+  }
+  // opaque data will be filled from topic configuration
+}
+
+// Decide whether a notification configured by 'filter' applies to this
+// change: the event type, object key, metadata and tags must all pass
+// the respective S3 filters. Metadata and tags are only resolved (from
+// the request, the cached req_state, or the object attributes — in that
+// order) when the corresponding filter is non-empty.
+static inline bool notification_match(reservation_t& res,
+                                     const rgw_pubsub_topic_filter& filter,
+                                     EventType event,
+                                     const RGWObjTags* req_tags) {
+  if (!match(filter.events, event)) { 
+    return false;
+  }
+  const auto obj = res.object;
+  if (!match(filter.s3_filter.key_filter, 
+        res.object_name ? *res.object_name : obj->get_name())) {
+    return false;
+  }
+
+  if (!filter.s3_filter.metadata_filter.kv.empty()) {
+    // metadata filter exists
+    if (res.s) {
+      // start from the request's metadata when available
+      res.x_meta_map = res.s->info.x_meta_map;
+    }
+    // merge in metadata stored on the object's attributes
+    metadata_from_attributes(res, obj);
+    if (!match(filter.s3_filter.metadata_filter, res.x_meta_map)) {
+      return false;
+    }
+  }
+
+  if (!filter.s3_filter.tag_filter.kv.empty()) {
+    // tag filter exists
+    if (req_tags) {
+      // tags in the request
+      if (!match(filter.s3_filter.tag_filter, req_tags->get_tags())) {
+        return false;
+      }
+    } else if (res.tagset && !(*res.tagset).get_tags().empty()) {
+      // tags were cached in req_state
+      if (!match(filter.s3_filter.tag_filter, (*res.tagset).get_tags())) {
+        return false;
+      }
+    } else {
+      // try to fetch tags from the attributes
+      KeyMultiValueMap tags;
+      tags_from_attributes(res, obj, tags);
+      if (!match(filter.s3_filter.tag_filter, tags)) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Phase one of notification delivery: for every bucket topic whose filter
+// matches the event, record a pending entry in res.topics. For persistent
+// topics a fixed-size slot is reserved on the topic's 2pc queue (the
+// reservation id is committed/aborted later); -ERR_RATE_LIMITED is
+// returned when the queue is full so the client backs off.
+// Returns 0 on success or a negative error code.
+  int publish_reserve(const DoutPrefixProvider* dpp,
+                     EventType event_type,
+                     reservation_t& res,
+                     const RGWObjTags* req_tags)
+{
+  RGWPubSub ps(res.store, res.user_tenant);
+  RGWPubSub::Bucket ps_bucket(&ps, res.bucket->get_key());
+  rgw_pubsub_bucket_topics bucket_topics;
+  auto rc = ps_bucket.get_topics(&bucket_topics);
+  if (rc < 0) {
+    // failed to fetch bucket topics
+    return rc;
+  }
+  for (const auto& bucket_topic : bucket_topics.topics) {
+    const rgw_pubsub_topic_filter& topic_filter = bucket_topic.second;
+    const rgw_pubsub_topic& topic_cfg = topic_filter.topic;
+    if (!notification_match(res, topic_filter, event_type, req_tags)) {
+      // notification does not apply to req_state
+      continue;
+    }
+    ldpp_dout(res.dpp, 20) << "INFO: notification: '" << topic_filter.s3_id <<
+        "' on topic: '" << topic_cfg.dest.arn_topic << 
+        "' and bucket: '" << res.bucket->get_name() <<
+        "' (unique topic: '" << topic_cfg.name <<
+        "') apply to event of type: '" << to_string(event_type) << "'" << dendl;
+
+    // stays NO_ID for non-persistent topics (direct push, nothing queued)
+    cls_2pc_reservation::id_t res_id;
+    if (topic_cfg.dest.persistent) {
+      // TODO: take default reservation size from conf
+      constexpr auto DEFAULT_RESERVATION = 4*1024U; // 4K
+      res.size = DEFAULT_RESERVATION;
+      librados::ObjectWriteOperation op;
+      bufferlist obl;
+      int rval;
+      const auto& queue_name = topic_cfg.dest.arn_topic;
+      cls_2pc_queue_reserve(op, res.size, 1, &obl, &rval);
+      auto ret = rgw_rados_operate(
+       res.dpp, res.store->getRados()->get_notif_pool_ctx(),
+       queue_name, &op, res.yield, librados::OPERATION_RETURNVEC);
+      if (ret < 0) {
+        ldpp_dout(res.dpp, 1) <<
+         "ERROR: failed to reserve notification on queue: "
+                             << queue_name << ". error: " << ret << dendl;
+        // if no space is left in queue we ask client to slow down
+        return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret;
+      }
+      ret = cls_2pc_queue_reserve_result(obl, res_id);
+      if (ret < 0) {
+        ldpp_dout(res.dpp, 1) << "ERROR: failed to parse reservation id. error: " << ret << dendl;
+        return ret;
+      }
+    }
+    res.topics.emplace_back(topic_filter.s3_id, topic_cfg, res_id);
+  }
+  return 0;
+}
+
+// Phase two of notification delivery: for every topic recorded by
+// publish_reserve, build the event and either commit it to the topic's
+// persistent 2pc queue or push it synchronously to the endpoint.
+// If the encoded event outgrew the reserved slot, the old reservation is
+// aborted and a larger one is made before committing. res_id is reset to
+// NO_ID once a persistent entry is committed so the reservation_t dtor
+// does not abort it. Returns 0 on success or a negative error code.
+int publish_commit(rgw::sal::Object* obj,
+                  uint64_t size,
+                  const ceph::real_time& mtime,
+                  const std::string& etag,
+                  const std::string& version,
+                  EventType event_type,
+                  reservation_t& res,
+                  const DoutPrefixProvider* dpp)
+{
+  for (auto& topic : res.topics) {
+    if (topic.cfg.dest.persistent &&
+       topic.res_id == cls_2pc_reservation::NO_ID) {
+      // nothing to commit or already committed/aborted
+      continue;
+    }
+    event_entry_t event_entry;
+    populate_event(res, obj, size, mtime, etag, version, event_type, event_entry.event);
+    event_entry.event.configurationId = topic.configurationId;
+    event_entry.event.opaque_data = topic.cfg.opaque_data;
+    if (topic.cfg.dest.persistent) { 
+      // NOTE(review): topic.cfg is declared const in reservation_t, so
+      // these std::move calls degrade to copies — harmless but misleading
+      event_entry.push_endpoint = std::move(topic.cfg.dest.push_endpoint);
+      event_entry.push_endpoint_args =
+       std::move(topic.cfg.dest.push_endpoint_args);
+      event_entry.arn_topic = topic.cfg.dest.arn_topic;
+      bufferlist bl;
+      encode(event_entry, bl);
+      const auto& queue_name = topic.cfg.dest.arn_topic;
+      if (bl.length() > res.size) {
+        // try to make a larger reservation, fail only if this is not possible
+        ldpp_dout(dpp, 5) << "WARNING: committed size: " << bl.length()
+                         << " exceeded reserved size: " << res.size
+                         <<
+          " . trying to make a larger reservation on queue:" << queue_name
+                         << dendl;
+        // first cancel the existing reservation
+        librados::ObjectWriteOperation op;
+        cls_2pc_queue_abort(op, topic.res_id);
+        auto ret = rgw_rados_operate(
+         dpp, res.store->getRados()->get_notif_pool_ctx(),
+         topic.cfg.dest.arn_topic, &op,
+         res.yield);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << "ERROR: failed to abort reservation: "
+                           << topic.res_id << 
+            " when trying to make a larger reservation on queue: " << queue_name
+                           << ". error: " << ret << dendl;
+          return ret;
+        }
+        // now try to make a bigger one
+        // NOTE(review): 'op' is reused for the reserve after the abort was
+        // submitted — presumably rgw_rados_operate consumed the queued ops;
+        // confirm, otherwise a fresh ObjectWriteOperation is needed here
+       buffer::list obl;
+        int rval;
+        cls_2pc_queue_reserve(op, bl.length(), 1, &obl, &rval);
+        ret = rgw_rados_operate(
+         dpp, res.store->getRados()->get_notif_pool_ctx(),
+          queue_name, &op, res.yield, librados::OPERATION_RETURNVEC);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << "ERROR: failed to reserve extra space on queue: "
+                           << queue_name
+                           << ". error: " << ret << dendl;
+          return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret;
+        }
+        ret = cls_2pc_queue_reserve_result(obl, topic.res_id);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << "ERROR: failed to parse reservation id for "
+           "extra space. error: " << ret << dendl;
+          return ret;
+        }
+      }
+      std::vector<buffer::list> bl_data_vec{std::move(bl)};
+      librados::ObjectWriteOperation op;
+      cls_2pc_queue_commit(op, bl_data_vec, topic.res_id);
+      const auto ret = rgw_rados_operate(
+       dpp, res.store->getRados()->get_notif_pool_ctx(),
+       queue_name, &op, res.yield);
+      // reservation is spent either way; prevent a double abort in the dtor
+      topic.res_id = cls_2pc_reservation::NO_ID;
+      if (ret < 0) {
+        ldpp_dout(dpp, 1) << "ERROR: failed to commit reservation to queue: "
+                         << queue_name << ". error: " << ret
+                         << dendl;
+        return ret;
+      }
+    } else {
+      try {
+        // TODO add endpoint LRU cache
+        const auto push_endpoint = RGWPubSubEndpoint::create(
+         topic.cfg.dest.push_endpoint,
+         topic.cfg.dest.arn_topic,
+         RGWHTTPArgs(topic.cfg.dest.push_endpoint_args, dpp),
+         dpp->get_cct());
+        ldpp_dout(res.dpp, 20) << "INFO: push endpoint created: "
+                              << topic.cfg.dest.push_endpoint << dendl;
+        const auto ret = push_endpoint->send_to_completion_async(
+         dpp->get_cct(), event_entry.event, res.yield);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << "ERROR: push to endpoint "
+                           << topic.cfg.dest.push_endpoint
+                           << " failed. error: " << ret << dendl;
+          if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed);
+          return ret;
+        }
+        if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok);
+      } catch (const RGWPubSubEndpoint::configuration_error& e) {
+        ldpp_dout(dpp, 1) << "ERROR: failed to create push endpoint: " 
+            << topic.cfg.dest.push_endpoint << ". error: " << e.what() << dendl;
+        if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed);
+        return -EINVAL;
+      }
+    }
+  }
+  return 0;
+}
+
+// Cancel every still-pending persistent reservation recorded on 'res'
+// by issuing a 2pc abort on the topic's queue, then reset its res_id so
+// a later call (e.g. from the reservation_t dtor) is a no-op.
+// Stops and returns the error on the first failed abort; 0 otherwise.
+int publish_abort(reservation_t& res) {
+  for (auto& topic : res.topics) {
+    if (!topic.cfg.dest.persistent ||
+       topic.res_id == cls_2pc_reservation::NO_ID) {
+      // nothing to abort or already committed/aborted
+      continue;
+    }
+    const auto& queue_name = topic.cfg.dest.arn_topic;
+    librados::ObjectWriteOperation op;
+    cls_2pc_queue_abort(op, topic.res_id);
+    const auto ret = rgw_rados_operate(
+      res.dpp, res.store->getRados()->get_notif_pool_ctx(),
+      queue_name, &op, res.yield);
+    if (ret < 0) {
+      ldpp_dout(res.dpp, 1) << "ERROR: failed to abort reservation: "
+                           << topic.res_id <<
+        " from queue: " << queue_name << ". error: " << ret << dendl;
+      return ret;
+    }
+    topic.res_id = cls_2pc_reservation::NO_ID;
+  }
+  return 0;
+}
+
+// Request-path constructor: caches identity, metadata and tag state out
+// of the req_state so the notification can be built after the request
+// objects are gone.
+// NOTE(review): dpp is initialized from _s rather than _dpp, and _dpp is
+// otherwise unused — this compiles only if req_state is itself a
+// DoutPrefixProvider; presumably intentional, but confirm (otherwise
+// this should read dpp(_dpp)).
+reservation_t::reservation_t(const DoutPrefixProvider* _dpp,
+                            rgw::sal::RadosStore* _store,
+                            const req_state* _s,
+                            rgw::sal::Object* _object,
+                            rgw::sal::Object* _src_object,
+                            const std::string* _object_name) :
+  dpp(_s), store(_store), s(_s), size(0) /* XXX */,
+  object(_object), src_object(_src_object), bucket(_s->bucket.get()),
+  object_name(_object_name),
+  tagset(_s->tagset),
+  x_meta_map(_s->info.x_meta_map),
+  metadata_fetched_from_attributes(false),
+  user_id(_s->user->get_id().id),
+  user_tenant(_s->user->get_id().tenant),
+  req_id(_s->req_id),
+  yield(_s->yield)
+{}
+
+// Non-request constructor (e.g. lifecycle): no req_state is available,
+// so identity and request id are passed explicitly; s/object_name stay
+// null and tagset/x_meta_map start empty (filled lazily from the
+// object's attributes when a filter needs them).
+reservation_t::reservation_t(const DoutPrefixProvider* _dpp,
+                            rgw::sal::RadosStore* _store,
+                            rgw::sal::Object* _object,
+                            rgw::sal::Object* _src_object,
+                            rgw::sal::Bucket* _bucket,
+                            const std::string& _user_id,
+                            const std::string& _user_tenant,
+                            const std::string& _req_id,
+                            optional_yield y) :
+    dpp(_dpp), store(_store), s(nullptr), size(0) /* XXX */,
+    object(_object), src_object(_src_object), bucket(_bucket),
+    object_name(nullptr),
+    metadata_fetched_from_attributes(false),
+    user_id(_user_id),
+    user_tenant(_user_tenant),
+    req_id(_req_id),
+    yield(y)
+{}
+
+// RAII guard: abort any reservations that were never committed/aborted
+// (publish_abort skips topics whose res_id is already NO_ID).
+reservation_t::~reservation_t() {
+  publish_abort(*this);
+}
+
+} // namespace rgw::notify
diff --git a/src/rgw/driver/rados/rgw_notify.h b/src/rgw/driver/rados/rgw_notify.h
new file mode 100644 (file)
index 0000000..175dc11
--- /dev/null
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include "common/ceph_time.h"
+#include "include/common_fwd.h"
+#include "rgw_notify_event_type.h"
+#include "common/async/yield_context.h"
+#include "cls/2pc_queue/cls_2pc_queue_types.h"
+#include "rgw_pubsub.h"
+
+// forward declarations
+namespace rgw::sal {
+    class RadosStore;
+    class RGWObject;
+}
+
+class RGWRados;
+struct rgw_obj_key;
+
+namespace rgw::notify {
+
+// initialize the notification manager
+// notification manager is dequeing the 2-phase-commit queues
+// and send the notifications to the endpoints
+bool init(CephContext* cct, rgw::sal::RadosStore* store, const DoutPrefixProvider *dpp);
+
+// shutdown the notification manager
+void shutdown();
+
+// create persistent delivery queue for a topic (endpoint)
+// this operation also add a topic name to the common (to all RGWs) list of all topics
+int add_persistent_topic(const std::string& topic_name, optional_yield y);
+
+// remove persistent delivery queue for a topic (endpoint)
+// this operation also remove the topic name from the common (to all RGWs) list of all topics
+int remove_persistent_topic(const std::string& topic_name, optional_yield y);
+
+// struct holding reservation information
+// populated in the publish_reserve call
+// then used to commit or abort the reservation
+struct reservation_t {
+  // one matched topic plus its (optional) 2pc queue reservation
+  struct topic_t {
+    topic_t(const std::string& _configurationId, const rgw_pubsub_topic& _cfg,
+           cls_2pc_reservation::id_t _res_id) :
+      configurationId(_configurationId), cfg(_cfg), res_id(_res_id) {}
+
+    const std::string configurationId;
+    const rgw_pubsub_topic cfg;
+    // res_id is reset after topic is committed/aborted
+    cls_2pc_reservation::id_t res_id;
+  };
+
+  const DoutPrefixProvider* const dpp;
+  std::vector<topic_t> topics;   // topics matched by publish_reserve
+  rgw::sal::RadosStore* const store;
+  const req_state* const s;      // null for non-request callers
+  size_t size;                   // bytes reserved on the 2pc queue
+  rgw::sal::Object* const object;
+  rgw::sal::Object* const src_object; // may differ from object
+  rgw::sal::Bucket* const bucket;
+  const std::string* const object_name; // overrides object->get_name() if set
+  boost::optional<const RGWObjTags&> tagset;
+  meta_map_t x_meta_map; // metadata cached by value
+  bool metadata_fetched_from_attributes; // avoids a second attr fetch
+  const std::string user_id;
+  const std::string user_tenant;
+  const std::string req_id;
+  optional_yield yield;
+
+  /* ctor for rgw_op callers */
+  reservation_t(const DoutPrefixProvider* _dpp,
+               rgw::sal::RadosStore* _store,
+               const req_state* _s,
+               rgw::sal::Object* _object,
+               rgw::sal::Object* _src_object,
+               const std::string* _object_name);
+
+  /* ctor for non-request caller (e.g., lifecycle) */
+  reservation_t(const DoutPrefixProvider* _dpp,
+               rgw::sal::RadosStore* _store,
+               rgw::sal::Object* _object,
+               rgw::sal::Object* _src_object,
+               rgw::sal::Bucket* _bucket,
+               const std::string& _user_id,
+               const std::string& _user_tenant,
+               const std::string& _req_id,
+               optional_yield y);
+
+  // dtor doing resource leak guarding
+  // aborting the reservation if not already committed or aborted
+  ~reservation_t();
+};
+
+// create a reservation on the 2-phase-commit queue
+  int publish_reserve(const DoutPrefixProvider *dpp,
+                     EventType event_type,
+                     reservation_t& reservation,
+                     const RGWObjTags* req_tags);
+
+// commit the reservation to the queue
+int publish_commit(rgw::sal::Object* obj,
+        uint64_t size,
+        const ceph::real_time& mtime, 
+        const std::string& etag, 
+        const std::string& version,
+        EventType event_type,
+        reservation_t& reservation,
+        const DoutPrefixProvider *dpp);
+
+// cancel the reservation
+int publish_abort(reservation_t& reservation);
+
+}
+
diff --git a/src/rgw/driver/rados/rgw_obj_manifest.cc b/src/rgw/driver/rados/rgw_obj_manifest.cc
new file mode 100644 (file)
index 0000000..3838f5c
--- /dev/null
@@ -0,0 +1,404 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_obj_manifest.h"
+
+#include "services/svc_zone.h"
+#include "rgw_rados.h"
+#include "rgw_bucket.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Advance the generator to offset 'ofs' (monotonically increasing only):
+// updates head size / object size on the manifest, recomputes the current
+// stripe within the current part, and resolves the location of the object
+// that holds 'ofs' into cur_obj. Returns -EINVAL when asked to go
+// backwards, 0 otherwise.
+int RGWObjManifest::generator::create_next(uint64_t ofs)
+{
+  if (ofs < last_ofs) /* only going forward */
+    return -EINVAL;
+
+  uint64_t max_head_size = manifest->get_max_head_size();
+
+  if (ofs < max_head_size) {
+    manifest->set_head_size(ofs);
+  }
+
+  if (ofs >= max_head_size) {
+    manifest->set_head_size(max_head_size);
+    cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
+    cur_stripe_size = rule.stripe_max_size;
+
+    // the head object counts as stripe 0 of part 0, so tail stripes
+    // of the first part are shifted by one
+    if (cur_part_id == 0 && max_head_size > 0) {
+      cur_stripe++;
+    }
+  }
+
+  last_ofs = ofs;
+  manifest->set_obj_size(ofs);
+
+  manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
+
+  return 0;
+}
+
+// Append manifest 'm' to this one. If either side already uses explicit
+// objects the merge falls back to append_explicit; otherwise the rule
+// lists are merged: when the last rule of *this and the first rule of 'm'
+// are compatible (same part size, stripe size, prefix and contiguous part
+// numbering) they coalesce, else m's rules are copied over shifted by this
+// manifest's size. Always grows obj_size by m.obj_size. Returns 0.
+int RGWObjManifest::append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup,
+                           const RGWZoneParams& zone_params)
+{
+  if (explicit_objs || m.explicit_objs) {
+    return append_explicit(dpp, m, zonegroup, zone_params);
+  }
+
+  if (rules.empty()) {
+    *this = m;
+    return 0;
+  }
+
+  // NOTE(review): this local is assigned below but never read afterwards —
+  // appears to be dead; the per-rule next_rule_prefix handles the override
+  string override_prefix;
+
+  if (prefix.empty()) {
+    prefix = m.prefix;
+  }
+
+  if (prefix != m.prefix) {
+    override_prefix = m.prefix;
+  }
+
+  map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
+  if (miter == m.rules.end()) {
+    return append_explicit(dpp, m, zonegroup, zone_params);
+  }
+
+  for (; miter != m.rules.end(); ++miter) {
+    map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
+
+    RGWObjManifestRule& rule = last_rule->second;
+
+    // materialize open-ended part sizes before comparing
+    if (rule.part_size == 0) {
+      rule.part_size = obj_size - rule.start_ofs;
+    }
+
+    RGWObjManifestRule& next_rule = miter->second;
+    if (!next_rule.part_size) {
+      next_rule.part_size = m.obj_size - next_rule.start_ofs;
+    }
+
+    string rule_prefix = prefix;
+    if (!rule.override_prefix.empty()) {
+      rule_prefix = rule.override_prefix;
+    }
+
+    string next_rule_prefix = m.prefix;
+    if (!next_rule.override_prefix.empty()) {
+      next_rule_prefix = next_rule.override_prefix;
+    }
+
+    // incompatible layouts: copy the remaining rules verbatim (shifted)
+    if (rule.part_size != next_rule.part_size ||
+        rule.stripe_max_size != next_rule.stripe_max_size ||
+        rule_prefix != next_rule_prefix) {
+      if (next_rule_prefix != prefix) {
+        append_rules(m, miter, &next_rule_prefix);
+      } else {
+        append_rules(m, miter, NULL);
+      }
+      break;
+    }
+
+    // compatible layout: coalesce only if part numbering is contiguous
+    uint64_t expected_part_num = rule.start_part_num + 1;
+    if (rule.part_size > 0) {
+      expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
+    }
+
+    if (expected_part_num != next_rule.start_part_num) {
+      append_rules(m, miter, NULL);
+      break;
+    }
+  }
+
+  set_obj_size(obj_size + m.obj_size);
+
+  return 0;
+}
+
+void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
+                                  string *override_prefix)
+{
+  for (; miter != m.rules.end(); ++miter) {
+    RGWObjManifestRule rule = miter->second;
+    rule.start_ofs += obj_size;
+    if (override_prefix)
+      rule.override_prefix = *override_prefix;
+    rules[rule.start_ofs] = rule;
+  }
+}
+
+// Convert a rule-based manifest into an explicit per-stripe object map:
+// walk the implicit iterator, record each stripe's location and size in
+// 'objs', then drop the rules/prefix and mark the manifest explicit.
+// No-op if already explicit.
+void RGWObjManifest::convert_to_explicit(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
+{
+  if (explicit_objs) {
+    return;
+  }
+  obj_iterator iter = obj_begin(dpp);
+
+  while (iter != obj_end(dpp)) {
+    RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
+    const rgw_obj_select& os = iter.get_location();
+    const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
+    part.loc_ofs = 0;
+
+    uint64_t ofs = iter.get_stripe_ofs();
+
+    if (ofs == 0) {
+      // stripe 0 is the head object itself
+      part.loc = obj;
+    } else {
+      RGWSI_Tier_RADOS::raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
+    }
+    ++iter;
+    // stripe size = distance to the next stripe (end iterator's offset
+    // closes the last stripe)
+    uint64_t next_ofs = iter.get_stripe_ofs();
+
+    part.size = next_ofs - ofs;
+  }
+
+  explicit_objs = true;
+  rules.clear();
+  prefix.clear();
+}
+
+// Append 'm' to this manifest in explicit form: both manifests are first
+// converted to explicit stripe maps, then m's parts are inserted with
+// their offsets shifted by this manifest's current size. Returns 0.
+int RGWObjManifest::append_explicit(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
+{
+  if (!explicit_objs) {
+    convert_to_explicit(dpp, zonegroup, zone_params);
+  }
+  if (!m.explicit_objs) {
+    m.convert_to_explicit(dpp, zonegroup, zone_params);
+  }
+  map<uint64_t, RGWObjManifestPart>::iterator iter;
+  uint64_t base = obj_size;
+  for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
+    RGWObjManifestPart& part = iter->second;
+    objs[base + iter->first] = part;
+  }
+  obj_size += m.obj_size;
+
+  return 0;
+}
+
+// Look up the manifest rule governing offset 'ofs': the rule with the
+// greatest start offset not exceeding 'ofs' (or the very first rule when
+// 'ofs' precedes all of them). Returns false when no rules exist.
+bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
+{
+  if (rules.empty()) {
+    return false;
+  }
+
+  auto candidate = rules.upper_bound(ofs);
+  if (candidate != rules.begin()) {
+    --candidate;
+  }
+
+  *rule = candidate->second;
+  return true;
+}
+
+// Initialize the generator for manifest '_m': set head/tail placement
+// (tail inherits from head when a tail rule is given), generate a random
+// tail prefix if none exists, load rule 0, size the first stripe from the
+// head size (or the rule's stripe size when headless) and resolve the
+// initial object location. Returns -EIO if rule 0 is missing, 0 otherwise.
+int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m,
+                                            const rgw_placement_rule& head_placement_rule,
+                                            const rgw_placement_rule *tail_placement_rule,
+                                            const rgw_bucket& _b, const rgw_obj& _obj)
+{
+  manifest = _m;
+
+  if (!tail_placement_rule) {
+    manifest->set_tail_placement(head_placement_rule, _b);
+  } else {
+    rgw_placement_rule new_tail_rule = *tail_placement_rule;
+    new_tail_rule.inherit_from(head_placement_rule);
+    manifest->set_tail_placement(new_tail_rule, _b);
+  }
+
+  manifest->set_head(head_placement_rule, _obj, 0);
+  last_ofs = 0;
+
+  if (manifest->get_prefix().empty()) {
+    // random alphanumeric prefix for the tail objects, e.g. ".<rand>_"
+    char buf[33];
+    gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
+
+    string oid_prefix = ".";
+    oid_prefix.append(buf);
+    oid_prefix.append("_");
+
+    manifest->set_prefix(oid_prefix);
+  }
+
+  bool found = manifest->get_rule(0, &rule);
+  if (!found) {
+    derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
+    return -EIO;
+  }
+
+  uint64_t head_size = manifest->get_head_size();
+
+  if (head_size > 0) {
+    cur_stripe_size = head_size;
+  } else {
+    cur_stripe_size = rule.stripe_max_size;
+  }
+  
+  cur_part_id = rule.start_part_num;
+
+  manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
+
+  // Normal object which not generated through copy operation 
+  manifest->set_tail_instance(_obj.key.instance);
+
+  return 0;
+}
+
+// Produce sample instances for encode/decode round-trip tests:
+// one default-constructed part and one fully-populated part.
+void RGWObjManifestPart::generate_test_instances(std::list<RGWObjManifestPart*>& o)
+{
+  o.push_back(new RGWObjManifestPart);
+
+  RGWObjManifestPart *p = new RGWObjManifestPart;
+  rgw_bucket b;
+  init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12");
+
+  p->loc = rgw_obj(b, "object");
+  p->loc_ofs = 512 * 1024;
+  p->size = 128 * 1024;
+  o.push_back(p);
+}
+
+// Produce sample instances for encode/decode round-trip tests: an
+// explicit 10-part manifest (512 KiB per part) and a default instance.
+void RGWObjManifest::generate_test_instances(std::list<RGWObjManifest*>& o)
+{
+  RGWObjManifest *m = new RGWObjManifest;
+  map<uint64_t, RGWObjManifestPart> objs;
+  uint64_t total_size = 0;
+  for (int i = 0; i<10; i++) {
+    RGWObjManifestPart p;
+    rgw_bucket b;
+    init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12");
+    p.loc = rgw_obj(b, "object");
+    p.loc_ofs = 0;
+    p.size = 512 * 1024;
+    total_size += p.size;
+    // NOTE(review): parts are keyed by the cumulative END offset here
+    // (first key is 512K, not 0) — fine for round-trip tests, but does
+    // not mirror how real manifests key parts by start offset
+    objs[total_size] = p;
+  }
+  m->set_explicit(total_size, objs);
+  o.push_back(m);
+  o.push_back(new RGWObjManifest);
+}
+
+// Serialize this part: its location object, offset within it, and size.
+// Field order defines the output format, so it must not change.
+void RGWObjManifestPart::dump(Formatter *f) const
+{
+  f->open_object_section("loc");
+  loc.dump(f);
+  f->close_section();
+  f->dump_unsigned("loc_ofs", loc_ofs);
+  f->dump_unsigned("size", size);
+}
+
+// Serialize the iterator's current position (offsets, part/stripe
+// indices, prefix override and resolved location) for debugging/JSON.
+void RGWObjManifest::obj_iterator::dump(Formatter *f) const
+{
+  f->dump_unsigned("part_ofs", part_ofs);
+  f->dump_unsigned("stripe_ofs", stripe_ofs);
+  f->dump_unsigned("ofs", ofs);
+  f->dump_unsigned("stripe_size", stripe_size);
+  f->dump_int("cur_part_id", cur_part_id);
+  f->dump_int("cur_stripe", cur_stripe);
+  f->dump_string("cur_override_prefix", cur_override_prefix);
+  f->dump_object("location", location);
+}
+
+// Serialize the whole manifest: the explicit parts array, sizes, rules,
+// placement data, and the begin/end iterator positions.
+void RGWObjManifest::dump(Formatter *f) const
+{
+  map<uint64_t, RGWObjManifestPart>::const_iterator iter = objs.begin();
+  f->open_array_section("objs");
+  for (; iter != objs.end(); ++iter) {
+    f->dump_unsigned("ofs", iter->first);
+    f->open_object_section("part");
+    iter->second.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+  f->dump_unsigned("obj_size", obj_size);
+  ::encode_json("explicit_objs", explicit_objs, f);
+  ::encode_json("head_size", head_size, f);
+  ::encode_json("max_head_size", max_head_size, f);
+  ::encode_json("prefix", prefix, f);
+  ::encode_json("rules", rules, f);
+  ::encode_json("tail_instance", tail_instance, f);
+  ::encode_json("tail_placement", tail_placement, f);
+
+  // nullptr being passed into iterators since there
+  // is no cct and we aren't doing anything with these
+  // iterators that would write do the log
+  f->dump_object("begin_iter", obj_begin(nullptr));
+  f->dump_object("end_iter", obj_end(nullptr));
+}
+
+// Serialize one manifest rule (part numbering, offsets, sizes, prefix).
+void RGWObjManifestRule::dump(Formatter *f) const
+{
+  encode_json("start_part_num", start_part_num, f);
+  encode_json("start_ofs", start_ofs, f);
+  encode_json("part_size", part_size, f);
+  encode_json("stripe_max_size", stripe_max_size, f);
+  encode_json("override_prefix", override_prefix, f);
+}
+
+// Serialize the selector; both obj and raw_obj are dumped, with is_raw
+// indicating which one is the active representation.
+void rgw_obj_select::dump(Formatter *f) const
+{
+  f->dump_string("placement_rule", placement_rule.to_str());
+  f->dump_object("obj", obj);
+  f->dump_object("raw_obj", raw_obj);
+  f->dump_bool("is_raw", is_raw);
+}
+
+// Serialize cloud-tier transition info for this object.
+void RGWObjTier::dump(Formatter *f) const
+{
+  encode_json("name", name, f);
+  encode_json("tier_placement", tier_placement, f);
+  encode_json("is_multipart_upload", is_multipart_upload, f);
+}
+
+// returns true on success, false on failure
+static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
+                                  const rgw_placement_rule& head_placement_rule,
+                                  const rgw_obj& obj, rgw_pool *pool)
+{
+  if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
+    RGWZonePlacementInfo placement;
+    if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
+      return false;
+    }
+
+    if (!obj.in_extra_data) {
+      *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
+    } else {
+      *pool = placement.get_data_extra_pool();
+    }
+  }
+
+  return true;
+}
+
+static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
+                           const rgw_placement_rule& head_placement_rule,
+                           const rgw_obj& obj, rgw_raw_obj *raw_obj)
+{
+  get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
+
+  return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
+}
+
+// Return the raw object for this selector: the stored raw_obj when the
+// selector holds one, otherwise the logical obj converted via
+// rgw_obj_to_raw under the selector's placement rule.
+rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
+{
+  if (is_raw) {
+    return raw_obj;
+  }
+  rgw_raw_obj converted;
+  rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &converted);
+  return converted;
+}
+
+// returns true on success, false on failure
+// Thin wrapper: resolve the data pool using this store's own zonegroup
+// and zone parameters.
+bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
+{
+  return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
+}
+
diff --git a/src/rgw/driver/rados/rgw_obj_manifest.h b/src/rgw/driver/rados/rgw_obj_manifest.h
new file mode 100644 (file)
index 0000000..ac73359
--- /dev/null
@@ -0,0 +1,609 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_common.h"
+#include "rgw_compression_types.h"
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+
+class RGWSI_Zone;
+struct RGWZoneGroup;
+struct RGWZoneParams;
+class RGWRados;
+namespace rgw { namespace sal {
+  class RadosStore;
+} };
+
+// Holds either a logical rgw_obj or a raw rgw_raw_obj, letting manifest
+// code defer the logical->raw translation until the placement is known.
+// 'is_raw' selects which of obj/raw_obj is the active member.
+class rgw_obj_select {
+  rgw_placement_rule placement_rule;
+  rgw_obj obj;
+  rgw_raw_obj raw_obj;
+  bool is_raw;
+
+public:
+  rgw_obj_select() : is_raw(false) {}
+  explicit rgw_obj_select(const rgw_obj& _obj) : obj(_obj), is_raw(false) {}
+  explicit rgw_obj_select(const rgw_raw_obj& _raw_obj) : raw_obj(_raw_obj), is_raw(true) {}
+  rgw_obj_select(const rgw_obj_select& rhs) {
+    placement_rule = rhs.placement_rule;
+    is_raw = rhs.is_raw;
+    // only the active member is copied; the inactive one stays
+    // default-constructed (the implicitly generated copy-assignment
+    // copies both, which is harmless)
+    if (is_raw) {
+      raw_obj = rhs.raw_obj;
+    } else {
+      obj = rhs.obj;
+    }
+  }
+
+  // resolve to a raw rados object (defined in rgw_obj_manifest.cc)
+  rgw_raw_obj get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const;
+  rgw_raw_obj get_raw_obj(rgw::sal::RadosStore* store) const;
+
+  // assigning an obj of either kind switches the active member; any
+  // placement rule set earlier is kept
+  rgw_obj_select& operator=(const rgw_obj& rhs) {
+    obj = rhs;
+    is_raw = false;
+    return *this;
+  }
+
+  rgw_obj_select& operator=(const rgw_raw_obj& rhs) {
+    raw_obj = rhs;
+    is_raw = true;
+    return *this;
+  }
+
+  void set_placement_rule(const rgw_placement_rule& rule) {
+    placement_rule = rule;
+  }
+  void dump(Formatter *f) const;
+};
+
+// One entry of an old-style "explicit objs" manifest: which rados object a
+// byte range of the rgw object lives in, and where inside it.
+struct RGWObjManifestPart {
+  rgw_obj loc;   /* the object where the data is located */
+  uint64_t loc_ofs;  /* the offset at that object where the data is located */
+  uint64_t size;     /* the part size */
+
+  RGWObjManifestPart() : loc_ofs(0), size(0) {}
+
+  // versioned wire encoding -- field order must never change
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 2, bl);
+    encode(loc, bl);
+    encode(loc_ofs, bl);
+    encode(size, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+     DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl);
+     decode(loc, bl);
+     decode(loc_ofs, bl);
+     decode(size, bl);
+     DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<RGWObjManifestPart*>& o);
+};
+WRITE_CLASS_ENCODER(RGWObjManifestPart)
+
+/*
+ The manifest defines a set of rules for structuring the object parts.
+ There are a few terms to note:
+     - head: the head part of the object, which is the part that contains
+       the first chunk of data. An object might not have a head (as in the
+       case of multipart-part objects).
+     - stripe: data portion of a single rgw object that resides on a single
+       rados object.
+     - part: a collection of stripes that make a contiguous part of an
+       object. A regular object will only have one part (although might have
+       many stripes), a multipart object might have many parts. Each part
+       has a fixed stripe size, although the last stripe of a part might
+       be smaller than that. Consecutive parts may be merged if their stripe
+       size is the same.
+*/
+
+// A rule describing how parts/stripes are laid out starting at a given
+// part number and offset.  A manifest typically holds one rule; multipart
+// objects may carry more.
+struct RGWObjManifestRule {
+  uint32_t start_part_num;
+  uint64_t start_ofs;
+  uint64_t part_size; /* each part size, 0 if there's no part size, meaning it's unlimited */
+  uint64_t stripe_max_size; /* underlying obj max size */
+  std::string override_prefix;
+
+  RGWObjManifestRule() : start_part_num(0), start_ofs(0), part_size(0), stripe_max_size(0) {}
+  RGWObjManifestRule(uint32_t _start_part_num, uint64_t _start_ofs, uint64_t _part_size, uint64_t _stripe_max_size) :
+                       start_part_num(_start_part_num), start_ofs(_start_ofs), part_size(_part_size), stripe_max_size(_stripe_max_size) {}
+
+  // versioned wire encoding -- field order must never change
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(start_part_num, bl);
+    encode(start_ofs, bl);
+    encode(part_size, bl);
+    encode(stripe_max_size, bl);
+    encode(override_prefix, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(start_part_num, bl);
+    decode(start_ofs, bl);
+    decode(part_size, bl);
+    decode(stripe_max_size, bl);
+    // override_prefix was added in encoding version 2
+    if (struct_v >= 2)
+      decode(override_prefix, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWObjManifestRule)
+
+// Cloud-tier configuration attached to a manifest when an object has been
+// transitioned to a remote tier (currently only "cloud-s3" is used; see
+// RGWObjManifest::set_tier_type()).
+struct RGWObjTier {
+    std::string name;
+    RGWZoneGroupPlacementTier tier_placement;
+    bool is_multipart_upload{false};
+
+    RGWObjTier(): name("none") {}
+
+    // versioned wire encoding -- field order must never change
+    void encode(bufferlist& bl) const {
+      ENCODE_START(2, 2, bl);
+      encode(name, bl);
+      encode(tier_placement, bl);
+      encode(is_multipart_upload, bl);
+      ENCODE_FINISH(bl);
+    }
+
+    void decode(bufferlist::const_iterator& bl) {
+      DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+      decode(name, bl);
+      decode(tier_placement, bl);
+      decode(is_multipart_upload, bl);
+      DECODE_FINISH(bl);
+    }
+    void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWObjTier)
+
+class RGWObjManifest {
+protected:
+  bool explicit_objs{false}; /* really old manifest? */
+  std::map<uint64_t, RGWObjManifestPart> objs;
+
+  uint64_t obj_size{0};
+
+  rgw_obj obj;
+  uint64_t head_size{0};
+  rgw_placement_rule head_placement_rule;
+
+  uint64_t max_head_size{0};
+  std::string prefix;
+  rgw_bucket_placement tail_placement; /* might be different than the original bucket,
+                                       as object might have been copied across pools */
+  std::map<uint64_t, RGWObjManifestRule> rules;
+
+  std::string tail_instance; /* tail object's instance */
+
+  std::string tier_type;
+  RGWObjTier tier_config;
+
+  void convert_to_explicit(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params);
+  int append_explicit(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params);
+  void append_rules(RGWObjManifest& m, std::map<uint64_t, RGWObjManifestRule>::iterator& iter, std::string *override_prefix);
+
+public:
+
+  RGWObjManifest() = default;
+  RGWObjManifest(const RGWObjManifest& rhs) {
+    *this = rhs;
+  }
+  RGWObjManifest& operator=(const RGWObjManifest& rhs) {
+    explicit_objs = rhs.explicit_objs;
+    objs = rhs.objs;
+    obj_size = rhs.obj_size;
+    obj = rhs.obj;
+    head_size = rhs.head_size;
+    max_head_size = rhs.max_head_size;
+    prefix = rhs.prefix;
+    tail_placement = rhs.tail_placement;
+    rules = rhs.rules;
+    tail_instance = rhs.tail_instance;
+    tier_type = rhs.tier_type;
+    tier_config = rhs.tier_config;
+    return *this;
+  }
+
+  std::map<uint64_t, RGWObjManifestPart>& get_explicit_objs() {
+    return objs;
+  }
+
+
+  void set_explicit(uint64_t _size, std::map<uint64_t, RGWObjManifestPart>& _objs) {
+    explicit_objs = true;
+    objs.swap(_objs);
+    set_obj_size(_size);
+  }
+
+  void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs,
+                             std::string *override_prefix, rgw_obj_select *location) const;
+
+  void set_trivial_rule(uint64_t tail_ofs, uint64_t stripe_max_size) {
+    RGWObjManifestRule rule(0, tail_ofs, 0, stripe_max_size);
+    rules[0] = rule;
+    max_head_size = tail_ofs;
+  }
+
+  void set_multipart_part_rule(uint64_t stripe_max_size, uint64_t part_num) {
+    RGWObjManifestRule rule(0, 0, 0, stripe_max_size);
+    rule.start_part_num = part_num;
+    rules[0] = rule;
+    max_head_size = 0;
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(8, 6, bl);
+    encode(obj_size, bl);
+    encode(objs, bl);
+    encode(explicit_objs, bl);
+    encode(obj, bl);
+    encode(head_size, bl);
+    encode(max_head_size, bl);
+    encode(prefix, bl);
+    encode(rules, bl);
+    bool encode_tail_bucket = !(tail_placement.bucket == obj.bucket);
+    encode(encode_tail_bucket, bl);
+    if (encode_tail_bucket) {
+      encode(tail_placement.bucket, bl);
+    }
+    bool encode_tail_instance = (tail_instance != obj.key.instance);
+    encode(encode_tail_instance, bl);
+    if (encode_tail_instance) {
+      encode(tail_instance, bl);
+    }
+    encode(head_placement_rule, bl);
+    encode(tail_placement.placement_rule, bl);
+    encode(tier_type, bl);
+    encode(tier_config, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN_32(7, 2, 2, bl);
+    decode(obj_size, bl);
+    decode(objs, bl);
+    if (struct_v >= 3) {
+      decode(explicit_objs, bl);
+      decode(obj, bl);
+      decode(head_size, bl);
+      decode(max_head_size, bl);
+      decode(prefix, bl);
+      decode(rules, bl);
+    } else {
+      explicit_objs = true;
+      if (!objs.empty()) {
+        std::map<uint64_t, RGWObjManifestPart>::iterator iter = objs.begin();
+        obj = iter->second.loc;
+        head_size = iter->second.size;
+        max_head_size = head_size;
+      }
+    }
+
+    if (explicit_objs && head_size > 0 && !objs.empty()) {
+      /* patch up manifest due to issue 16435:
+       * the first object in the explicit objs list might not be the one we need to access, use the
+       * head object instead if set. This would happen if we had an old object that was created
+       * when the explicit objs manifest was around, and it got copied.
+       */
+      rgw_obj& obj_0 = objs[0].loc;
+      if (!obj_0.get_oid().empty() && obj_0.key.ns.empty()) {
+        objs[0].loc = obj;
+        objs[0].size = head_size;
+      }
+    }
+
+    if (struct_v >= 4) {
+      if (struct_v < 6) {
+        decode(tail_placement.bucket, bl);
+      } else {
+        bool need_to_decode;
+        decode(need_to_decode, bl);
+        if (need_to_decode) {
+          decode(tail_placement.bucket, bl);
+        } else {
+          tail_placement.bucket = obj.bucket;
+        }
+      }
+    }
+
+    if (struct_v >= 5) {
+      if (struct_v < 6) {
+        decode(tail_instance, bl);
+      } else {
+        bool need_to_decode;
+        decode(need_to_decode, bl);
+        if (need_to_decode) {
+          decode(tail_instance, bl);
+        } else {
+          tail_instance = obj.key.instance;
+        }
+      }
+    } else { // old object created before 'tail_instance' field added to manifest
+      tail_instance = obj.key.instance;
+    }
+
+    if (struct_v >= 7) {
+      decode(head_placement_rule, bl);
+      decode(tail_placement.placement_rule, bl);
+    }
+
+    if (struct_v >= 8) {
+      decode(tier_type, bl);
+      decode(tier_config, bl);
+    }
+
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<RGWObjManifest*>& o);
+
+  int append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup,
+             const RGWZoneParams& zone_params);
+
+  bool get_rule(uint64_t ofs, RGWObjManifestRule *rule);
+
+  bool empty() const {
+    if (explicit_objs)
+      return objs.empty();
+    return rules.empty();
+  }
+
+  bool has_explicit_objs() const {
+    return explicit_objs;
+  }
+
+  bool has_tail() const {
+    if (explicit_objs) {
+      if (objs.size() == 1) {
+        auto iter = objs.begin();
+        const rgw_obj& o = iter->second.loc;
+        return !(obj == o);
+      }
+      return (objs.size() >= 2);
+    }
+    return (obj_size > head_size);
+  }
+
+  void set_head(const rgw_placement_rule& placement_rule, const rgw_obj& _o, uint64_t _s) {
+    head_placement_rule = placement_rule;
+    obj = _o;
+    head_size = _s;
+
+    if (explicit_objs && head_size > 0) {
+      objs[0].loc = obj;
+      objs[0].size = head_size;
+    }
+  }
+
+  const rgw_obj& get_obj() const {
+    return obj;
+  }
+
+  void set_tail_placement(const rgw_placement_rule& placement_rule, const rgw_bucket& _b) {
+    tail_placement.placement_rule = placement_rule;
+    tail_placement.bucket = _b;
+  }
+
+  const rgw_bucket_placement& get_tail_placement() const {
+    return tail_placement;
+  }
+
+  const rgw_placement_rule& get_head_placement_rule() const {
+    return head_placement_rule;
+  }
+
+  void set_prefix(const std::string& _p) {
+    prefix = _p;
+  }
+
+  const std::string& get_prefix() const {
+    return prefix;
+  }
+
+  void set_tail_instance(const std::string& _ti) {
+    tail_instance = _ti;
+  }
+
+  const std::string& get_tail_instance() const {
+    return tail_instance;
+  }
+
+  void set_head_size(uint64_t _s) {
+    head_size = _s;
+  }
+
+  void set_obj_size(uint64_t s) {
+    obj_size = s;
+  }
+
+  uint64_t get_obj_size() const {
+    return obj_size;
+  }
+
+  uint64_t get_head_size() const {
+    return head_size;
+  }
+
+  uint64_t get_max_head_size() const {
+    return max_head_size;
+  }
+
+  const std::string& get_tier_type() {
+      return tier_type;
+  }
+
+  inline void set_tier_type(std::string value) {
+      /* Only "cloud-s3" tier-type is supported for now */
+      if (value == "cloud-s3") {
+        tier_type = value;
+      }
+  }
+
+  inline void set_tier_config(RGWObjTier t) {
+      /* Set only if tier_type set to "cloud-s3" */
+      if (tier_type != "cloud-s3")
+        return;
+
+      tier_config.name = t.name;
+      tier_config.tier_placement = t.tier_placement;
+      tier_config.is_multipart_upload = t.is_multipart_upload;
+  }
+
+  inline const void get_tier_config(RGWObjTier* t) {
+      if (tier_type != "cloud-s3")
+        return;
+
+      t->name = tier_config.name;
+      t->tier_placement = tier_config.tier_placement;
+      t->is_multipart_upload = tier_config.is_multipart_upload;
+  }
+
+  class obj_iterator {
+    const DoutPrefixProvider *dpp;
+    const RGWObjManifest *manifest = nullptr;
+    uint64_t part_ofs = 0;   /* where current part starts */
+    uint64_t stripe_ofs = 0; /* where current stripe starts */
+    uint64_t ofs = 0;        /* current position within the object */
+    uint64_t stripe_size = 0;      /* current part size */
+
+    int cur_part_id = 0;
+    int cur_stripe = 0;
+    std::string cur_override_prefix;
+
+    rgw_obj_select location;
+
+    std::map<uint64_t, RGWObjManifestRule>::const_iterator rule_iter;
+    std::map<uint64_t, RGWObjManifestRule>::const_iterator next_rule_iter;
+    std::map<uint64_t, RGWObjManifestPart>::const_iterator explicit_iter;
+
+    void update_explicit_pos();
+
+  public:
+    obj_iterator() = default;
+    explicit obj_iterator(const DoutPrefixProvider *_dpp, const RGWObjManifest *_m)
+      : obj_iterator(_dpp, _m, 0)
+    {}
+    obj_iterator(const DoutPrefixProvider *_dpp, const RGWObjManifest *_m, uint64_t _ofs) : dpp(_dpp), manifest(_m) {
+      seek(_ofs);
+    }
+    void seek(uint64_t ofs);
+
+    void operator++();
+    bool operator==(const obj_iterator& rhs) const {
+      return (ofs == rhs.ofs);
+    }
+    bool operator!=(const obj_iterator& rhs) const {
+      return (ofs != rhs.ofs);
+    }
+    const rgw_obj_select& get_location() {
+      return location;
+    }
+
+    /* where current part starts */
+    uint64_t get_part_ofs() const {
+      return part_ofs;
+    }
+
+    /* start of current stripe */
+    uint64_t get_stripe_ofs() {
+      if (manifest->explicit_objs) {
+        return explicit_iter->first;
+      }
+      return stripe_ofs;
+    }
+
+    /* current ofs relative to start of rgw object */
+    uint64_t get_ofs() const {
+      return ofs;
+    }
+
+    /* stripe number */
+    int get_cur_stripe() const {
+      return cur_stripe;
+    }
+
+    /* current stripe size */
+    uint64_t get_stripe_size() {
+      if (manifest->explicit_objs) {
+        return explicit_iter->second.size;
+      }
+      return stripe_size;
+    }
+
+    /* offset where data starts within current stripe */
+    uint64_t location_ofs() {
+      if (manifest->explicit_objs) {
+        return explicit_iter->second.loc_ofs;
+      }
+      return 0; /* all stripes start at zero offset */
+    }
+
+    void update_location();
+
+    void dump(Formatter *f) const;
+  }; // class obj_iterator
+
+  obj_iterator obj_begin(const DoutPrefixProvider *dpp) const { return obj_iterator{dpp, this}; }
+  obj_iterator obj_end(const DoutPrefixProvider *dpp) const { return obj_iterator{dpp, this, obj_size}; }
+  obj_iterator obj_find(const DoutPrefixProvider *dpp, uint64_t ofs) const {
+    return obj_iterator{dpp, this, std::min(ofs, obj_size)};
+  }
+
+  /*
+   * simple object generator. Using a simple single rule manifest.
+   */
+  class generator {
+    RGWObjManifest *manifest;
+    uint64_t last_ofs;
+    uint64_t cur_part_ofs;
+    int cur_part_id;
+    int cur_stripe;
+    uint64_t cur_stripe_size;
+    std::string cur_oid;
+    
+    std::string oid_prefix;
+
+    rgw_obj_select cur_obj;
+
+    RGWObjManifestRule rule;
+
+  public:
+    generator() : manifest(NULL), last_ofs(0), cur_part_ofs(0), cur_part_id(0), 
+                 cur_stripe(0), cur_stripe_size(0) {}
+    int create_begin(CephContext *cct, RGWObjManifest *manifest,
+                     const rgw_placement_rule& head_placement_rule,
+                     const rgw_placement_rule *tail_placement_rule,
+                     const rgw_bucket& bucket,
+                     const rgw_obj& obj);
+
+    int create_next(uint64_t ofs);
+
+    rgw_raw_obj get_cur_obj(RGWZoneGroup& zonegroup, RGWZoneParams& zone_params) { return cur_obj.get_raw_obj(zonegroup, zone_params); }
+    rgw_raw_obj get_cur_obj(rgw::sal::RadosStore* store) const { return cur_obj.get_raw_obj(store); }
+
+    /* total max size of current stripe (including head obj) */
+    uint64_t cur_stripe_max_size() const {
+      return cur_stripe_size;
+    }
+  };
+};
+WRITE_CLASS_ENCODER(RGWObjManifest)
diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.cc b/src/rgw/driver/rados/rgw_object_expirer_core.cc
new file mode 100644 (file)
index 0000000..ec1bf3f
--- /dev/null
@@ -0,0 +1,442 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+
+#include "auth/Crypto.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "global/global_init.h"
+
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_log.h"
+#include "rgw_formats.h"
+#include "rgw_usage.h"
+#include "rgw_object_expirer_core.h"
+#include "rgw_zone.h"
+#include "rgw_sal_rados.h"
+
+#include "services/svc_rados.h"
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_bi_rados.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/timeindex/cls_timeindex_client.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// name of the cls_lock lock used to serialize per-shard hint processing
+static string objexp_lock_name = "gc_process";
+
+// Build the rados object name of an objexp hint shard:
+// "obj_delete_at_hint." followed by the zero-padded 10-digit shard number.
+static string objexp_hint_get_shardname(int shard_num)
+{
+  // 19-char prefix + 10 digits + NUL fits comfortably in 32 bytes
+  char name[32];
+  snprintf(name, sizeof(name), "obj_delete_at_hint.%010u",
+           static_cast<unsigned>(shard_num));
+  return string(name);
+}
+
+// Map an object index key onto one of the objexp hint shards, reusing the
+// bucket-index sharding hash for an even spread.
+static int objexp_key_shard(const rgw_obj_index_key& key, int num_shards)
+{
+  const string hash_key = key.name + key.instance;
+  return RGWSI_BucketIndex_RADOS::bucket_shard_index(hash_key, num_shards);
+}
+
+static string objexp_hint_get_keyext(const string& tenant_name,
+                                     const string& bucket_name,
+                                     const string& bucket_id,
+                                     const rgw_obj_key& obj_key) {
+  return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
+    ":" + obj_key.name + ":" + obj_key.instance;
+}
+
+// Out-parameter wrapper around objexp_hint_get_shardname().
+static void objexp_get_shard(int shard_num,
+                             string *shard)
+{
+  shard->assign(objexp_hint_get_shardname(shard_num));
+}
+
+// Decode a cls_timeindex entry's opaque value into an objexp_hint_entry.
+// Returns 0 on success, -EINVAL if the blob cannot be decoded.
+static int objexp_hint_parse(const DoutPrefixProvider *dpp, CephContext *cct, cls_timeindex_entry &ti_entry,
+                             objexp_hint_parse_t *hint_entry) = delete; // (unused overload guard removed)
+
+// Record a deletion hint for an object expiring at 'delete_at'.  The hint
+// is encoded and appended, keyed by its expiration time, to the
+// cls_timeindex shard (in the zone's log pool) derived from the object key.
+// Returns 0 on success or a negative error from opening/operating on the
+// shard object.
+int RGWObjExpStore::objexp_hint_add(const DoutPrefixProvider *dpp, 
+                              const ceph::real_time& delete_at,
+                              const string& tenant_name,
+                              const string& bucket_name,
+                              const string& bucket_id,
+                              const rgw_obj_index_key& obj_key)
+{
+  const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
+          bucket_id, obj_key);
+  objexp_hint_entry he = {
+      .tenant = tenant_name,
+      .bucket_name = bucket_name,
+      .bucket_id = bucket_id,
+      .obj_key = obj_key,
+      .exp_time = delete_at };
+  bufferlist hebl;
+  encode(he, hebl);
+  librados::ObjectWriteOperation op;
+  cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
+
+  // shard selection hashes the object key so hints spread across shards
+  string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key, cct->_conf->rgw_objexp_hints_num_shards));
+  auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, shard_name));
+  int r = obj.open(dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
+    return r;
+  }
+  return obj.operate(dpp, &op, null_yield);
+}
+
+// List removal hints in shard 'oid' whose timestamps fall in
+// [start_time, end_time), starting after 'marker', up to 'max_entries'.
+// A missing shard object (-ENOENT) is treated as an empty listing.
+int RGWObjExpStore::objexp_hint_list(const DoutPrefixProvider *dpp, 
+                               const string& oid,
+                               const ceph::real_time& start_time,
+                               const ceph::real_time& end_time,
+                               const int max_entries,
+                               const string& marker,
+                               list<cls_timeindex_entry>& entries, /* out */
+                               string *out_marker,                 /* out */
+                               bool *truncated)                    /* out */
+{
+  librados::ObjectReadOperation op;
+  cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
+        out_marker, truncated);
+
+  auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, oid));
+  int r = obj.open(dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
+    return r;
+  }
+  bufferlist obl;
+  int ret = obj.operate(dpp, &op, &obl, null_yield);
+
+  // -ENOENT is not an error: the shard simply has no hints yet
+  if ((ret < 0 ) && (ret != -ENOENT)) {
+    return ret;
+  }
+
+  if ((ret == -ENOENT) && truncated) {
+    *truncated = false;
+  }
+
+  return 0;
+}
+
+static int cls_timeindex_trim_repeat(const DoutPrefixProvider *dpp, 
+                                rgw_rados_ref ref,
+                                const string& oid,
+                                const utime_t& from_time,
+                                const utime_t& to_time,
+                                const string& from_marker,
+                                const string& to_marker)
+{
+  bool done = false;
+  do {
+    librados::ObjectWriteOperation op;
+    cls_timeindex_trim(op, from_time, to_time, from_marker, to_marker);
+    int r = rgw_rados_operate(dpp, ref.pool.ioctx(), oid, &op, null_yield);
+    if (r == -ENODATA)
+      done = true;
+    else if (r < 0)
+      return r;
+  } while (!done);
+
+  return 0;
+}
+
+// Trim already-processed removal hints from shard 'oid' within the given
+// time/marker window.  A missing shard object (-ENOENT) is not an error.
+int RGWObjExpStore::objexp_hint_trim(const DoutPrefixProvider *dpp, 
+                               const string& oid,
+                               const ceph::real_time& start_time,
+                               const ceph::real_time& end_time,
+                               const string& from_marker,
+                               const string& to_marker)
+{
+  auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, oid));
+  int r = obj.open(dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
+    return r;
+  }
+  auto& ref = obj.get_ref();
+  int ret = cls_timeindex_trim_repeat(dpp, ref, oid, utime_t(start_time), utime_t(end_time),
+          from_marker, to_marker);
+  if ((ret < 0 ) && (ret != -ENOENT)) {
+    return ret;
+  }
+
+  return 0;
+}
+
+int RGWObjectExpirer::garbage_single_object(const DoutPrefixProvider *dpp, objexp_hint_entry& hint)
+{
+  RGWBucketInfo bucket_info;
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+
+  int ret = driver->get_bucket(dpp, nullptr, rgw_bucket(hint.tenant, hint.bucket_name, hint.bucket_id), &bucket, null_yield);
+  if (-ENOENT == ret) {
+    ldpp_dout(dpp, 15) << "NOTICE: cannot find bucket = " \
+        << hint.bucket_name << ". The object must be already removed" << dendl;
+    return -ERR_PRECONDITION_FAILED;
+  } else if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: could not init bucket = " \
+        << hint.bucket_name << "due to ret = " << ret << dendl;
+    return ret;
+  }
+
+  rgw_obj_key key = hint.obj_key;
+  if (key.instance.empty()) {
+    key.instance = "null";
+  }
+
+  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
+  obj->set_atomic();
+  ret = obj->delete_object(dpp, null_yield);
+
+  return ret;
+}
+
+void RGWObjectExpirer::garbage_chunk(const DoutPrefixProvider *dpp, 
+                                  list<cls_timeindex_entry>& entries,      /* in  */
+                                  bool& need_trim)                         /* out */
+{
+  need_trim = false;
+
+  for (list<cls_timeindex_entry>::iterator iter = entries.begin();
+       iter != entries.end();
+       ++iter)
+  {
+    objexp_hint_entry hint;
+    ldpp_dout(dpp, 15) << "got removal hint for: " << iter->key_ts.sec() \
+        << " - " << iter->key_ext << dendl;
+
+    int ret = objexp_hint_parse(dpp, driver->ctx(), *iter, &hint);
+    if (ret < 0) {
+      ldpp_dout(dpp, 1) << "cannot parse removal hint for " << hint.obj_key << dendl;
+      continue;
+    }
+
+    /* PRECOND_FAILED simply means that our hint is not valid.
+     * We can silently ignore that and move forward. */
+    ret = garbage_single_object(dpp, hint);
+    if (ret == -ERR_PRECONDITION_FAILED) {
+      ldpp_dout(dpp, 15) << "not actual hint for object: " << hint.obj_key << dendl;
+    } else if (ret < 0) {
+      ldpp_dout(dpp, 1) << "cannot remove expired object: " << hint.obj_key << dendl;
+    }
+
+    need_trim = true;
+  }
+
+  return;
+}
+
+void RGWObjectExpirer::trim_chunk(const DoutPrefixProvider *dpp, 
+                                  const string& shard,
+                                  const utime_t& from,
+                                  const utime_t& to,
+                                  const string& from_marker,
+                                  const string& to_marker)
+{
+  ldpp_dout(dpp, 20) << "trying to trim removal hints to=" << to
+                          << ", to_marker=" << to_marker << dendl;
+
+  real_time rt_from = from.to_real_time();
+  real_time rt_to = to.to_real_time();
+
+  int ret = exp_store.objexp_hint_trim(dpp, shard, rt_from, rt_to,
+                                       from_marker, to_marker);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR during trim: " << ret << dendl;
+  }
+
+  return;
+}
+
+// Process one hint shard under an exclusive cls_lock: repeatedly list
+// hints in (last_run, round_start], delete the referenced objects, and
+// trim the consumed window.  Returns true if the shard was fully drained
+// within the time budget, false if interrupted (lock busy or out of time).
+bool RGWObjectExpirer::process_single_shard(const DoutPrefixProvider *dpp, 
+                                            const string& shard,
+                                            const utime_t& last_run,
+                                            const utime_t& round_start)
+{
+  string marker;
+  string out_marker;
+  bool truncated = false;
+  bool done = true;
+
+  CephContext *cct = driver->ctx();
+  int num_entries = cct->_conf->rgw_objexp_chunk_size;
+
+  // time budget: give up the shard once a full gc interval has elapsed so
+  // the lock is not held past its duration
+  int max_secs = cct->_conf->rgw_objexp_gc_interval;
+  utime_t end = ceph_clock_now();
+  end += max_secs;
+
+  rados::cls::lock::Lock l(objexp_lock_name);
+
+  // the lock auto-expires after max_secs even if we crash mid-round
+  utime_t time(max_secs, 0);
+  l.set_duration(time);
+
+  int ret = l.lock_exclusive(&static_cast<rgw::sal::RadosStore*>(driver)->getRados()->objexp_pool_ctx, shard);
+  if (ret == -EBUSY) { /* already locked by another processor */
+    ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " << shard << dendl;
+    return false;
+  }
+  // NOTE(review): lock errors other than -EBUSY fall through and proceed as
+  // if locked -- confirm this is intentional
+
+  do {
+    real_time rt_last = last_run.to_real_time();
+    real_time rt_start = round_start.to_real_time();
+
+    list<cls_timeindex_entry> entries;
+    ret = exp_store.objexp_hint_list(dpp, shard, rt_last, rt_start,
+                                     num_entries, marker, entries,
+                                     &out_marker, &truncated);
+    if (ret < 0) {
+      ldpp_dout(dpp, 10) << "cannot get removal hints from shard: " << shard
+                     << dendl;
+      // NOTE(review): 'continue' re-tests while(truncated); if a previous
+      // iteration left truncated==true and listing keeps failing this can
+      // spin until the time budget runs out -- confirm
+      continue;
+    }
+
+    bool need_trim;
+    garbage_chunk(dpp, entries, need_trim);
+
+    if (need_trim) {
+      trim_chunk(dpp, shard, last_run, round_start, marker, out_marker);
+    }
+
+    // out of budget: report not-done so the next round resumes this shard
+    utime_t now = ceph_clock_now();
+    if (now >= end) {
+      done = false;
+      break;
+    }
+
+    marker = out_marker;
+  } while (truncated);
+
+  l.unlock(&static_cast<rgw::sal::RadosStore*>(driver)->getRados()->objexp_pool_ctx, shard);
+  return done;
+}
+
+/* Returns true if all shards have been processed successfully. */
+bool RGWObjectExpirer::inspect_all_shards(const DoutPrefixProvider *dpp, 
+                                          const utime_t& last_run,
+                                          const utime_t& round_start)
+{
+  CephContext * const cct = driver->ctx();
+  int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
+  bool all_done = true;
+
+  for (int i = 0; i < num_shards; i++) {
+    string shard;
+    objexp_get_shard(i, &shard);
+
+    ldpp_dout(dpp, 20) << "processing shard = " << shard << dendl;
+
+    if (! process_single_shard(dpp, shard, last_run, round_start)) {
+      all_done = false;
+    }
+  }
+
+  return all_done;
+}
+
+// True once stop_processor() has been requested.
+bool RGWObjectExpirer::going_down()
+{
+  return down_flag;
+}
+
+// Spawn the background expiration worker thread.
+// NOTE(review): 'worker' is owned by this class and released in
+// stop_processor(); calling start_processor() twice would leak the first
+// worker -- confirm single-start is guaranteed by callers.
+void RGWObjectExpirer::start_processor()
+{
+  worker = new OEWorker(driver->ctx(), this);
+  worker->create("rgw_obj_expirer");
+}
+
+// Signal the worker to exit, wake it from its sleep, join and free it.
+// Safe to call when no worker was started.
+void RGWObjectExpirer::stop_processor()
+{
+  down_flag = true;
+  if (worker) {
+    worker->stop();
+    worker->join();
+  }
+  delete worker;
+  worker = NULL;
+}
+
+// Worker thread main loop: scan all shards each round, then sleep out the
+// remainder of rgw_objexp_gc_interval (interruptible via stop()).
+void *RGWObjectExpirer::OEWorker::entry() {
+  utime_t last_run;
+  do {
+    utime_t start = ceph_clock_now();
+    ldpp_dout(this, 2) << "object expiration: start" << dendl;
+    if (oe->inspect_all_shards(this, last_run, start)) {
+      /* All shards have been processed properly. Next time we can start
+       * from this moment. */
+      last_run = start;
+    }
+    ldpp_dout(this, 2) << "object expiration: stop" << dendl;
+
+
+    if (oe->going_down())
+      break;
+
+    // sleep only for the part of the interval not spent working
+    utime_t end = ceph_clock_now();
+    end -= start;
+    int secs = cct->_conf->rgw_objexp_gc_interval;
+
+    if (secs <= end.sec())
+      continue; // next round
+
+    secs -= end.sec();
+
+    // interruptible sleep: stop() notifies this condition variable
+    std::unique_lock l{lock};
+    cond.wait_for(l, std::chrono::seconds(secs));
+  } while (!oe->going_down());
+
+  return NULL;
+}
+
+// Wake the worker from its inter-round sleep so it re-checks going_down().
+void RGWObjectExpirer::OEWorker::stop()
+{
+  std::lock_guard l{lock};
+  cond.notify_all();
+}
+
+// DoutPrefixProvider: context used for log output.
+CephContext *RGWObjectExpirer::OEWorker::get_cct() const 
+{ 
+  return cct; 
+}
+
+// DoutPrefixProvider: log under the rgw subsystem.
+unsigned RGWObjectExpirer::OEWorker::get_subsys() const 
+{
+    return dout_subsys;
+}
+
+// DoutPrefixProvider: prefix every log line from this worker thread.
+std::ostream& RGWObjectExpirer::OEWorker::gen_prefix(std::ostream& out) const 
+{ 
+  return out << "rgw object expirer Worker thread: "; 
+}
diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.h b/src/rgw/driver/rados/rgw_object_expirer_core.h
new file mode 100644 (file)
index 0000000..fccd419
--- /dev/null
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_OBJEXP_H
+#define CEPH_OBJEXP_H
+
+#include <atomic>
+#include <string>
+#include <cerrno>
+#include <sstream>
+#include <iostream>
+
+#include "auth/Crypto.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "global/global_init.h"
+
+#include "include/common_fwd.h"
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_sal_rados.h"
+
+class RGWSI_RADOS;
+class RGWSI_Zone;
+class RGWBucketInfo;
+class cls_timeindex_entry;
+
+// Accessor for the "objexp" hint store: expiration hints are kept as
+// cls_timeindex entries spread over a set of shard objects.
+class RGWObjExpStore {
+  CephContext *cct;
+  RGWSI_RADOS *rados_svc;
+  rgw::sal::RadosStore* driver;
+public:
+  RGWObjExpStore(CephContext *_cct, RGWSI_RADOS *_rados_svc, rgw::sal::RadosStore* _driver) : cct(_cct),
+                                                                                      rados_svc(_rados_svc),
+                                                                                      driver(_driver) {}
+
+  // Record a hint that the named object should be expired at delete_at.
+  int objexp_hint_add(const DoutPrefixProvider *dpp, 
+                      const ceph::real_time& delete_at,
+                      const std::string& tenant_name,
+                      const std::string& bucket_name,
+                      const std::string& bucket_id,
+                      const rgw_obj_index_key& obj_key);
+
+  // List up to max_entries hints from shard `oid` in [start_time, end_time),
+  // resuming from `marker`; reports a continuation marker and truncation.
+  int objexp_hint_list(const DoutPrefixProvider *dpp, 
+                       const std::string& oid,
+                       const ceph::real_time& start_time,
+                       const ceph::real_time& end_time,
+                       const int max_entries,
+                       const std::string& marker,
+                       std::list<cls_timeindex_entry>& entries, /* out */
+                       std::string *out_marker,                 /* out */
+                       bool *truncated);                   /* out */
+
+  // Remove already-processed hints from shard `oid`.
+  int objexp_hint_trim(const DoutPrefixProvider *dpp, 
+                       const std::string& oid,
+                       const ceph::real_time& start_time,
+                       const ceph::real_time& end_time,
+                       const std::string& from_marker,
+                       const std::string& to_marker);
+};
+
+class RGWObjectExpirer {
+protected:
+  rgw::sal::Driver* driver;
+  RGWObjExpStore exp_store;
+
+  class OEWorker : public Thread, public DoutPrefixProvider {
+    CephContext *cct;
+    RGWObjectExpirer *oe;
+    ceph::mutex lock = ceph::make_mutex("OEWorker");
+    ceph::condition_variable cond;
+
+  public:
+    OEWorker(CephContext * const cct,
+             RGWObjectExpirer * const oe)
+      : cct(cct),
+        oe(oe) {
+    }
+
+    void *entry() override;
+    void stop();
+
+    CephContext *get_cct() const override;
+    unsigned get_subsys() const override;
+    std::ostream& gen_prefix(std::ostream& out) const override;
+  };
+
+  OEWorker *worker{nullptr};
+  std::atomic<bool> down_flag = { false };
+
+public:
+  explicit RGWObjectExpirer(rgw::sal::Driver* _driver)
+    : driver(_driver),
+      exp_store(_driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados, static_cast<rgw::sal::RadosStore*>(driver)),
+      worker(NULL) {
+  }
+  ~RGWObjectExpirer() {
+    stop_processor();
+  }
+
+  int hint_add(const DoutPrefixProvider *dpp, 
+               const ceph::real_time& delete_at,
+               const std::string& tenant_name,
+               const std::string& bucket_name,
+               const std::string& bucket_id,
+               const rgw_obj_index_key& obj_key) {
+    return exp_store.objexp_hint_add(dpp, delete_at, tenant_name, bucket_name,
+                                     bucket_id, obj_key);
+  }
+
+  int garbage_single_object(const DoutPrefixProvider *dpp, objexp_hint_entry& hint);
+
+  void garbage_chunk(const DoutPrefixProvider *dpp, 
+                     std::list<cls_timeindex_entry>& entries, /* in  */
+                     bool& need_trim);                        /* out */
+
+  void trim_chunk(const DoutPrefixProvider *dpp, 
+                  const std::string& shard,
+                  const utime_t& from,
+                  const utime_t& to,
+                  const std::string& from_marker,
+                  const std::string& to_marker);
+
+  bool process_single_shard(const DoutPrefixProvider *dpp, 
+                            const std::string& shard,
+                            const utime_t& last_run,
+                            const utime_t& round_start);
+
+  bool inspect_all_shards(const DoutPrefixProvider *dpp, 
+                          const utime_t& last_run,
+                          const utime_t& round_start);
+
+  bool going_down();
+  void start_processor();
+  void stop_processor();
+};
+#endif /* CEPH_OBJEXP_H */
diff --git a/src/rgw/driver/rados/rgw_otp.cc b/src/rgw/driver/rados/rgw_otp.cc
new file mode 100644 (file)
index 0000000..07cc14f
--- /dev/null
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+
+#include <string>
+#include <map>
+#include <boost/algorithm/string.hpp>
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include "rgw_otp.h"
+#include "rgw_zone.h"
+#include "rgw_metadata.h"
+
+#include "include/types.h"
+
+#include "rgw_common.h"
+#include "rgw_tools.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_meta.h"
+#include "services/svc_meta_be.h"
+#include "services/svc_meta_be_otp.h"
+#include "services/svc_otp.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+
+class RGWOTPMetadataHandler;
+
+// In-memory metadata object holding a user's OTP (MFA) device list.
+class RGWOTPMetadataObject : public RGWMetadataObject {
+  friend class RGWOTPMetadataHandler;
+
+  otp_devices_list_t devices;
+public:
+  RGWOTPMetadataObject() {}
+  RGWOTPMetadataObject(otp_devices_list_t&& _devices, const obj_version& v, const real_time m) {
+    devices = std::move(_devices);
+    objv = v;
+    mtime = m;
+  }
+
+  // Serialize the device list as JSON under the "devices" key.
+  void dump(Formatter *f) const override {
+    encode_json("devices", devices, f);
+  }
+
+  otp_devices_list_t& get_devs() {
+    return devices;
+  }
+};
+
+
+// Metadata handler for "otp" entries: adapts the generic metadata backend
+// protocol (get/put/remove + JSON round-trip) to the RGWSI_OTP service.
+class RGWOTPMetadataHandler : public RGWOTPMetadataHandlerBase {
+  friend class RGWOTPCtl;
+
+  struct Svc {
+    RGWSI_Zone *zone;
+    RGWSI_MetaBackend *meta_be;
+    RGWSI_OTP *otp;
+  } svc;
+
+  int init(RGWSI_Zone *zone,
+           RGWSI_MetaBackend *_meta_be,
+           RGWSI_OTP *_otp) {
+    base_init(zone->ctx(), _otp->get_be_handler().get());
+    svc.zone = zone;
+    svc.meta_be = _meta_be;
+    svc.otp = _otp;
+    return 0;
+  }
+
+  // Run f inside a metadata-backend op context.
+  int call(std::function<int(RGWSI_OTP_BE_Ctx& ctx)> f) {
+    return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+      RGWSI_OTP_BE_Ctx ctx(op->ctx());
+      return f(ctx);
+    });
+  }
+
+  // Decode a metadata object from JSON; returns nullptr on decode error.
+  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
+    otp_devices_list_t devices;
+    try {
+      JSONDecoder::decode_json("devices", devices, jo);
+    } catch (JSONDecoder::err& e) {
+      return nullptr;
+    }
+
+    return new RGWOTPMetadataObject(std::move(devices), objv, mtime);
+  }
+
+  // Read the device list for `entry`; on success hands ownership of the
+  // freshly-built metadata object to the caller via *obj.
+  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
+    RGWObjVersionTracker objv_tracker;
+
+    std::unique_ptr<RGWOTPMetadataObject> mdo(new RGWOTPMetadataObject);
+
+    
+    RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
+
+    int ret = svc.otp->read_all(be_ctx,
+                                entry,
+                                &mdo->get_devs(),
+                                &mdo->get_mtime(),
+                                &objv_tracker,
+                                y,
+                                dpp);
+    if (ret < 0) {
+      return ret;
+    }
+
+    mdo->objv = objv_tracker.read_version;
+
+    *obj = mdo.release();
+
+    return 0;
+  }
+
+  // Store the full device list for `entry`.
+  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+             RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+             optional_yield y,
+             const DoutPrefixProvider *dpp,
+             RGWMDLogSyncType type, bool from_remote_zone) override {
+    RGWOTPMetadataObject *obj = static_cast<RGWOTPMetadataObject *>(_obj);
+
+    RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
+
+    int ret = svc.otp->store_all(dpp, be_ctx,
+                                 entry,
+                                 obj->devices,
+                                 obj->mtime,
+                                 &objv_tracker,
+                                 y);
+    if (ret < 0) {
+      return ret;
+    }
+
+    // signal to the metadata layer that the change was applied
+    return STATUS_APPLIED;
+  }
+
+  // Remove all OTP devices stored under `entry`.
+  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+                optional_yield y, const DoutPrefixProvider *dpp) override {
+    RGWSI_MBOTP_RemoveParams params;
+
+    RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
+
+    return svc.otp->remove_all(dpp, be_ctx,
+                               entry,
+                               &objv_tracker,
+                               y);
+  }
+
+public:
+  RGWOTPMetadataHandler() {}
+
+  string get_type() override { return "otp"; }
+};
+
+
+// Wire up the zone and OTP services; init() must still be called with the
+// metadata handler before the controller is usable.
+RGWOTPCtl::RGWOTPCtl(RGWSI_Zone *zone_svc,
+                    RGWSI_OTP *otp_svc)
+{
+  svc.zone = zone_svc;
+  svc.otp = otp_svc;
+}
+
+
+// Attach the metadata handler and cache its backend handler.
+void RGWOTPCtl::init(RGWOTPMetadataHandler *_meta_handler)
+{
+  meta_handler = _meta_handler;
+  be_handler = meta_handler->get_be_handler();
+}
+
+// Read the full OTP device list for `uid` into *info.
+int RGWOTPCtl::read_all(const rgw_user& uid,
+                        RGWOTPInfo *info,
+                        optional_yield y,
+                        const DoutPrefixProvider *dpp,
+                        const GetParams& params)
+{
+  info->uid = uid;
+  return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
+    return svc.otp->read_all(ctx, uid, &info->devices, params.mtime, params.objv_tracker, y, dpp);
+  });
+}
+
+// Persist the full OTP device list carried in `info`.
+int RGWOTPCtl::store_all(const DoutPrefixProvider *dpp, 
+                         const RGWOTPInfo& info,
+                         optional_yield y,
+                         const PutParams& params)
+{
+  return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
+    return svc.otp->store_all(dpp, ctx, info.uid, info.devices, params.mtime, params.objv_tracker, y);
+  });
+}
+
+// Remove every OTP device registered for `uid`.
+int RGWOTPCtl::remove_all(const DoutPrefixProvider *dpp,
+                          const rgw_user& uid,
+                          optional_yield y,
+                          const RemoveParams& params)
+{
+  return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
+    return svc.otp->remove_all(dpp, ctx, uid, params.objv_tracker, y);
+  });
+}
+
+
+// Factory: build the concrete otp metadata handler (caller owns the result).
+RGWMetadataHandler *RGWOTPMetaHandlerAllocator::alloc()
+{
+  return new RGWOTPMetadataHandler();
+}
diff --git a/src/rgw/driver/rados/rgw_otp.h b/src/rgw/driver/rados/rgw_otp.h
new file mode 100644 (file)
index 0000000..eacff15
--- /dev/null
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_OTP_H
+#define CEPH_RGW_OTP_H
+
+#include "rgw_sal_fwd.h"
+#include "cls/otp/cls_otp_types.h"
+#include "services/svc_meta_be_otp.h"
+
+#include "rgw_basic_types.h"
+#include "rgw_metadata.h"
+
+
+class RGWObjVersionTracker;
+class RGWMetadataHandler;
+class RGWOTPMetadataHandler;
+class RGWSI_Zone;
+class RGWSI_OTP;
+class RGWSI_MetaBackend;
+
+// Abstract base so rgw_otp.cc can keep the concrete handler private while
+// callers initialize it through this interface.
+class RGWOTPMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
+public:
+  virtual ~RGWOTPMetadataHandlerBase() {}
+  virtual int init(RGWSI_Zone *zone,
+                  RGWSI_MetaBackend *_meta_be,
+                  RGWSI_OTP *_otp) = 0;
+};
+
+// Factory for the concrete otp metadata handler (defined in rgw_otp.cc).
+class RGWOTPMetaHandlerAllocator {
+public:
+  static RGWMetadataHandler *alloc();
+};
+
+// A user's complete OTP (MFA) state: owner plus registered devices.
+struct RGWOTPInfo {
+  rgw_user uid;
+  otp_devices_list_t devices;
+};
+
+
+class RGWOTPCtl
+{
+  struct Svc {
+    RGWSI_Zone *zone{nullptr};
+    RGWSI_OTP *otp{nullptr};
+  } svc;
+
+  RGWOTPMetadataHandler *meta_handler;
+  RGWSI_MetaBackend_Handler *be_handler;
+  
+public:
+  RGWOTPCtl(RGWSI_Zone *zone_svc,
+           RGWSI_OTP *otp_svc);
+
+  void init(RGWOTPMetadataHandler *_meta_handler);
+
+  struct GetParams {
+    RGWObjVersionTracker *objv_tracker{nullptr};
+    ceph::real_time *mtime{nullptr};
+
+    GetParams() {}
+
+    GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+      objv_tracker = _objv_tracker;
+      return *this;
+    }
+
+    GetParams& set_mtime(ceph::real_time *_mtime) {
+      mtime = _mtime;
+      return *this;
+    }
+  };
+
+  struct PutParams {
+    RGWObjVersionTracker *objv_tracker{nullptr};
+    ceph::real_time mtime;
+
+    PutParams() {}
+
+    PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+      objv_tracker = _objv_tracker;
+      return *this;
+    }
+
+    PutParams& set_mtime(const ceph::real_time& _mtime) {
+      mtime = _mtime;
+      return *this;
+    }
+  };
+
+  struct RemoveParams {
+    RGWObjVersionTracker *objv_tracker{nullptr};
+
+    RemoveParams() {}
+
+    RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+      objv_tracker = _objv_tracker;
+      return *this;
+    }
+  };
+
+  int read_all(const rgw_user& uid, RGWOTPInfo *info, optional_yield y,
+               const DoutPrefixProvider *dpp,
+               const GetParams& params = {});
+  int store_all(const DoutPrefixProvider *dpp, 
+                const RGWOTPInfo& info, optional_yield y,
+                const PutParams& params = {});
+  int remove_all(const DoutPrefixProvider *dpp, 
+                 const rgw_user& user, optional_yield y,
+                 const RemoveParams& params = {});
+};
+
+#endif
+
diff --git a/src/rgw/driver/rados/rgw_period.cc b/src/rgw/driver/rados/rgw_period.cc
new file mode 100644 (file)
index 0000000..61602b3
--- /dev/null
@@ -0,0 +1,324 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync.h"
+
+#include "services/svc_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw_zone_defaults;
+
+// Look up a zonegroup in this period's map; an empty id selects "default".
+// Returns 0 and copies into `zonegroup`, or -ENOENT if not found.
+int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup,
+                             const string& zonegroup_id) const
+{
+  map<string, RGWZoneGroup>::const_iterator iter;
+  if (!zonegroup_id.empty()) {
+    iter = period_map.zonegroups.find(zonegroup_id);
+  } else {
+    iter = period_map.zonegroups.find("default");
+  }
+  if (iter != period_map.zonegroups.end()) {
+    zonegroup = iter->second;
+    return 0;
+  }
+
+  return -ENOENT;
+}
+
+// Fetch the latest epoch recorded for this period id from the
+// .latest_epoch object.
+int RGWPeriod::get_latest_epoch(const DoutPrefixProvider *dpp, epoch_t& latest_epoch, optional_yield y)
+{
+  RGWPeriodLatestEpochInfo info;
+
+  int ret = read_latest_epoch(dpp, info, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  latest_epoch = info.epoch;
+
+  return 0;
+}
+
+// Delete every epoch object for this period, then its .latest_epoch object.
+// Per-epoch failures are logged and skipped; the return value reflects only
+// the final .latest_epoch removal.
+int RGWPeriod::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  rgw_pool pool(get_pool(cct));
+
+  // delete the object for each period epoch
+  for (epoch_t e = 1; e <= epoch; e++) {
+    RGWPeriod p{get_id(), e};
+    rgw_raw_obj oid{pool, p.get_period_oid()};
+    auto sysobj = sysobj_svc->get_obj(oid);
+    int ret = sysobj.wop().remove(dpp, y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "WARNING: failed to delete period object " << oid
+          << ": " << cpp_strerror(-ret) << dendl;
+    }
+  }
+
+  // delete the .latest_epoch object
+  rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
+  auto sysobj = sysobj_svc->get_obj(oid);
+  int ret = sysobj.wop().remove(dpp, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "WARNING: failed to delete period object " << oid
+        << ": " << cpp_strerror(-ret) << dendl;
+  }
+  return ret;
+}
+
+// Merge a zonegroup into the period map and persist the period.
+// Zonegroups from other realms are silently ignored (returns 0).
+int RGWPeriod::add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y)
+{
+  if (zonegroup.realm_id != realm_id) {
+    return 0;
+  }
+  int ret = period_map.update(zonegroup, cct);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  return store_info(dpp, false, y);
+}
+
+int RGWPeriod::update(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  auto zone_svc = sysobj_svc->get_zone_svc();
+  ldpp_dout(dpp, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
+  list<string> zonegroups;
+  int ret = zone_svc->list_zonegroups(dpp, zonegroups);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  // clear zone short ids of removed zones. period_map.update() will add the
+  // remaining zones back
+  period_map.short_zone_ids.clear();
+
+  for (auto& iter : zonegroups) {
+    RGWZoneGroup zg(string(), iter);
+    ret = zg.init(dpp, cct, sysobj_svc, y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
+      continue;
+    }
+
+    if (zg.realm_id != realm_id) {
+      ldpp_dout(dpp, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
+      continue;
+    }
+
+    if (zg.master_zone.empty()) {
+      ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
+      return -EINVAL;
+    }
+
+    if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
+      ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name()
+                   << " has a non existent master zone "<< dendl;
+      return -EINVAL;
+    }
+
+    if (zg.is_master_zonegroup()) {
+      master_zonegroup = zg.get_id();
+      master_zone = zg.master_zone;
+    }
+
+    int ret = period_map.update(zg, cct);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
+  ret = period_config.read(dpp, sysobj_svc, realm_id, y);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to read period config: "
+        << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+// Start a staging period derived from this one: remember the current id as
+// predecessor, switch to the realm's staging id, reset the map, and bump
+// the realm epoch.
+void RGWPeriod::fork()
+{
+  ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
+  predecessor_uuid = id;
+  id = get_staging_id(realm_id);
+  period_map.reset();
+  realm_epoch++;
+}
+
+// Read this zone's metadata sync status via a temporary sync status manager.
+// NOTE(review): downcasts to RadosStore — RADOS-backed drivers only.
+static int read_sync_status(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw_meta_sync_status *sync_status)
+{
+  rgw::sal::RadosStore* rados_store = static_cast<rgw::sal::RadosStore*>(driver);
+  // initialize a sync status manager to read the status
+  RGWMetaSyncStatusManager mgr(rados_store, rados_store->svc()->rados->get_async_processor());
+  int r = mgr.init(dpp);
+  if (r < 0) {
+    return r;
+  }
+  r = mgr.read_sync_status(dpp, sync_status);
+  mgr.stop();
+  return r;
+}
+
+// Capture this zone's metadata sync markers into the period, so other zones
+// know where sync stood when the master changed. Refuses (-EINVAL) if the
+// zone is behind on metadata sync unless force_if_stale is set.
+int RGWPeriod::update_sync_status(const DoutPrefixProvider *dpp,
+                                  rgw::sal::Driver* driver, /* for now */
+                                 const RGWPeriod &current_period,
+                                  std::ostream& error_stream,
+                                  bool force_if_stale)
+{
+  rgw_meta_sync_status status;
+  int r = read_sync_status(dpp, driver, &status);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "period failed to read sync status: "
+        << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  std::vector<std::string> markers;
+
+  const auto current_epoch = current_period.get_realm_epoch();
+  if (current_epoch != status.sync_info.realm_epoch) {
+    // no sync status markers for the current period
+    ceph_assert(current_epoch > status.sync_info.realm_epoch);
+    const int behind = current_epoch - status.sync_info.realm_epoch;
+    if (!force_if_stale && current_epoch > 1) {
+      error_stream << "ERROR: This zone is " << behind << " period(s) behind "
+          "the current master zone in metadata sync. If this zone is promoted "
+          "to master, any metadata changes during that time are likely to "
+          "be lost.\n"
+          "Waiting for this zone to catch up on metadata sync (see "
+          "'radosgw-admin sync status') is recommended.\n"
+          "To promote this zone to master anyway, add the flag "
+          "--yes-i-really-mean-it." << std::endl;
+      return -EINVAL;
+    }
+    // empty sync status markers - other zones will skip this period during
+    // incremental metadata sync
+    markers.resize(status.sync_info.num_shards);
+  } else {
+    markers.reserve(status.sync_info.num_shards);
+    for (auto& i : status.sync_markers) {
+      auto& marker = i.second;
+      // filter out markers from other periods
+      if (marker.realm_epoch != current_epoch) {
+        marker.marker.clear();
+      }
+      markers.emplace_back(std::move(marker.marker));
+    }
+  }
+
+  // publish the collected markers as this period's sync_status
+  std::swap(sync_status, markers);
+  return 0;
+}
+
+// Commit a staged period on the master zone. Validates that we are the
+// master, that this period directly succeeds current_period, and that the
+// realm epoch advances by exactly one. A master-zone change creates a new
+// period id and promotes it; otherwise the change is stored as the next
+// epoch of the current period. Human-readable rejections go to error_stream.
+int RGWPeriod::commit(const DoutPrefixProvider *dpp,
+                     rgw::sal::Driver* driver,
+                     RGWRealm& realm, const RGWPeriod& current_period,
+                      std::ostream& error_stream, optional_yield y,
+                     bool force_if_stale)
+{
+  auto zone_svc = sysobj_svc->get_zone_svc();
+  ldpp_dout(dpp, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
+  // gateway must be in the master zone to commit
+  if (master_zone != zone_svc->get_zone_params().get_id()) {
+    error_stream << "Cannot commit period on zone "
+        << zone_svc->get_zone_params().get_id() << ", it must be sent to "
+        "the period's master zone " << master_zone << '.' << std::endl;
+    return -EINVAL;
+  }
+  // period predecessor must match current period
+  if (predecessor_uuid != current_period.get_id()) {
+    error_stream << "Period predecessor " << predecessor_uuid
+        << " does not match current period " << current_period.get_id()
+        << ". Use 'period pull' to get the latest period from the master, "
+        "reapply your changes, and try again." << std::endl;
+    return -EINVAL;
+  }
+  // realm epoch must be 1 greater than current period
+  if (realm_epoch != current_period.get_realm_epoch() + 1) {
+    error_stream << "Period's realm epoch " << realm_epoch
+        << " does not come directly after current realm epoch "
+        << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
+        "latest realm and period from the master zone, reapply your changes, "
+        "and try again." << std::endl;
+    return -EINVAL;
+  }
+  // did the master zone change?
+  if (master_zone != current_period.get_master_zone()) {
+    // store the current metadata sync status in the period
+    int r = update_sync_status(dpp, driver, current_period, error_stream, force_if_stale);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "failed to update metadata sync status: "
+          << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    // create an object with a new period id
+    r = create(dpp, y, true);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    // set as current period
+    r = realm.set_current_period(dpp, *this, y);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "failed to update realm's current period: "
+          << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    ldpp_dout(dpp, 4) << "Promoted to master zone and committed new period "
+        << id << dendl;
+    realm.notify_new_period(dpp, *this, y);
+    return 0;
+  }
+  // period must be based on current epoch
+  if (epoch != current_period.get_epoch()) {
+    error_stream << "Period epoch " << epoch << " does not match "
+        "predecessor epoch " << current_period.get_epoch()
+        << ". Use 'period pull' to get the latest epoch from the master zone, "
+        "reapply your changes, and try again." << std::endl;
+    return -EINVAL;
+  }
+  // set period as next epoch
+  set_id(current_period.get_id());
+  set_epoch(current_period.get_epoch() + 1);
+  set_predecessor(current_period.get_predecessor());
+  realm_epoch = current_period.get_realm_epoch();
+  // write the period to rados
+  int r = store_info(dpp, false, y);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
+    return r;
+  }
+  // set as latest epoch
+  r = update_latest_epoch(dpp, epoch, y);
+  if (r == -EEXIST) {
+    // already have this epoch (or a more recent one)
+    return 0;
+  }
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
+    return r;
+  }
+  r = reflect(dpp, y);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
+    return r;
+  }
+  ldpp_dout(dpp, 4) << "Committed new epoch " << epoch
+      << " for period " << id << dendl;
+  realm.notify_new_period(dpp, *this, y);
+  return 0;
+}
+
+// Produce two default-constructed instances for encode/decode testing;
+// ownership of the heap objects transfers to the caller via `o`.
+void RGWPeriod::generate_test_instances(list<RGWPeriod*> &o)
+{
+  o.push_back(new RGWPeriod);
+  o.push_back(new RGWPeriod);
+}
+
+
diff --git a/src/rgw/driver/rados/rgw_rest_pubsub.cc b/src/rgw/driver/rados/rgw_rest_pubsub.cc
new file mode 100644 (file)
index 0000000..23d5661
--- /dev/null
@@ -0,0 +1,1069 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <boost/tokenizer.hpp>
+#include <optional>
+#include "rgw_rest_pubsub.h"
+#include "rgw_pubsub_push.h"
+#include "rgw_pubsub.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_arn.h"
+#include "rgw_auth_s3.h"
+#include "rgw_notify.h"
+#include "rgw_sal_rados.h"
+#include "services/svc_zone.h"
+#include "common/dout.h"
+#include "rgw_url.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+static const char* AWS_SNS_NS("https://sns.amazonaws.com/doc/2010-03-31/");
+
+// True when the request arrived over a secure transport, or when the
+// rgw_allow_notification_secrets_in_cleartext override is enabled (which
+// is logged loudly as a warning).
+bool verify_transport_security(CephContext *cct, const RGWEnv& env) {
+  const auto is_secure = rgw_transport_is_secure(cct, env);
+  if (!is_secure && g_conf().get_val<bool>("rgw_allow_notification_secrets_in_cleartext")) {
+    ldout(cct, 0) << "WARNING: bypassing endpoint validation, allows sending secrets over insecure transport" << dendl;
+    return true;
+  }
+  return is_secure;
+}
+
+// make sure that endpoint is a valid URL
+// make sure that if user/password are passed inside URL, it is over secure connection
+// update rgw_pubsub_sub_dest to indicate that a password is stored in the URL
+// make sure that endpoint is a valid URL
+// make sure that if user/password are passed inside URL, it is over secure connection
+// update rgw_pubsub_sub_dest to indicate that a password is stored in the URL
+// Returns false when the URL is malformed or a secret would travel in clear.
+bool validate_and_update_endpoint_secret(rgw_pubsub_sub_dest& dest, CephContext *cct, const RGWEnv& env) {
+  // no endpoint configured is a valid configuration
+  if (dest.push_endpoint.empty()) {
+      return true;
+  }
+  std::string user;
+  std::string password;
+  if (!rgw::parse_url_userinfo(dest.push_endpoint, user, password)) {
+    ldout(cct, 1) << "endpoint validation error: malformed endpoint URL:" << dest.push_endpoint << dendl;
+    return false;
+  }
+  // this should be verified inside parse_url()
+  ceph_assert(user.empty() == password.empty());
+  if (!user.empty()) {
+      dest.stored_secret = true;
+      if (!verify_transport_security(cct, env)) {
+        ldout(cct, 1) << "endpoint validation error: sending secrets over insecure transport" << dendl;
+        return false;
+      }
+  }
+  return true;
+}
+
+// True if this topic's destination carries a secret (user/password in URL).
+bool topic_has_endpoint_secret(const rgw_pubsub_topic_subs& topic) {
+  return topic.topic.dest.stored_secret;
+}
+
+// True if any topic in the set stores an endpoint secret.
+bool topics_has_endpoint_secret(const rgw_pubsub_topics& topics) {
+  return std::any_of(topics.topics.begin(), topics.topics.end(),
+                     [](const auto& entry) {
+                       return topic_has_endpoint_secret(entry.second);
+                     });
+}
+
+// command (AWS compliant): 
+// POST
+// Action=CreateTopic&Name=<topic-name>[&OpaqueData=data][&push-endpoint=<endpoint>[&persistent][&<arg1>=<value1>]]
+// Handles the SNS-compatible CreateTopic action:
+// POST Action=CreateTopic&Name=<topic-name>[&OpaqueData=...][&push-endpoint=...]
+class RGWPSCreateTopicOp : public RGWOp {
+  private:
+  std::optional<RGWPubSub> ps;
+  std::string topic_name;
+  rgw_pubsub_sub_dest dest;
+  std::string topic_arn;
+  std::string opaque_data;
+  
+  // Parse and validate request arguments; builds dest, topic_arn and, for
+  // persistent topics, creates the backing notification queue.
+  int get_params() {
+    topic_name = s->info.args.get("Name");
+    if (topic_name.empty()) {
+      ldpp_dout(this, 1) << "CreateTopic Action 'Name' argument is missing" << dendl;
+      return -EINVAL;
+    }
+
+    opaque_data = s->info.args.get("OpaqueData");
+
+    dest.push_endpoint = s->info.args.get("push-endpoint");
+    s->info.args.get_bool("persistent", &dest.persistent, false);
+
+    if (!validate_and_update_endpoint_secret(dest, s->cct, *(s->info.env))) {
+      return -EINVAL;
+    }
+    // forward all remaining args to the endpoint as its configuration
+    for (const auto& param : s->info.args.get_params()) {
+      if (param.first == "Action" || param.first == "Name" || param.first == "PayloadHash") {
+        continue;
+      }
+      dest.push_endpoint_args.append(param.first+"="+param.second+"&");
+    }
+
+    if (!dest.push_endpoint_args.empty()) {
+      // remove last separator
+      dest.push_endpoint_args.pop_back();
+    }
+    if (!dest.push_endpoint.empty() && dest.persistent) {
+      const auto ret = rgw::notify::add_persistent_topic(topic_name, s->yield);
+      if (ret < 0) {
+        ldpp_dout(this, 1) << "CreateTopic Action failed to create queue for persistent topics. error:" << ret << dendl;
+        return ret;
+      }
+    }
+    
+    // dest object only stores endpoint info
+    dest.arn_topic = topic_name;
+    // the topic ARN will be sent in the reply
+    const rgw::ARN arn(rgw::Partition::aws, rgw::Service::sns, 
+        driver->get_zone()->get_zonegroup().get_name(),
+        s->user->get_tenant(), topic_name);
+    topic_arn = arn.to_string();
+    return 0;
+  }
+
+  public:
+  // NOTE(review): no permission check — any authenticated user may create
+  // a topic; confirm this matches the intended policy.
+  int verify_permission(optional_yield) override {
+    return 0;
+  }
+
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  void execute(optional_yield) override;
+
+  const char* name() const override { return "pubsub_topic_create"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_CREATE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+
+  // Emit the SNS-style CreateTopicResponse XML (TopicArn + RequestId).
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section_in_ns("CreateTopicResponse", AWS_SNS_NS);
+    f->open_object_section("CreateTopicResult");
+    encode_xml("TopicArn", topic_arn, f); 
+    f->close_section(); // CreateTopicResult
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f); 
+    f->close_section(); // ResponseMetadata
+    f->close_section(); // CreateTopicResponse
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+// Create (or update) the topic in the requesting tenant's pubsub config.
+void RGWPSCreateTopicOp::execute(optional_yield y) {
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
+  op_ret = ps->create_topic(this, topic_name, dest, topic_arn, opaque_data, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to create topic '" << topic_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  ldpp_dout(this, 20) << "successfully created topic '" << topic_name << "'" << dendl;
+}
+
+// command (AWS compliant): 
+// POST 
+// Action=ListTopics
+// Handles the SNS-compatible ListTopics action: POST Action=ListTopics
+class RGWPSListTopicsOp : public RGWOp {
+private:
+  std::optional<RGWPubSub> ps;
+  rgw_pubsub_topics result;
+
+public:
+  // NOTE(review): no permission check — any authenticated user may list
+  // the tenant's topics; confirm this matches the intended policy.
+  int verify_permission(optional_yield) override {
+    return 0;
+  }
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  void execute(optional_yield) override;
+
+  const char* name() const override { return "pubsub_topics_list"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPICS_LIST; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+
+  // Emit the SNS-style ListTopicsResponse XML (Topics + RequestId).
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section_in_ns("ListTopicsResponse", AWS_SNS_NS);
+    f->open_object_section("ListTopicsResult");
+    encode_xml("Topics", result, f); 
+    f->close_section(); // ListTopicsResult
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f); 
+    f->close_section(); // ResponseMetadata
+    f->close_section(); // ListTopicsResponse
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+// fetch all topics of the tenant; an empty topic list is a success
+void RGWPSListTopicsOp::execute(optional_yield y) {
+  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
+  op_ret = ps->get_topics(&result);
+  // if there are no topics it is not considered an error
+  op_ret = op_ret == -ENOENT ? 0 : op_ret;
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get topics, ret=" << op_ret << dendl;
+    return;
+  }
+  // do not expose endpoint secrets (e.g. push endpoint credentials) over non-TLS transport
+  if (topics_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) {
+    ldpp_dout(this, 1) << "topics contain secrets and cannot be sent over insecure transport" << dendl;
+    op_ret = -EPERM;
+    return;
+  }
+  ldpp_dout(this, 20) << "successfully got topics" << dendl;
+}
+
+// command (extension to AWS): 
+// POST
+// Action=GetTopic&TopicArn=<topic-arn>
+// get full information on a single topic (non-AWS extension)
+class RGWPSGetTopicOp : public RGWOp {
+  private:
+  std::string topic_name;
+  std::optional<RGWPubSub> ps;
+  rgw_pubsub_topic_subs result;
+  
+  // extract and validate the "TopicArn" argument; stores the topic name
+  int get_params() {
+    const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn")));
+
+    if (!topic_arn || topic_arn->resource.empty()) {
+        ldpp_dout(this, 1) << "GetTopic Action 'TopicArn' argument is missing or invalid" << dendl;
+        return -EINVAL;
+    }
+
+    topic_name = topic_arn->resource;
+    return 0;
+  }
+
+  public:
+  int verify_permission(optional_yield y) override {
+    return 0;
+  }
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "pubsub_topic_get"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_GET; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+
+  // serialize the "GetTopicResponse" XML reply.
+  // note: unlike the AWS-compliant replies above, this one is not
+  // wrapped in the SNS namespace (this is an extension op)
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section("GetTopicResponse");
+    f->open_object_section("GetTopicResult");
+    encode_xml("Topic", result.topic, f); 
+    f->close_section();
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f); 
+    f->close_section();
+    f->close_section();
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+void RGWPSGetTopicOp::execute(optional_yield y) {
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
+  op_ret = ps->get_topic(topic_name, &result);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  if (topic_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) {
+    ldpp_dout(this, 1) << "topic '" << topic_name << "' contain secret and cannot be sent over insecure transport" << dendl;
+    op_ret = -EPERM;
+    return;
+  }
+  ldpp_dout(this, 1) << "successfully got topic '" << topic_name << "'" << dendl;
+}
+
+// command (AWS compliant): 
+// POST
+// Action=GetTopicAttributes&TopicArn=<topic-arn>
+class RGWPSGetTopicAttributesOp : public RGWOp {
+  private:
+  std::string topic_name;
+  std::optional<RGWPubSub> ps;
+  rgw_pubsub_topic_subs result;
+  
+  int get_params() {
+    const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn")));
+
+    if (!topic_arn || topic_arn->resource.empty()) {
+        ldpp_dout(this, 1) << "GetTopicAttribute Action 'TopicArn' argument is missing or invalid" << dendl;
+        return -EINVAL;
+    }
+
+    topic_name = topic_arn->resource;
+    return 0;
+  }
+
+  public:
+  int verify_permission(optional_yield y) override {
+    return 0;
+  }
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "pubsub_topic_get"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_GET; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section_in_ns("GetTopicAttributesResponse", AWS_SNS_NS);
+    f->open_object_section("GetTopicAttributesResult");
+    result.topic.dump_xml_as_attributes(f);
+    f->close_section(); // GetTopicAttributesResult
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f); 
+    f->close_section(); // ResponseMetadata
+    f->close_section(); // GetTopicAttributesResponse
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+void RGWPSGetTopicAttributesOp::execute(optional_yield y) {
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
+  op_ret = ps->get_topic(topic_name, &result);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  if (topic_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) {
+    ldpp_dout(this, 1) << "topic '" << topic_name << "' contain secret and cannot be sent over insecure transport" << dendl;
+    op_ret = -EPERM;
+    return;
+  }
+  ldpp_dout(this, 1) << "successfully got topic '" << topic_name << "'" << dendl;
+}
+
+// command (AWS compliant): 
+// POST
+// Action=DeleteTopic&TopicArn=<topic-arn>
+// delete a topic (AWS SNS DeleteTopic)
+class RGWPSDeleteTopicOp : public RGWOp {
+  private:
+  std::string topic_name;
+  std::optional<RGWPubSub> ps;
+  
+  // extract/validate "TopicArn" and, as a side effect, remove the
+  // persistent-notification queue of the topic (best effort)
+  int get_params() {
+    const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn")));
+
+    if (!topic_arn || topic_arn->resource.empty()) {
+      ldpp_dout(this, 1) << "DeleteTopic Action 'TopicArn' argument is missing or invalid" << dendl;
+      return -EINVAL;
+    }
+
+    topic_name = topic_arn->resource;
+
+    // upon deletion it is not known if topic is persistent or not
+    // will try to delete the persistent topic anyway
+    const auto ret = rgw::notify::remove_persistent_topic(topic_name, s->yield);
+    if (ret == -ENOENT) {
+      // topic was not persistent, or already deleted
+      return 0;
+    }
+    if (ret < 0) {
+      ldpp_dout(this, 1) << "DeleteTopic Action failed to remove queue for persistent topics. error:" << ret << dendl;
+      return ret;
+    }
+
+    return 0;
+  }
+
+  public:
+  int verify_permission(optional_yield) override {
+    return 0;
+  }
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "pubsub_topic_delete"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_DELETE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
+
+  // serialize the SNS-style "DeleteTopicResponse" XML reply (metadata only)
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section_in_ns("DeleteTopicResponse", AWS_SNS_NS);
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f); 
+    f->close_section(); // ResponseMetadata
+    f->close_section(); // DeleteTopicResponse
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+void RGWPSDeleteTopicOp::execute(optional_yield y) {
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
+  op_ret = ps->remove_topic(this, topic_name, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to remove topic '" << topic_name << ", ret=" << op_ret << dendl;
+    return;
+  }
+  ldpp_dout(this, 1) << "successfully removed topic '" << topic_name << "'" << dendl;
+}
+
+namespace {
+// utility classes and functions for handling parameters with the following format:
+// Attributes.entry.{N}.{key|value}={VALUE}
+// N - any unsigned number
+// VALUE - url encoded string
+
+// and Attribute is holding key and value
+// ctor and set are done according to the "type" argument
+// if type is not "key" or "value" its a no-op
+class Attribute {
+  std::string key;
+  std::string value;
+public:
+  Attribute(const std::string& type, const std::string& key_or_value) {
+    set(type, key_or_value);
+  }
+  void set(const std::string& type, const std::string& key_or_value) {
+    if (type == "key") {
+      key = key_or_value;
+    } else if (type == "value") {
+      value = key_or_value;
+    }
+  }
+  const std::string& get_key() const { return key; }
+  const std::string& get_value() const { return value; }
+};
+
+using AttributeMap = std::map<unsigned, Attribute>;
+
+// aggregate the attributes into a map
+// the key and value are associated by the index (N)
+// no assumptions are made on the order in which these parameters are added
+void update_attribute_map(const std::string& input, AttributeMap& map) {
+  const boost::char_separator<char> sep(".");
+  const boost::tokenizer tokens(input, sep);
+  auto token = tokens.begin();
+  if (*token != "Attributes") {
+      return;
+  }
+  ++token;
+
+  if (*token != "entry") {
+      return;
+  }
+  ++token;
+
+  unsigned idx;
+  try {
+    idx = std::stoul(*token);
+  } catch (const std::invalid_argument&) {
+    return;
+  }
+  ++token;
+
+  std::string key_or_value = "";
+  // get the rest of the string regardless of dots
+  // this is to allow dots in the value
+  while (token != tokens.end()) {
+    key_or_value.append(*token+".");
+    ++token;
+  }
+  // remove last separator
+  key_or_value.pop_back();
+
+  auto pos = key_or_value.find("=");
+  if (pos != std::string::npos) {
+    const auto key_or_value_lhs = key_or_value.substr(0, pos);
+    const auto key_or_value_rhs = url_decode(key_or_value.substr(pos + 1, key_or_value.size() - 1));
+    const auto map_it = map.find(idx);
+    if (map_it == map.end()) {
+      // new entry
+      map.emplace(std::make_pair(idx, Attribute(key_or_value_lhs, key_or_value_rhs)));
+    } else {
+      // existing entry
+      map_it->second.set(key_or_value_lhs, key_or_value_rhs);
+    }
+  }
+}
+}
+
+// parse the url-encoded POST body and copy its parameters into s->info.args:
+// "Action", "Name" and "TopicArn" are copied directly; "Attributes.entry.N.*"
+// pairs are aggregated via update_attribute_map() and then appended
+void RGWHandler_REST_PSTopic_AWS::rgw_topic_parse_input() {
+  if (post_body.size() > 0) {
+    ldpp_dout(s, 10) << "Content of POST: " << post_body << dendl;
+
+    if (post_body.find("Action") != std::string::npos) {
+      const boost::char_separator<char> sep("&");
+      const boost::tokenizer<boost::char_separator<char>> tokens(post_body, sep);
+      AttributeMap map;
+      for (const auto& t : tokens) {
+        auto pos = t.find("=");
+        if (pos != std::string::npos) {
+          const auto key = t.substr(0, pos);
+          if (key == "Action") {
+            s->info.args.append(key, t.substr(pos + 1, t.size() - 1));
+          } else if (key == "Name" || key == "TopicArn") {
+            const auto value = url_decode(t.substr(pos + 1, t.size() - 1));
+            s->info.args.append(key, value);
+          } else {
+            update_attribute_map(t, map);
+          }
+        }
+      }
+      // update the regular args with the content of the attribute map
+      for (const auto& attr : map) {
+          s->info.args.append(attr.second.get_key(), attr.second.get_value());
+      }
+    }
+    // the body was consumed here, so the v4 auth payload hash is precomputed
+    const auto payload_hash = rgw::auth::s3::calc_v4_payload_hash(post_body);
+    s->info.args.append("PayloadHash", payload_hash);
+  }
+}
+
+RGWOp* RGWHandler_REST_PSTopic_AWS::op_post() {
+  rgw_topic_parse_input();
+
+  if (s->info.args.exists("Action")) {
+    const auto action = s->info.args.get("Action");
+    if (action.compare("CreateTopic") == 0)
+      return new RGWPSCreateTopicOp();
+    if (action.compare("DeleteTopic") == 0)
+      return new RGWPSDeleteTopicOp;
+    if (action.compare("ListTopics") == 0)
+      return new RGWPSListTopicsOp();
+    if (action.compare("GetTopic") == 0)
+      return new RGWPSGetTopicOp();
+    if (action.compare("GetTopicAttributes") == 0)
+      return new RGWPSGetTopicAttributesOp();
+  }
+
+  return nullptr;
+}
+
+// delegate authorization to the standard S3 auth path
+int RGWHandler_REST_PSTopic_AWS::authorize(const DoutPrefixProvider* dpp, optional_yield y) {
+  return RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y);
+}
+
+namespace {
+// return a unique topic by prefexing with the notification name: <notification>_<topic>
+std::string topic_to_unique(const std::string& topic, const std::string& notification) {
+  return notification + "_" + topic;
+}
+
+// extract the topic from a unique topic of the form: <notification>_<topic>
+[[maybe_unused]] std::string unique_to_topic(const std::string& unique_topic, const std::string& notification) {
+  if (unique_topic.find(notification + "_") == std::string::npos) {
+    return "";
+  }
+  return unique_topic.substr(notification.length() + 1);
+}
+
+// from list of bucket topics, find the one that was auto-generated by a notification
+auto find_unique_topic(const rgw_pubsub_bucket_topics& bucket_topics, const std::string& notif_name) {
+    auto it = std::find_if(bucket_topics.topics.begin(), bucket_topics.topics.end(), [&](const auto& val) { return notif_name == val.second.s3_id; });
+    return it != bucket_topics.topics.end() ?
+        std::optional<std::reference_wrapper<const rgw_pubsub_topic_filter>>(it->second):
+        std::nullopt;
+}
+}
+
+// remove a notification and its auto-generated topic.
+// both removals are attempted even if the first fails; note that a failure
+// from remove_notification() is overwritten by the result of remove_topic()
+int remove_notification_by_topic(const DoutPrefixProvider *dpp, const std::string& topic_name, const RGWPubSub::BucketRef& b, optional_yield y, RGWPubSub& ps) {
+  int op_ret = b->remove_notification(dpp, topic_name, y);
+  if (op_ret < 0) {
+    ldpp_dout(dpp, 1) << "failed to remove notification of topic '" << topic_name << "', ret=" << op_ret << dendl;
+  }
+  op_ret = ps.remove_topic(dpp, topic_name, y);
+  if (op_ret < 0) {
+    ldpp_dout(dpp, 1) << "failed to remove auto-generated topic '" << topic_name << "', ret=" << op_ret << dendl;
+  }
+  return op_ret;
+}
+
+// delete all notifications on a bucket, stopping at the first failure
+int delete_all_notifications(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& bucket_topics, const RGWPubSub::BucketRef& b, optional_yield y, RGWPubSub& ps) {
+  // delete all notifications on a bucket
+  for (const auto& topic : bucket_topics.topics) {
+    const auto op_ret = remove_notification_by_topic(dpp, topic.first, b, y, ps);
+    if (op_ret < 0) {
+      return op_ret;
+    }
+  }
+  return 0;
+}
+
+// command (S3 compliant): PUT /<bucket name>?notification
+// a "notification" and a subscription will be auto-generated
+// actual configuration is XML encoded in the body of the message
+class RGWPSCreateNotifOp : public RGWDefaultResponseOp {
+  private:
+  std::optional<RGWPubSub> ps;
+  std::string bucket_name;
+  RGWBucketInfo bucket_info;       // populated in verify_permission()
+  rgw_pubsub_s3_notifications configurations;  // decoded from the XML body
+
+  // validate the (value-less) "notification" query param and capture the bucket name
+  int get_params() {
+    bool exists;
+    const auto no_value = s->info.args.get("notification", &exists);
+    if (!exists) {
+      ldpp_dout(this, 1) << "missing required param 'notification'" << dendl;
+      return -EINVAL;
+    } 
+    if (no_value.length() > 0) {
+      ldpp_dout(this, 1) << "param 'notification' should not have any value" << dendl;
+      return -EINVAL;
+    }
+    if (s->bucket_name.empty()) {
+      ldpp_dout(this, 1) << "request must be on a bucket" << dendl;
+      return -EINVAL;
+    }
+    bucket_name = s->bucket_name;
+    return 0;
+  }
+
+  public:
+  int verify_permission(optional_yield y) override;
+
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+
+  const char* name() const override { return "pubsub_notification_create_s3"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_CREATE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+
+  // read and XML-decode the NotificationConfiguration from the request body
+  int get_params_from_body() {
+    const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+    int r;
+    bufferlist data;
+    std::tie(r, data) = read_all_input(s, max_size, false);
+
+    if (r < 0) {
+      ldpp_dout(this, 1) << "failed to read XML payload" << dendl;
+      return r;
+    }
+    if (data.length() == 0) {
+      ldpp_dout(this, 1) << "XML payload missing" << dendl;
+      return -EINVAL;
+    }
+
+    RGWXMLDecoder::XMLParser parser;
+
+    if (!parser.init()){
+      ldpp_dout(this, 1) << "failed to initialize XML parser" << dendl;
+      return -EINVAL;
+    }
+    if (!parser.parse(data.c_str(), data.length(), 1)) {
+      ldpp_dout(this, 1) << "failed to parse XML payload" << dendl;
+      return -ERR_MALFORMED_XML;
+    }
+    try {
+      // NotificationConfigurations is mandatory
+      // It can be empty which means we delete all the notifications
+      RGWXMLDecoder::decode_xml("NotificationConfiguration", configurations, &parser, true);
+    } catch (RGWXMLDecoder::err& err) {
+      ldpp_dout(this, 1) << "failed to parse XML payload. error: " << err << dendl;
+      return -ERR_MALFORMED_XML;
+    }
+    return 0;
+  }
+
+  void execute(optional_yield) override;
+};
+
+void RGWPSCreateNotifOp::execute(optional_yield y) {
+  op_ret = get_params_from_body();
+  if (op_ret < 0) {
+    return;
+  }
+
+  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
+  auto b = ps->get_bucket(bucket_info.bucket);
+  ceph_assert(b);
+
+  if(configurations.list.empty()) {
+    // get all topics on a bucket
+    rgw_pubsub_bucket_topics bucket_topics;
+    op_ret = b->get_topics(&bucket_topics);
+    if (op_ret < 0) {
+      ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << bucket_info.bucket.name << "', ret=" << op_ret << dendl;
+      return;
+    }
+
+    op_ret = delete_all_notifications(this, bucket_topics, b, y, *ps);
+    return;
+  }
+
+  for (const auto& c : configurations.list) {
+    const auto& notif_name = c.id;
+    if (notif_name.empty()) {
+      ldpp_dout(this, 1) << "missing notification id" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    if (c.topic_arn.empty()) {
+      ldpp_dout(this, 1) << "missing topic ARN in notification: '" << notif_name << "'" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    const auto arn = rgw::ARN::parse(c.topic_arn);
+    if (!arn || arn->resource.empty()) {
+      ldpp_dout(this, 1) << "topic ARN has invalid format: '" << c.topic_arn << "' in notification: '" << notif_name << "'" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    if (std::find(c.events.begin(), c.events.end(), rgw::notify::UnknownEvent) != c.events.end()) {
+      ldpp_dout(this, 1) << "unknown event type in notification: '" << notif_name << "'" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    const auto topic_name = arn->resource;
+
+    // get topic information. destination information is stored in the topic
+    rgw_pubsub_topic topic_info;  
+    op_ret = ps->get_topic(topic_name, &topic_info);
+    if (op_ret < 0) {
+      ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl;
+      return;
+    }
+    // make sure that full topic configuration match
+    // TODO: use ARN match function
+    
+    // create unique topic name. this has 2 reasons:
+    // (1) topics cannot be shared between different S3 notifications because they hold the filter information
+    // (2) make topic clneaup easier, when notification is removed
+    const auto unique_topic_name = topic_to_unique(topic_name, notif_name);
+    // generate the internal topic. destination is stored here for the "push-only" case
+    // when no subscription exists
+    // ARN is cached to make the "GET" method faster
+    op_ret = ps->create_topic(this, unique_topic_name, topic_info.dest, topic_info.arn, topic_info.opaque_data, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 1) << "failed to auto-generate unique topic '" << unique_topic_name << 
+        "', ret=" << op_ret << dendl;
+      return;
+    }
+    ldpp_dout(this, 20) << "successfully auto-generated unique topic '" << unique_topic_name << "'" << dendl;
+    // generate the notification
+    rgw::notify::EventTypeList events;
+    op_ret = b->create_notification(this, unique_topic_name, c.events, std::make_optional(c.filter), notif_name, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 1) << "failed to auto-generate notification for unique topic '" << unique_topic_name <<
+        "', ret=" << op_ret << dendl;
+      // rollback generated topic (ignore return value)
+      ps->remove_topic(this, unique_topic_name, y);
+      return;
+    }
+    ldpp_dout(this, 20) << "successfully auto-generated notification for unique topic '" << unique_topic_name << "'" << dendl;
+  }
+}
+
+// only the bucket owner may create notifications on the bucket;
+// also caches bucket_info for execute()
+int RGWPSCreateNotifOp::verify_permission(optional_yield y) {
+  int ret = get_params();
+  if (ret < 0) {
+    return ret;
+  }
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(s->owner.get_id());
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  ret = driver->get_bucket(this, user.get(), s->owner.get_id().tenant, bucket_name, &bucket, y);
+  if (ret < 0) {
+    ldpp_dout(this, 1) << "failed to get bucket info, cannot verify ownership" << dendl;
+    return ret;
+  }
+  bucket_info = bucket->get_info();
+
+  if (bucket_info.owner != s->owner.get_id()) {
+    ldpp_dout(this, 1) << "user doesn't own bucket, not allowed to create notification" << dendl;
+    return -EPERM;
+  }
+  return 0;
+}
+
+// command (extension to S3): DELETE /bucket?notification[=<notification-id>]
+class RGWPSDeleteNotifOp : public RGWDefaultResponseOp {
+  private:
+  std::optional<RGWPubSub> ps;
+  std::string bucket_name;
+  RGWBucketInfo bucket_info;  // populated in verify_permission()
+  std::string notif_name;     // empty means "delete all notifications"
+  
+  public:
+  int verify_permission(optional_yield y) override;
+
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  
+  const char* name() const override { return "pubsub_notification_delete_s3"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_DELETE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
+
+  // validate the "notification" query param (its value, if any, is the
+  // notification id) and capture the bucket name
+  int get_params() {
+    bool exists;
+    notif_name = s->info.args.get("notification", &exists);
+    if (!exists) {
+      ldpp_dout(this, 1) << "missing required param 'notification'" << dendl;
+      return -EINVAL;
+    } 
+    if (s->bucket_name.empty()) {
+      ldpp_dout(this, 1) << "request must be on a bucket" << dendl;
+      return -EINVAL;
+    }
+    bucket_name = s->bucket_name;
+    return 0;
+  }
+
+  void execute(optional_yield y) override;
+};
+
+// delete one notification (by id) or all notifications of the bucket
+void RGWPSDeleteNotifOp::execute(optional_yield y) {
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
+  auto b = ps->get_bucket(bucket_info.bucket);
+  ceph_assert(b);
+
+  // get all topics on a bucket
+  rgw_pubsub_bucket_topics bucket_topics;
+  op_ret = b->get_topics(&bucket_topics);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << bucket_info.bucket.name << "', ret=" << op_ret << dendl;
+    return;
+  }
+
+  if (!notif_name.empty()) {
+    // delete a specific notification
+    const auto unique_topic = find_unique_topic(bucket_topics, notif_name);
+    if (unique_topic) {
+      const auto unique_topic_name = unique_topic->get().topic.name;
+      op_ret = remove_notification_by_topic(this, unique_topic_name, b, y, *ps);
+      return;
+    }
+    // notification to be removed is not found - considered success
+    ldpp_dout(this, 20) << "notification '" << notif_name << "' already removed" << dendl;
+    return;
+  }
+
+  // no notification id given: delete everything on the bucket
+  op_ret = delete_all_notifications(this, bucket_topics, b, y, *ps);
+}
+
+// only the bucket owner may remove notifications; caches bucket_info
+int RGWPSDeleteNotifOp::verify_permission(optional_yield y) {
+  int ret = get_params();
+  if (ret < 0) {
+    return ret;
+  }
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(s->owner.get_id());
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  ret = driver->get_bucket(this, user.get(), s->owner.get_id().tenant, bucket_name, &bucket, y);
+  if (ret < 0) {
+    return ret;
+  }
+  bucket_info = bucket->get_info();
+
+  if (bucket_info.owner != s->owner.get_id()) {
+    ldpp_dout(this, 1) << "user doesn't own bucket, cannot remove notification" << dendl;
+    return -EPERM;
+  }
+  return 0;
+}
+
+// command (S3 compliant): GET /bucket?notification[=<notification-id>]
+class RGWPSListNotifsOp : public RGWOp {
+private:
+  std::string bucket_name;
+  RGWBucketInfo bucket_info;  // populated in verify_permission()
+  std::optional<RGWPubSub> ps;
+  std::string notif_name;     // empty means "list all notifications"
+  rgw_pubsub_s3_notifications notifications;  // filled by execute()
+
+  // validate the "notification" query param and capture the bucket name
+  int get_params() {
+    bool exists;
+    notif_name = s->info.args.get("notification", &exists);
+    if (!exists) {
+      ldpp_dout(this, 1) << "missing required param 'notification'" << dendl;
+      return -EINVAL;
+    } 
+    if (s->bucket_name.empty()) {
+      ldpp_dout(this, 1) << "request must be on a bucket" << dendl;
+      return -EINVAL;
+    }
+    bucket_name = s->bucket_name;
+    return 0;
+  }
+
+  public:
+  int verify_permission(optional_yield y) override;
+
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+
+  const char* name() const override { return "pubsub_notifications_get_s3"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_LIST; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+
+  void execute(optional_yield y) override;
+  // serialize the collected notifications as S3 NotificationConfiguration XML
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+    notifications.dump_xml(s->formatter);
+    rgw_flush_formatter_and_reset(s, s->formatter);
+  }
+};
+
+// collect one notification (by id) or all s3 notifications of the bucket
+void RGWPSListNotifsOp::execute(optional_yield y) {
+  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
+  auto b = ps->get_bucket(bucket_info.bucket);
+  ceph_assert(b);
+  
+  // get all topics on a bucket
+  rgw_pubsub_bucket_topics bucket_topics;
+  op_ret = b->get_topics(&bucket_topics);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << bucket_info.bucket.name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  if (!notif_name.empty()) {
+    // get info of a specific notification
+    const auto unique_topic = find_unique_topic(bucket_topics, notif_name);
+    if (unique_topic) {
+      notifications.list.emplace_back(unique_topic->get());
+      return;
+    }
+    op_ret = -ENOENT;
+    ldpp_dout(this, 1) << "failed to get notification info for '" << notif_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  // loop through all topics of the bucket
+  for (const auto& topic : bucket_topics.topics) {
+    if (topic.second.s3_id.empty()) {
+        // not an s3 notification
+        continue;
+    }
+    notifications.list.emplace_back(topic.second);
+  }
+}
+
+// only the bucket owner may list notifications; caches bucket_info
+int RGWPSListNotifsOp::verify_permission(optional_yield y) {
+  int ret = get_params();
+  if (ret < 0) {
+    return ret;
+  }
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(s->owner.get_id());
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  ret = driver->get_bucket(this, user.get(), s->owner.get_id().tenant, bucket_name, &bucket, y);
+  if (ret < 0) {
+    return ret;
+  }
+  bucket_info = bucket->get_info();
+
+  if (bucket_info.owner != s->owner.get_id()) {
+    ldpp_dout(this, 1) << "user doesn't own bucket, cannot get notification list" << dendl;
+    return -EPERM;
+  }
+
+  return 0;
+}
+
+// GET /<bucket>?notification[=<id>] - list notifications
+RGWOp* RGWHandler_REST_PSNotifs_S3::op_get() {
+  return new RGWPSListNotifsOp();
+}
+
+// PUT /<bucket>?notification - create/replace notifications
+RGWOp* RGWHandler_REST_PSNotifs_S3::op_put() {
+  return new RGWPSCreateNotifOp();
+}
+
+// DELETE /<bucket>?notification[=<id>] - remove notifications
+RGWOp* RGWHandler_REST_PSNotifs_S3::op_delete() {
+  return new RGWPSDeleteNotifOp();
+}
+
+// static factory used when another REST handler dispatches the op;
+// indentation normalized to match the sibling factories
+RGWOp* RGWHandler_REST_PSNotifs_S3::create_get_op() {
+  return new RGWPSListNotifsOp();
+}
+
+// static factory used when another REST handler dispatches the op
+RGWOp* RGWHandler_REST_PSNotifs_S3::create_put_op() {
+  return new RGWPSCreateNotifOp();
+}
+
+// static factory used when another REST handler dispatches the op
+RGWOp* RGWHandler_REST_PSNotifs_S3::create_delete_op() {
+  return new RGWPSDeleteNotifOp();
+}
+
diff --git a/src/rgw/driver/rados/rgw_rest_pubsub.h b/src/rgw/driver/rados/rgw_rest_pubsub.h
new file mode 100644 (file)
index 0000000..3b1a1bc
--- /dev/null
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include "rgw_rest_s3.h"
+
+// s3 compliant notification handler factory
+class RGWHandler_REST_PSNotifs_S3 : public RGWHandler_REST_S3 {
+protected:
+  // permissions are checked by the individual ops, not the handler
+  int init_permissions(RGWOp* op, optional_yield y) override {return 0;}
+  int read_permissions(RGWOp* op, optional_yield y) override {return 0;}
+  bool supports_quota() override {return false;}
+  RGWOp* op_get() override;
+  RGWOp* op_put() override;
+  RGWOp* op_delete() override;
+public:
+  using RGWHandler_REST_S3::RGWHandler_REST_S3;
+  virtual ~RGWHandler_REST_PSNotifs_S3() = default;
+  // following are used to generate the operations when invoked by another REST handler
+  static RGWOp* create_get_op();
+  static RGWOp* create_put_op();
+  static RGWOp* create_delete_op();
+};
+
+// AWS compliant topics handler factory
+class RGWHandler_REST_PSTopic_AWS : public RGWHandler_REST {
+  const rgw::auth::StrategyRegistry& auth_registry;
+  // reference to the request's POST body; must outlive the handler
+  const std::string& post_body;
+  // parse the url-encoded POST body into s->info.args
+  void rgw_topic_parse_input();
+protected:
+  RGWOp* op_post() override;
+public:
+  RGWHandler_REST_PSTopic_AWS(const rgw::auth::StrategyRegistry& _auth_registry, const std::string& _post_body) : 
+      auth_registry(_auth_registry),
+      post_body(_post_body) {}
+  virtual ~RGWHandler_REST_PSTopic_AWS() = default;
+  int postauth_init(optional_yield) override { return 0; }
+  int authorize(const DoutPrefixProvider* dpp, optional_yield y) override;
+};
diff --git a/src/rgw/driver/rados/rgw_rest_realm.cc b/src/rgw/driver/rados/rgw_rest_realm.cc
new file mode 100644 (file)
index 0000000..79640a2
--- /dev/null
@@ -0,0 +1,376 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+#include "rgw_rest_realm.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_config.h"
+#include "rgw_zone.h"
+#include "rgw_sal_rados.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_mdlog.h"
+
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// reject 'period push' if we would have to fetch too many intermediate periods
+static const uint32_t PERIOD_HISTORY_FETCH_MAX = 64;
+
+// base period op, shared between Get and Post
+class RGWOp_Period_Base : public RGWRESTOp {
+ protected:
+  RGWPeriod period;                  // period object returned to the client on success
+  std::ostringstream error_stream;   // human-readable error detail for the reply
+ public:
+  // base class allows everything; derived ops override this with cap checks
+  int verify_permission(optional_yield) override { return 0; }
+  void send_response() override;
+};
+
+// reply with the period object on success
+void RGWOp_Period_Base::send_response()
+{
+  set_req_state_err(s, op_ret, error_stream.str());
+  dump_errno(s);
+
+  if (op_ret >= 0) {
+    // success: serialize the period as the json response body
+    encode_json("period", period, s->formatter);
+    end_header(s, NULL, "application/json", s->formatter->get_len());
+    flusher.flush();
+    return;
+  }
+
+  // failure: log the message (if any) and send headers only
+  if (!s->err.message.empty()) {
+    ldpp_dout(this, 4) << "Request failed with " << op_ret
+        << ": " << s->err.message << dendl;
+  }
+  end_header(s);
+}
+
+// GET /admin/realm/period
+class RGWOp_Period_Get : public RGWOp_Period_Base {
+ public:
+  void execute(optional_yield y) override;
+  // reading a period requires the "zone" read capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("zone", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_caps());
+  }
+  const char* name() const override { return "get_period"; }
+};
+
+void RGWOp_Period_Get::execute(optional_yield y)
+{
+  string realm_id, realm_name, period_id;
+  epoch_t epoch = 0;
+  RESTArgs::get_string(s, "realm_id", realm_id, &realm_id);
+  RESTArgs::get_string(s, "realm_name", realm_name, &realm_name);
+  RESTArgs::get_string(s, "period_id", period_id, &period_id);
+  RESTArgs::get_uint32(s, "epoch", 0, &epoch);
+
+  period.set_id(period_id);
+  period.set_epoch(epoch);
+
+  op_ret = period.init(this, driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm_id, y, realm_name);
+  if (op_ret < 0)
+    ldpp_dout(this, 5) << "failed to read period" << dendl;
+}
+
+// POST /admin/realm/period
+class RGWOp_Period_Post : public RGWOp_Period_Base {
+ public:
+  void execute(optional_yield y) override;
+  // pushing/committing a period requires the "zone" write capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("zone", RGW_CAP_WRITE);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_caps());
+  }
+  const char* name() const override { return "post_period"; }
+  RGWOpType get_type() override { return RGW_OP_PERIOD_POST; }
+};
+
+// Handle a period POSTed by another zone. An empty period id means
+// 'period commit' (master only); otherwise the period is stored locally
+// and, depending on its realm epoch, attached to the period history,
+// made current, or merely recorded as a newer epoch of the current period.
+void RGWOp_Period_Post::execute(optional_yield y)
+{
+  auto cct = driver->ctx();
+
+  // initialize the period without reading from rados
+  period.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y, false);
+
+  // decode the period from input
+  const auto max_size = cct->_conf->rgw_max_put_param_size;
+  bool empty;
+  op_ret = get_json_input(cct, s, period, max_size, &empty);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to decode period" << dendl;
+    return;
+  }
+
+  // require period.realm_id to match our realm
+  if (period.get_realm() != static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_realm().get_id()) {
+    error_stream << "period with realm id " << period.get_realm()
+        << " doesn't match current realm " << static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_realm().get_id() << std::endl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // load the realm and current period from rados; there may be a more recent
+  // period that we haven't restarted with yet. we also don't want to modify
+  // the objects in use by RGWRados
+  RGWRealm realm(period.get_realm());
+  op_ret = realm.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to read current realm: "
+        << cpp_strerror(-op_ret) << dendl;
+    return;
+  }
+
+  RGWPeriod current_period;
+  op_ret = current_period.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm.get_id(), y);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to read current period: "
+        << cpp_strerror(-op_ret) << dendl;
+    return;
+  }
+
+  // if period id is empty, handle as 'period commit'
+  if (period.get_id().empty()) {
+    op_ret = period.commit(this, driver, realm, current_period, error_stream, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, -1) << "master zone failed to commit period" << dendl;
+    }
+    return;
+  }
+
+  // if it's not period commit, nobody is allowed to push to the master zone
+  if (period.get_master_zone() == static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone_params().get_id()) {
+    ldpp_dout(this, 10) << "master zone rejecting period id="
+        << period.get_id() << " epoch=" << period.get_epoch() << dendl;
+    op_ret = -EINVAL; // XXX: error code
+    return;
+  }
+
+  // write the period to rados
+  op_ret = period.store_info(this, false, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to store period " << period.get_id() << dendl;
+    return;
+  }
+  // set as latest epoch
+  op_ret = period.update_latest_epoch(this, period.get_epoch(), y);
+  if (op_ret == -EEXIST) {
+    // already have this epoch (or a more recent one)
+    ldpp_dout(this, 4) << "already have epoch >= " << period.get_epoch()
+        << " for period " << period.get_id() << dendl;
+    op_ret = 0;
+    return;
+  }
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to set latest epoch" << dendl;
+    return;
+  }
+
+  // the local history of periods, used below to decide how to apply this one
+  auto period_history = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->get_period_history();
+
+  // decide whether we can set_current_period() or set_latest_epoch()
+  if (period.get_id() != current_period.get_id()) {
+    auto current_epoch = current_period.get_realm_epoch();
+    // discard periods in the past
+    if (period.get_realm_epoch() < current_epoch) {
+      ldpp_dout(this, 10) << "discarding period " << period.get_id()
+          << " with realm epoch " << period.get_realm_epoch()
+          << " older than current epoch " << current_epoch << dendl;
+      // return success to ack that we have this period
+      return;
+    }
+    // discard periods too far in the future
+    if (period.get_realm_epoch() > current_epoch + PERIOD_HISTORY_FETCH_MAX) {
+      ldpp_dout(this, -1) << "discarding period " << period.get_id()
+          << " with realm epoch " << period.get_realm_epoch() << " too far in "
+          "the future from current epoch " << current_epoch << dendl;
+      op_ret = -ENOENT; // XXX: error code
+      return;
+    }
+    // attach a copy of the period into the period history
+    auto cursor = period_history->attach(this, RGWPeriod{period}, y);
+    if (!cursor) {
+      // we're missing some history between the new period and current_period
+      op_ret = cursor.get_error();
+      ldpp_dout(this, -1) << "failed to collect the periods between current period "
+          << current_period.get_id() << " (realm epoch " << current_epoch
+          << ") and the new period " << period.get_id()
+          << " (realm epoch " << period.get_realm_epoch()
+          << "): " << cpp_strerror(-op_ret) << dendl;
+      return;
+    }
+    if (cursor.has_next()) {
+      // don't switch if we have a newer period in our history
+      ldpp_dout(this, 4) << "attached period " << period.get_id()
+          << " to history, but the history contains newer periods" << dendl;
+      return;
+    }
+    // set as current period
+    op_ret = realm.set_current_period(this, period, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, -1) << "failed to update realm's current period" << dendl;
+      return;
+    }
+    ldpp_dout(this, 4) << "period " << period.get_id()
+        << " is newer than current period " << current_period.get_id()
+        << ", updating realm's current period and notifying zone" << dendl;
+    realm.notify_new_period(this, period, y);
+    return;
+  }
+  // reflect the period into our local objects
+  op_ret = period.reflect(this, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to update local objects: "
+        << cpp_strerror(-op_ret) << dendl;
+    return;
+  }
+  ldpp_dout(this, 4) << "period epoch " << period.get_epoch()
+      << " is newer than current epoch " << current_period.get_epoch()
+      << ", updating period's latest epoch and notifying zone" << dendl;
+  realm.notify_new_period(this, period, y);
+  // update the period history
+  period_history->insert(RGWPeriod{period});
+}
+
+// /admin/realm/period: GET fetches a period, POST pushes or commits one
+class RGWHandler_Period : public RGWHandler_Auth_S3 {
+ protected:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+
+  RGWOp *op_get() override { return new RGWOp_Period_Get; }
+  RGWOp *op_post() override { return new RGWOp_Period_Post; }
+};
+
+class RGWRESTMgr_Period : public RGWRESTMgr {
+ public:
+  // allocate a fresh handler per request; the framework owns the result
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+                              req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override {
+    return new RGWHandler_Period(auth_registry);
+  }
+};
+
+
+// GET /admin/realm
+class RGWOp_Realm_Get : public RGWRESTOp {
+  std::unique_ptr<RGWRealm> realm;  // populated by execute(), emitted by send_response()
+public:
+  // reading a realm requires the "zone" read capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("zone", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override { return "get_realm"; }
+};
+
+// look up a realm by id and/or name (both optional; both empty selects the
+// default realm per RGWRealm::init semantics)
+void RGWOp_Realm_Get::execute(optional_yield y)
+{
+  string id, name;
+  RESTArgs::get_string(s, "id", id, &id);
+  RESTArgs::get_string(s, "name", name, &name);
+
+  // read realm
+  realm = std::make_unique<RGWRealm>(id, name);
+  auto* store = static_cast<rgw::sal::RadosStore*>(driver);
+  op_ret = realm->init(this, g_ceph_context, store->svc()->sysobj, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to read realm id=" << id
+        << " name=" << name << dendl;
+  }
+}
+
+void RGWOp_Realm_Get::send_response()
+{
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+
+  if (op_ret >= 0) {
+    // success: emit the realm as the json body
+    encode_json("realm", *realm, s->formatter);
+    end_header(s, NULL, "application/json", s->formatter->get_len());
+    flusher.flush();
+    return;
+  }
+
+  // failure: status line and headers only
+  end_header(s);
+}
+
+// GET /admin/realm?list
+class RGWOp_Realm_List : public RGWRESTOp {
+  std::string default_id;            // id of the default realm (empty if none found)
+  std::list<std::string> realms;     // names/ids returned by list_realms()
+public:
+  // listing realms requires the "zone" read capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("zone", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override { return "list_realms"; }
+};
+
+void RGWOp_Realm_List::execute(optional_yield y)
+{
+  {
+    // read default realm
+    // scoped so the temporary RGWRealm is destroyed before listing;
+    // failure is deliberately ignored — default_id just stays empty
+    RGWRealm realm(driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj);
+    [[maybe_unused]] int ret = realm.read_default_id(this, default_id, y);
+  }
+  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->list_realms(this, realms);
+  if (op_ret < 0)
+    ldpp_dout(this, -1) << "failed to list realms" << dendl;
+}
+
+void RGWOp_Realm_List::send_response()
+{
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+
+  if (op_ret >= 0) {
+    // success: {"default_info": ..., "realms": [...]} inside realms_list
+    s->formatter->open_object_section("realms_list");
+    encode_json("default_info", default_id, s->formatter);
+    encode_json("realms", realms, s->formatter);
+    s->formatter->close_section();
+    end_header(s, NULL, "application/json", s->formatter->get_len());
+    flusher.flush();
+    return;
+  }
+
+  // failure: status line and headers only
+  end_header(s);
+}
+
+class RGWHandler_Realm : public RGWHandler_Auth_S3 {
+protected:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  // GET /admin/realm?list enumerates realms; plain GET reads a single realm
+  RGWOp *op_get() override {
+    if (s->info.args.sub_resource_exists("list"))
+      return new RGWOp_Realm_List;
+    return new RGWOp_Realm_Get;
+  }
+};
+
+// register nested resources handled under /admin/realm
+RGWRESTMgr_Realm::RGWRESTMgr_Realm()
+{
+  // add the /admin/realm/period resource
+  register_resource("period", new RGWRESTMgr_Period);
+}
+
+// allocate a fresh realm handler per request; the framework owns the result
+RGWHandler_REST*
+RGWRESTMgr_Realm::get_handler(rgw::sal::Driver* driver,
+                             req_state*,
+                              const rgw::auth::StrategyRegistry& auth_registry,
+                              const std::string&)
+{
+  return new RGWHandler_Realm(auth_registry);
+}
diff --git a/src/rgw/driver/rados/rgw_rest_realm.h b/src/rgw/driver/rados/rgw_rest_realm.h
new file mode 100644 (file)
index 0000000..a0d1dc1
--- /dev/null
@@ -0,0 +1,16 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+
+class RGWRESTMgr_Realm : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Realm();
+
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+                              req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override;
+};
diff --git a/src/rgw/driver/rados/rgw_rest_user.cc b/src/rgw/driver/rados/rgw_rest_user.cc
new file mode 100644 (file)
index 0000000..c2aeece
--- /dev/null
@@ -0,0 +1,1109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/ceph_json.h"
+
+#include "rgw_op.h"
+#include "rgw_user.h"
+#include "rgw_rest_user.h"
+#include "rgw_sal.h"
+
+#include "include/str_list.h"
+#include "include/ceph_assert.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "rgw_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// list users, paged via "marker"/"max-entries" rest args
+class RGWOp_User_List : public RGWRESTOp {
+
+public:
+  RGWOp_User_List() {}
+
+  // listing users requires the "users" read capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_READ);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "list_user"; }
+};
+
+void RGWOp_User_List::execute(optional_yield y)
+{
+  RGWUserAdminOpState op_state(driver);
+
+  // paging controls: resume after 'marker', cap results at 'max-entries'
+  std::string marker;
+  RESTArgs::get_string(s, "marker", marker, &marker);
+  uint32_t max_entries;
+  RESTArgs::get_uint32(s, "max-entries", 1000, &max_entries);
+
+  op_state.marker = marker;
+  op_state.max_entries = max_entries;
+
+  op_ret = RGWUserAdminOp_User::list(this, driver, op_state, flusher);
+}
+
+// fetch a single user's info by uid or access key
+class RGWOp_User_Info : public RGWRESTOp {
+
+public:
+  RGWOp_User_Info() {}
+
+  // reading user info requires the "users" read capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_READ);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_user_info"; }
+};
+
+void RGWOp_User_Info::execute(optional_yield y)
+{
+  RGWUserAdminOpState op_state(driver);
+
+  std::string uid_str, access_key_str;
+  bool fetch_stats;
+  bool sync_stats;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "access-key", access_key_str, &access_key_str);
+
+  // if uid was not supplied in rest argument, error out now, otherwise we'll
+  // end up initializing anonymous user, for which keys.init will eventually
+  // return -EACESS
+  if (uid_str.empty() && access_key_str.empty()){
+    op_ret=-EINVAL;
+    return;
+  }
+
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_bool(s, "stats", false, &fetch_stats);
+
+  RESTArgs::get_bool(s, "sync", false, &sync_stats);
+
+  op_state.set_user_id(uid);
+  op_state.set_access_key(access_key_str);
+  op_state.set_fetch_stats(fetch_stats);
+  op_state.set_sync_stats(sync_stats);
+
+  op_ret = RGWUserAdminOp_User::info(s, driver, op_state, flusher, y);
+}
+
+// create a new user from rest args
+class RGWOp_User_Create : public RGWRESTOp {
+
+public:
+  RGWOp_User_Create() {}
+
+  // creating users requires the "users" write capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "create_user"; }
+};
+
+// Create a user from the request's query parameters. The request is
+// forwarded to the master zone before being applied locally.
+void RGWOp_User_Create::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string display_name;
+  std::string email;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type_str;
+  std::string caps;
+  std::string tenant_name;
+  std::string op_mask_str;
+  std::string default_placement_str;
+  std::string placement_tags_str;
+
+  bool gen_key;
+  bool suspended;
+  bool system;
+  bool exclusive;
+
+  int32_t max_buckets;
+  const int32_t default_max_buckets =
+    s->cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "display-name", display_name, &display_name);
+  RESTArgs::get_string(s, "email", email, &email);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_string(s, "user-caps", caps, &caps);
+  RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name);
+  RESTArgs::get_bool(s, "generate-key", true, &gen_key);
+  RESTArgs::get_bool(s, "suspended", false, &suspended);
+  RESTArgs::get_int32(s, "max-buckets", default_max_buckets, &max_buckets);
+  RESTArgs::get_bool(s, "system", false, &system);
+  RESTArgs::get_bool(s, "exclusive", false, &exclusive);
+  RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str);
+  RESTArgs::get_string(s, "default-placement", default_placement_str, &default_placement_str);
+  RESTArgs::get_string(s, "placement-tags", placement_tags_str, &placement_tags_str);
+
+  // only system users may create other system users
+  if (!s->user->get_info().system && system) {
+    ldpp_dout(this, 0) << "cannot set system flag by non-system user" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  if (!tenant_name.empty()) {
+    uid.tenant = tenant_name;
+  }
+
+  // TODO: validate required args are passed in. (for eg. uid and display_name here)
+  op_state.set_user_id(uid);
+  op_state.set_display_name(display_name);
+  op_state.set_user_email(email);
+  op_state.set_caps(caps);
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+
+  if (!op_mask_str.empty()) {
+    uint32_t op_mask;
+    int ret = rgw_parse_op_type_list(op_mask_str, &op_mask);
+    if (ret < 0) {
+      ldpp_dout(this, 0) << "failed to parse op_mask: " << ret << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    op_state.set_op_mask(op_mask);
+  }
+
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  // only propagate max-buckets when the caller explicitly overrode it;
+  // any negative value is normalized to -1 (unlimited)
+  if (max_buckets != default_max_buckets) {
+    if (max_buckets < 0) {
+      max_buckets = -1;
+    }
+    op_state.set_max_buckets(max_buckets);
+  }
+  if (s->info.args.exists("suspended"))
+    op_state.set_suspension(suspended);
+
+  if (s->info.args.exists("system"))
+    op_state.set_system(system);
+
+  if (s->info.args.exists("exclusive"))
+    op_state.set_exclusive(exclusive);
+
+  if (gen_key)
+    op_state.set_generate_key();
+
+  if (!default_placement_str.empty()) {
+    rgw_placement_rule target_rule;
+    target_rule.from_str(default_placement_str);
+    if (!driver->valid_placement(target_rule)) {
+      ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << target_rule.to_str() << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    op_state.set_default_placement(target_rule);
+  }
+
+  if (!placement_tags_str.empty()) {
+    list<string> placement_tags_list;
+    get_str_list(placement_tags_str, ",", placement_tags_list);
+    op_state.set_placement_tags(placement_tags_list);
+  }
+
+  // forward to the master zone first; apply locally only on success
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_User::create(s, driver, op_state, flusher, y);
+}
+
+// modify an existing user from rest args
+class RGWOp_User_Modify : public RGWRESTOp {
+
+public:
+  RGWOp_User_Modify() {}
+
+  // modifying users requires the "users" write capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "modify_user"; }
+};
+
+void RGWOp_User_Modify::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string display_name;
+  std::string email;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type_str;
+  std::string op_mask_str;
+  std::string default_placement_str;
+  std::string placement_tags_str;
+
+  bool gen_key;
+  bool suspended;
+  bool system;
+  bool email_set;
+  bool quota_set;
+  int32_t max_buckets;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "display-name", display_name, &display_name);
+  RESTArgs::get_string(s, "email", email, &email, &email_set);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_bool(s, "generate-key", false, &gen_key);
+  RESTArgs::get_bool(s, "suspended", false, &suspended);
+  RESTArgs::get_int32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets, &quota_set);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+
+  RESTArgs::get_bool(s, "system", false, &system);
+  RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str);
+  RESTArgs::get_string(s, "default-placement", default_placement_str, &default_placement_str);
+  RESTArgs::get_string(s, "placement-tags", placement_tags_str, &placement_tags_str);
+
+  if (!s->user->get_info().system && system) {
+    ldpp_dout(this, 0) << "cannot set system flag by non-system user" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  op_state.set_user_id(uid);
+  op_state.set_display_name(display_name);
+
+  if (email_set)
+    op_state.set_user_email(email);
+
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+
+  if (quota_set) {
+    if (max_buckets < 0 ) {
+      max_buckets = -1;
+    }
+    op_state.set_max_buckets(max_buckets);
+  }
+  if (gen_key)
+    op_state.set_generate_key();
+
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  if (!op_mask_str.empty()) {
+    uint32_t op_mask;
+    if (rgw_parse_op_type_list(op_mask_str, &op_mask) < 0) {
+        ldpp_dout(this, 0) << "failed to parse op_mask" << dendl;
+        op_ret = -EINVAL;
+        return;
+    }   
+    op_state.set_op_mask(op_mask);
+  }
+
+  if (s->info.args.exists("suspended"))
+    op_state.set_suspension(suspended);
+
+  if (s->info.args.exists("system"))
+    op_state.set_system(system);
+
+  if (!op_mask_str.empty()) {
+    uint32_t op_mask;
+    int ret = rgw_parse_op_type_list(op_mask_str, &op_mask);
+    if (ret < 0) {
+      ldpp_dout(this, 0) << "failed to parse op_mask: " << ret << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    op_state.set_op_mask(op_mask);
+  }
+
+  if (!default_placement_str.empty()) {
+    rgw_placement_rule target_rule;
+    target_rule.from_str(default_placement_str);
+    if (!driver->valid_placement(target_rule)) {
+      ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << target_rule.to_str() << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    op_state.set_default_placement(target_rule);
+  }
+
+  if (!placement_tags_str.empty()) {
+    list<string> placement_tags_list;
+    get_str_list(placement_tags_str, ",", placement_tags_list);
+    op_state.set_placement_tags(placement_tags_list);
+  }
+  
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_User::modify(s, driver, op_state, flusher, y);
+}
+
+// remove a user (optionally purging its data)
+class RGWOp_User_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_User_Remove() {}
+
+  // removing users requires the "users" write capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_user"; }
+};
+
+void RGWOp_User_Remove::execute(optional_yield y)
+{
+  std::string uid_str;
+  bool purge_data;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_bool(s, "purge-data", false, &purge_data);
+
+  // FIXME: no double checking
+  if (!uid.empty())
+    op_state.set_user_id(uid);
+
+  op_state.set_purge_data(purge_data);
+
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_User::remove(s, driver, op_state, flusher, s->yield);
+}
+
+// create a subuser under an existing user
+class RGWOp_Subuser_Create : public RGWRESTOp {
+
+public:
+  RGWOp_Subuser_Create() {}
+
+  // creating subusers requires the "users" write capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "create_subuser"; }
+};
+
+void RGWOp_Subuser_Create::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string secret_key;
+  std::string access_key;
+  std::string perm_str;
+  std::string key_type_str;
+
+  bool gen_subuser = false; // FIXME placeholder
+  bool gen_secret;
+  bool gen_access;
+
+  uint32_t perm_mask = 0;
+  int32_t key_type = KEY_TYPE_SWIFT;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "access", perm_str, &perm_str);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
+  RESTArgs::get_bool(s, "gen-access-key", false, &gen_access);
+  
+  perm_mask = rgw_str_to_perm(perm_str.c_str());
+  op_state.set_perm(perm_mask);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+  op_state.set_generate_subuser(gen_subuser);
+
+  if (gen_access)
+    op_state.set_gen_access();
+
+  if (gen_secret)
+    op_state.set_gen_secret();
+
+  if (!key_type_str.empty()) {
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+  }
+  op_state.set_key_type(key_type);
+
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_Subuser::create(s, driver, op_state, flusher, y);
+}
+
+// modify an existing subuser's key/permissions
+class RGWOp_Subuser_Modify : public RGWRESTOp {
+
+public:
+  RGWOp_Subuser_Modify() {}
+
+  // modifying subusers requires the "users" write capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "modify_subuser"; }
+};
+
+void RGWOp_Subuser_Modify::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string secret_key;
+  std::string key_type_str;
+  std::string perm_str;
+
+  RGWUserAdminOpState op_state(driver);
+
+  uint32_t perm_mask;
+  int32_t key_type = KEY_TYPE_SWIFT;
+
+  bool gen_secret;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "access", perm_str, &perm_str);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
+
+  perm_mask = rgw_str_to_perm(perm_str.c_str());
+  op_state.set_perm(perm_mask);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+
+  if (!secret_key.empty())
+    op_state.set_secret_key(secret_key);
+
+  if (gen_secret)
+    op_state.set_gen_secret();
+
+  if (!key_type_str.empty()) {
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+  }
+  op_state.set_key_type(key_type);
+
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_Subuser::modify(s, driver, op_state, flusher, y);
+}
+
+// remove a subuser (optionally purging its keys)
+class RGWOp_Subuser_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_Subuser_Remove() {}
+
+  // removing subusers requires the "users" write capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_subuser"; }
+};
+
+void RGWOp_Subuser_Remove::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string subuser;
+  bool purge_keys;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_bool(s, "purge-keys", true, &purge_keys);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+
+  if (purge_keys)
+    op_state.set_purge_keys();
+
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_Subuser::remove(s, driver, op_state, flusher, y);
+}
+
+// create (or generate) an access key for a user/subuser
+class RGWOp_Key_Create : public RGWRESTOp {
+
+public:
+  RGWOp_Key_Create() {}
+
+  // creating keys requires the "users" write capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "create_access_key"; }
+};
+
+// Create an access key for a user (or subuser).  The key pair may be
+// supplied by the client or generated server-side (generate-key defaults
+// to true).  key-type selects "swift" or "s3"; anything else is left as
+// KEY_TYPE_UNDEFINED for the admin layer to resolve.
+// NOTE(review): unlike the subuser/caps ops in this file, this op does not
+// forward the request to the metadata master zone — confirm that is intended.
+void RGWOp_Key_Create::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type_str;
+
+  bool gen_key;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_bool(s, "generate-key", true, &gen_key);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+
+  if (gen_key)
+    op_state.set_generate_key();
+
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  op_ret = RGWUserAdminOp_Key::create(s, driver, op_state, flusher, y);
+}
+
+// Admin REST op: DELETE /admin/user?key — remove an access key.
+class RGWOp_Key_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_Key_Remove() {}
+
+  // Requires write capability on the "users" cap set.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_access_key"; }
+};
+
+// Remove an access key identified by access-key (and optionally subuser /
+// key-type) from the given user.
+// NOTE(review): like key-create, this op does not forward to the metadata
+// master zone — confirm that is intended.
+void RGWOp_Key_Remove::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string access_key;
+  std::string key_type_str;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+  op_state.set_access_key(access_key);
+
+  // Map the optional key-type string onto the enum; unrecognized values
+  // fall through as KEY_TYPE_UNDEFINED.
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  op_ret = RGWUserAdminOp_Key::remove(s, driver, op_state, flusher, y);
+}
+
+// Admin REST op: PUT /admin/user?caps — add admin capabilities to a user.
+class RGWOp_Caps_Add : public RGWRESTOp {
+
+public:
+  RGWOp_Caps_Add() {}
+
+  // Requires write capability on the "users" cap set.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "add_user_caps"; }
+};
+
+// Add the caps listed in the "user-caps" argument to the user.  Forwarded
+// to the metadata master zone before being applied locally.
+void RGWOp_Caps_Add::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string caps;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "user-caps", caps, &caps);
+
+  op_state.set_user_id(uid);
+  op_state.set_caps(caps);
+
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_Caps::add(s, driver, op_state, flusher, y);
+}
+
+// Admin REST op: DELETE /admin/user?caps — remove admin capabilities.
+class RGWOp_Caps_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_Caps_Remove() {}
+
+  // Requires write capability on the "users" cap set.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_user_caps"; }
+};
+
+// Remove the caps listed in the "user-caps" argument from the user.
+// Forwarded to the metadata master zone before being applied locally.
+void RGWOp_Caps_Remove::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string caps;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "user-caps", caps, &caps);
+
+  op_state.set_user_id(uid);
+  op_state.set_caps(caps);
+
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_Caps::remove(s, driver, op_state, flusher, y);
+}
+
+// JSON (de)serialization wrapper around a user's pair of quotas
+// (bucket_quota + user_quota), used by the quota GET/PUT ops below.
+struct UserQuotas {
+  RGWQuota quota;
+
+  UserQuotas() {}
+
+  // Snapshot both quotas out of an existing user record.
+  explicit UserQuotas(RGWUserInfo& info){
+    quota.bucket_quota = info.quota.bucket_quota;
+    quota.user_quota = info.quota.user_quota;
+  }
+
+  void dump(Formatter *f) const {
+    encode_json("bucket_quota", quota.bucket_quota, f);
+    encode_json("user_quota", quota.user_quota, f);
+  }
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("bucket_quota", quota.bucket_quota, obj);
+    JSONDecoder::decode_json("user_quota", quota.user_quota, obj);
+  }
+};
+
+// Admin REST op: GET /admin/user?quota — read a user's quota settings.
+class RGWOp_Quota_Info : public RGWRESTOp {
+
+public:
+  RGWOp_Quota_Info() {}
+
+  // Read-only: requires read capability on the "users" cap set.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_READ);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_quota_info"; }
+};
+
+
+// Return the user's quota configuration.  With no quota-type argument both
+// quotas are returned as one "quota" object; quota-type=user or
+// quota-type=bucket selects a single quota.
+void RGWOp_Quota_Info::execute(optional_yield y)
+{
+  RGWUserAdminOpState op_state(driver);
+
+  std::string uid_str;
+  std::string quota_type;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
+
+  if (uid_str.empty()) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  rgw_user uid(uid_str);
+
+  bool show_all = quota_type.empty();
+  bool show_bucket = show_all || (quota_type == "bucket");
+  bool show_user = show_all || (quota_type == "user");
+
+  // Only false when quota-type is a non-empty string other than
+  // "bucket"/"user" — i.e. an unrecognized quota type.
+  if (!(show_all || show_bucket || show_user)) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  op_state.set_user_id(uid);
+
+  RGWUser user;
+  op_ret = user.init(s, driver, op_state, y);
+  if (op_ret < 0)
+    return;
+
+  if (!op_state.has_existing_user()) {
+    op_ret = -ERR_NO_SUCH_USER;
+    return;
+  }
+
+  RGWUserInfo info;
+  string err_msg;
+  op_ret = user.info(info, &err_msg);
+  if (op_ret < 0)
+    return;
+
+  flusher.start(0);
+  if (show_all) {
+    UserQuotas quotas(info);
+    encode_json("quota", quotas, s->formatter);
+  } else if (show_user) {
+    encode_json("user_quota", info.quota.user_quota, s->formatter);
+  } else {
+    encode_json("bucket_quota", info.quota.bucket_quota, s->formatter);
+  }
+
+  flusher.flush();
+}
+
+// Admin REST op: PUT /admin/user?quota — set a user's quota settings.
+// See the long comment before execute() for the two accepted input forms.
+class RGWOp_Quota_Set : public RGWRESTOp {
+
+public:
+  RGWOp_Quota_Set() {}
+
+  // Requires write capability on the "users" cap set.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "set_quota_info"; }
+};
+
+/**
+ * set quota
+ *
+ * two different ways to set the quota info: as json struct in the message body or via http params.
+ *
+ * as json:
+ *
+ * PUT /admin/user?uid=<uid>[&quota-type=<type>]
+ *
+ * where quota-type is optional and, when given, is either "user" or "bucket"
+ *
+ * if quota-type is not specified then we expect to get a structure that contains both quotas,
+ * otherwise we'll only get the relevant configuration.
+ *
+ * E.g., if quota type not specified:
+ * {
+ *    "user_quota" : {
+ *      "max_size_kb" : 4096,
+ *      "max_objects" : -1,
+ *      "enabled" : false
+ *    },
+ *    "bucket_quota" : {
+ *      "max_size_kb" : 1024,
+ *      "max_objects" : -1,
+ *      "enabled" : true
+ *    }
+ * }
+ *
+ *
+ * or if quota type is specified:
+ * {
+ *   "max_size_kb" : 4096,
+ *   "max_objects" : -1,
+ *   "enabled" : false
+ * }
+ *
+ * Another option is not to pass any body and set the following http params:
+ *
+ *
+ * max-size-kb=<size>
+ * max-objects=<max objects>
+ * enabled[={true,false}]
+ *
+ * all params are optional and default to the current settings. With this type of configuration the
+ * quota-type param is mandatory.
+ *
+ */
+
+void RGWOp_Quota_Set::execute(optional_yield y)
+{
+  RGWUserAdminOpState op_state(driver);
+
+  std::string uid_str;
+  std::string quota_type;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
+
+  if (uid_str.empty()) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  rgw_user uid(uid_str);
+
+  bool set_all = quota_type.empty();
+  bool set_bucket = set_all || (quota_type == "bucket");
+  bool set_user = set_all || (quota_type == "user");
+
+  // Only false for an unrecognized non-empty quota-type.
+  if (!(set_all || set_bucket || set_user)) {
+    ldpp_dout(this, 20) << "invalid quota type" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // Decide between JSON-body and http-param configuration: a body is
+  // assumed when content-length is positive, or when the transfer encoding
+  // is chunked (length unknown up front).
+  bool use_http_params;
+
+  if (s->content_length > 0) {
+    use_http_params = false;
+  } else {
+    const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
+    use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
+  }
+
+  // http params can only describe one quota at a time, so quota-type is
+  // mandatory in that mode.
+  if (use_http_params && set_all) {
+    ldpp_dout(this, 20) << "quota type was not specified, can't set all quotas via http headers" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  op_state.set_user_id(uid);
+
+  RGWUser user;
+  op_ret = user.init(s, driver, op_state, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 20) << "failed initializing user info: " << op_ret << dendl;
+    return;
+  }
+
+  if (!op_state.has_existing_user()) {
+    op_ret = -ERR_NO_SUCH_USER;
+    return;
+  }
+
+#define QUOTA_INPUT_MAX_LEN 1024
+  if (set_all) {
+    // JSON body carrying both quotas (see comment above execute()).
+    UserQuotas quotas;
+
+    if ((op_ret = get_json_input(driver->ctx(), s, quotas, QUOTA_INPUT_MAX_LEN, NULL)) < 0) {
+      ldpp_dout(this, 20) << "failed to retrieve input" << dendl;
+      return;
+    }
+
+    op_state.set_user_quota(quotas.quota.user_quota);
+    op_state.set_bucket_quota(quotas.quota.bucket_quota);
+  } else {
+    RGWQuotaInfo quota;
+
+    if (!use_http_params) {
+      bool empty;
+      op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
+      if (op_ret < 0) {
+        ldpp_dout(this, 20) << "failed to retrieve input" << dendl;
+        if (!empty)
+          return;
+
+        /* was probably chunked input, but no content provided, configure via http params */
+        use_http_params = true;
+      }
+    }
+
+    if (use_http_params) {
+      // Params default to the user's current settings, so fetch those first.
+      RGWUserInfo info;
+      string err_msg;
+      op_ret = user.info(info, &err_msg);
+      if (op_ret < 0) {
+        ldpp_dout(this, 20) << "failed to get user info: " << op_ret << dendl;
+        return;
+      }
+      RGWQuotaInfo *old_quota;
+      if (set_user) {
+        old_quota = &info.quota.user_quota;
+      } else {
+        old_quota = &info.quota.bucket_quota;
+      }
+
+      RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
+      RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size);
+      // max-size-kb, if present, overrides max-size (converted to bytes).
+      int64_t max_size_kb;
+      bool has_max_size_kb = false;
+      RESTArgs::get_int64(s, "max-size-kb", 0, &max_size_kb, &has_max_size_kb);
+      if (has_max_size_kb) {
+        quota.max_size = max_size_kb * 1024;
+      }
+      RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
+    }
+
+    if (set_user) {
+      op_state.set_user_quota(quota);
+    } else {
+      op_state.set_bucket_quota(quota);
+    }
+  }
+
+  string err;
+  op_ret = user.modify(s, op_state, y, &err);
+  if (op_ret < 0) {
+    ldpp_dout(this, 20) << "failed updating user info: " << op_ret << ": " << err << dendl;
+    return;
+  }
+}
+
+// Route GET /admin/user by sub-resource: quota, list, or user info (default).
+RGWOp *RGWHandler_User::op_get()
+{
+  if (s->info.args.sub_resource_exists("quota"))
+    return new RGWOp_Quota_Info;
+
+  if (s->info.args.sub_resource_exists("list"))
+    return new RGWOp_User_List;
+
+  return new RGWOp_User_Info;
+}
+
+// Route PUT /admin/user by sub-resource: subuser, key, caps, quota,
+// or user creation (default).
+RGWOp *RGWHandler_User::op_put()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Create;
+
+  if (s->info.args.sub_resource_exists("key"))
+    return new RGWOp_Key_Create;
+
+  if (s->info.args.sub_resource_exists("caps"))
+    return new RGWOp_Caps_Add;
+
+  if (s->info.args.sub_resource_exists("quota"))
+    return new RGWOp_Quota_Set;
+
+  return new RGWOp_User_Create;
+}
+
+// Route POST /admin/user: subuser modification, or user modification (default).
+RGWOp *RGWHandler_User::op_post()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Modify;
+
+  return new RGWOp_User_Modify;
+}
+
+// Route DELETE /admin/user by sub-resource: subuser, key, caps,
+// or whole-user removal (default).
+RGWOp *RGWHandler_User::op_delete()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Remove;
+
+  if (s->info.args.sub_resource_exists("key"))
+    return new RGWOp_Key_Remove;
+
+  if (s->info.args.sub_resource_exists("caps"))
+    return new RGWOp_Caps_Remove;
+
+  return new RGWOp_User_Remove;
+}
+
diff --git a/src/rgw/driver/rados/rgw_rest_user.h b/src/rgw/driver/rados/rgw_rest_user.h
new file mode 100644 (file)
index 0000000..ee585be
--- /dev/null
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
+// REST handler for the /admin/user endpoint.  Dispatch to the concrete
+// ops happens in the op_get/op_put/op_post/op_delete overrides (defined
+// in rgw_rest_user.cc).
+class RGWHandler_User : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_put() override;
+  RGWOp *op_post() override;
+  RGWOp *op_delete() override;
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_User() override = default;
+
+  // Admin ops rely on cap checks (check_caps) rather than per-object
+  // permissions, so this always grants access.
+  int read_permissions(RGWOp*, optional_yield) override {
+    return 0;
+  }
+};
+
+// REST manager that hands out RGWHandler_User instances for /admin/user
+// requests.
+class RGWRESTMgr_User : public RGWRESTMgr {
+public:
+  RGWRESTMgr_User() = default;
+  ~RGWRESTMgr_User() override = default;
+
+  RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+                              req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override {
+    return new RGWHandler_User(auth_registry);
+  }
+};
diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc
new file mode 100644 (file)
index 0000000..577569d
--- /dev/null
@@ -0,0 +1,3630 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <system_error>
+#include <filesystem>
+#include <unistd.h>
+#include <sstream>
+#include <boost/algorithm/string.hpp>
+#include <boost/process.hpp>
+
+#include "common/Clock.h"
+#include "common/errno.h"
+
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_bucket.h"
+#include "rgw_multi.h"
+#include "rgw_acl_s3.h"
+#include "rgw_aio.h"
+#include "rgw_aio_throttle.h"
+#include "rgw_tracer.h"
+
+#include "rgw_zone.h"
+#include "rgw_rest_conn.h"
+#include "rgw_service.h"
+#include "rgw_lc.h"
+#include "rgw_lc_tier.h"
+#include "rgw_rest_admin.h"
+#include "rgw_rest_bucket.h"
+#include "rgw_rest_metadata.h"
+#include "rgw_rest_log.h"
+#include "rgw_rest_config.h"
+#include "rgw_rest_ratelimit.h"
+#include "rgw_rest_realm.h"
+#include "rgw_rest_user.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_meta.h"
+#include "services/svc_meta_be_sobj.h"
+#include "services/svc_cls.h"
+#include "services/svc_zone.h"
+#include "services/svc_tier_rados.h"
+#include "services/svc_quota.h"
+#include "services/svc_config_key.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_role_rados.h"
+#include "services/svc_user.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+#include "rgw_pubsub.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+static string mp_ns = RGW_OBJ_NS_MULTIPART;
+
+namespace rgw::sal {
+
+// default number of entries to list with each bucket listing call
+// (use marker to bridge between calls)
+static constexpr size_t listing_max_entries = 1000;
+
+// Decode an ACL policy from its serialized (bufferlist) form.
+// Returns 0 on success, -EIO if the buffer cannot be decoded.
+static int decode_policy(CephContext* cct,
+                         bufferlist& bl,
+                         RGWAccessControlPolicy* policy)
+{
+  auto iter = bl.cbegin();
+  try {
+    policy->decode(iter);
+  } catch (buffer::error& err) {
+    ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+    return -EIO;
+  }
+  // At debug level >= 15, dump the decoded policy as XML for diagnostics.
+  if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
+    ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
+    RGWAccessControlPolicy_S3* s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
+    s3policy->to_xml(*_dout);
+    *_dout << dendl;
+  }
+  return 0;
+}
+
+// Extract a bucket's ACL policy from its attribute map.  If the ACL
+// attribute is missing (broken policy), fall back to a default policy
+// owned by the given user, which requires loading that user's info.
+static int rgw_op_get_bucket_policy_from_attr(const DoutPrefixProvider* dpp,
+                                             RadosStore* store,
+                                             User* user,
+                                             Attrs& bucket_attrs,
+                                             RGWAccessControlPolicy* policy,
+                                             optional_yield y)
+{
+  auto aiter = bucket_attrs.find(RGW_ATTR_ACL);
+
+  if (aiter != bucket_attrs.end()) {
+    int ret = decode_policy(store->ctx(), aiter->second, policy);
+    if (ret < 0)
+      return ret;
+  } else {
+    ldout(store->ctx(), 0) << "WARNING: couldn't find acl header for bucket, generating default" << dendl;
+    /* object exists, but policy is broken */
+    int r = user->load_user(dpp, y);
+    if (r < 0)
+      return r;
+
+    policy->create_default(user->get_id(), user->get_display_name());
+  }
+  return 0;
+}
+
+// Wait for every queued librados AIO completion and release it.
+// Returns 0 if all completions succeeded, otherwise the error code of the
+// last completion that failed (earlier errors are overwritten).
+int RadosCompletions::drain()
+{
+  int ret = 0;
+  while (!handles.empty()) {
+    librados::AioCompletion* handle = handles.front();
+    handles.pop_front();
+    handle->wait_for_complete();
+    int r = handle->get_return_value();
+    handle->release();
+    if (r < 0) {
+      ret = r;
+    }
+  }
+  return ret;
+}
+
+// List this user's buckets in [marker, end_marker), up to `max` entries,
+// wrapping each entry in a RadosBucket.  The truncation flag from the
+// underlying listing is propagated on `buckets`.
+int RadosUser::list_buckets(const DoutPrefixProvider* dpp, const std::string& marker,
+                              const std::string& end_marker, uint64_t max, bool need_stats,
+                              BucketList &buckets, optional_yield y)
+{
+  RGWUserBuckets ulist;
+  bool is_truncated = false;
+  int ret;
+
+  buckets.clear();
+  ret = store->ctl()->user->list_buckets(dpp, info.user_id, marker, end_marker, max,
+                                        need_stats, &ulist, &is_truncated, y);
+  if (ret < 0)
+    return ret;
+
+  buckets.set_truncated(is_truncated);
+  for (const auto& ent : ulist.get_buckets()) {
+    buckets.add(std::unique_ptr<Bucket>(new RadosBucket(this->store, ent.second, this)));
+  }
+
+  return 0;
+}
+
+// Create a bucket owned by this user (or detect that an equivalent one
+// already exists).  High-level flow:
+//   1. Look the bucket up; if it already exists, inherit its swift
+//      versioning location / placement and reject ACL changes.
+//   2. On a non-master zone, forward the creation to the metadata master
+//      and adopt the master's bucket info (objv, creation time, shards).
+//   3. Create (or validate placement of) the bucket in RADOS.
+//   4. Link the bucket to the user; unlink on failure, and map
+//      already-exists outcomes to -ERR_BUCKET_EXISTS.
+// On return *existed reports whether the bucket pre-existed and
+// *bucket_out holds the resulting Bucket handle.
+int RadosUser::create_bucket(const DoutPrefixProvider* dpp,
+                                const rgw_bucket& b,
+                                const std::string& zonegroup_id,
+                                rgw_placement_rule& placement_rule,
+                                std::string& swift_ver_location,
+                                const RGWQuotaInfo * pquota_info,
+                                const RGWAccessControlPolicy& policy,
+                                Attrs& attrs,
+                                RGWBucketInfo& info,
+                                obj_version& ep_objv,
+                                bool exclusive,
+                                bool obj_lock_enabled,
+                                bool* existed,
+                                req_info& req_info,
+                                std::unique_ptr<Bucket>* bucket_out,
+                                optional_yield y)
+{
+  int ret;
+  bufferlist in_data;
+  RGWBucketInfo master_info;
+  rgw_bucket* pmaster_bucket;
+  uint32_t* pmaster_num_shards;
+  real_time creation_time;
+  std::unique_ptr<Bucket> bucket;
+  obj_version objv,* pobjv = NULL;
+
+  /* If it exists, look it up; otherwise create it */
+  ret = store->get_bucket(dpp, this, b, &bucket, y);
+  if (ret < 0 && ret != -ENOENT)
+    return ret;
+
+  if (ret != -ENOENT) {
+    RGWAccessControlPolicy old_policy(store->ctx());
+    *existed = true;
+    if (swift_ver_location.empty()) {
+      swift_ver_location = bucket->get_info().swift_ver_location;
+    }
+    placement_rule.inherit_from(bucket->get_info().placement_rule);
+
+    // don't allow changes to the acl policy
+    int r = rgw_op_get_bucket_policy_from_attr(dpp, store, this, bucket->get_attrs(),
+                                              &old_policy, y);
+    if (r >= 0 && old_policy != policy) {
+      bucket_out->swap(bucket);
+      return -EEXIST;
+    }
+  } else {
+    bucket = std::unique_ptr<Bucket>(new RadosBucket(store, b, this));
+    *existed = false;
+    bucket->set_attrs(attrs);
+  }
+
+  // Non-master zones must create the bucket on the metadata master first
+  // and adopt the versions/shard layout the master assigned.
+  if (!store->svc()->zone->is_meta_master()) {
+    JSONParser jp;
+    ret = store->forward_request_to_master(dpp, this, NULL, in_data, &jp, req_info, y);
+    if (ret < 0) {
+      return ret;
+    }
+
+    JSONDecoder::decode_json("entry_point_object_ver", ep_objv, &jp);
+    JSONDecoder::decode_json("object_ver", objv, &jp);
+    JSONDecoder::decode_json("bucket_info", master_info, &jp);
+    ldpp_dout(dpp, 20) << "parsed: objv.tag=" << objv.tag << " objv.ver=" << objv.ver << dendl;
+    std::time_t ctime = ceph::real_clock::to_time_t(master_info.creation_time);
+    ldpp_dout(dpp, 20) << "got creation time: << " << std::put_time(std::localtime(&ctime), "%F %T") << dendl;
+    pmaster_bucket= &master_info.bucket;
+    creation_time = master_info.creation_time;
+    pmaster_num_shards = &master_info.layout.current_index.layout.normal.num_shards;
+    pobjv = &objv;
+    if (master_info.obj_lock_enabled()) {
+      info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED;
+    }
+  } else {
+    pmaster_bucket = NULL;
+    pmaster_num_shards = NULL;
+    if (obj_lock_enabled)
+      info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED;
+  }
+
+  std::string zid = zonegroup_id;
+  if (zid.empty()) {
+    zid = store->svc()->zone->get_zonegroup().get_id();
+  }
+
+  if (*existed) {
+    // Bucket already exists: only verify the requested placement matches.
+    rgw_placement_rule selected_placement_rule;
+    ret = store->svc()->zone->select_bucket_placement(dpp, this->get_info(),
+                                              zid, placement_rule,
+                                              &selected_placement_rule, nullptr, y);
+    if (selected_placement_rule != info.placement_rule) {
+      ret = -EEXIST;
+      bucket_out->swap(bucket);
+      return ret;
+    }
+  } else {
+
+    ret = store->getRados()->create_bucket(this->get_info(), bucket->get_key(),
+                                   zid, placement_rule, swift_ver_location, pquota_info,
+                                   attrs, info, pobjv, &ep_objv, creation_time,
+                                   pmaster_bucket, pmaster_num_shards, y, dpp,
+                                   exclusive);
+    if (ret == -EEXIST) {
+      *existed = true;
+      /* bucket already existed, might have raced with another bucket creation,
+       * or might be partial bucket creation that never completed. Read existing
+       * bucket info, verify that the reported bucket owner is the current user.
+       * If all is ok then update the user's list of buckets.  Otherwise inform
+       * client about a name conflict.
+       */
+      if (info.owner.compare(this->get_id()) != 0) {
+       return -EEXIST;
+      }
+      ret = 0;
+    } else if (ret != 0) {
+      return ret;
+    }
+  }
+
+  bucket->set_version(ep_objv);
+  bucket->get_info() = info;
+
+  RadosBucket* rbucket = static_cast<RadosBucket*>(bucket.get());
+  ret = rbucket->link(dpp, this, y, false);
+  if (ret && !*existed && ret != -EEXIST) {
+    /* if it exists (or previously existed), don't remove it! */
+    ret = rbucket->unlink(dpp, this, y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "WARNING: failed to unlink bucket: ret=" << ret
+                      << dendl;
+    }
+  } else if (ret == -EEXIST || (ret == 0 && *existed)) {
+    ret = -ERR_BUCKET_EXISTS;
+  }
+
+  bucket_out->swap(bucket);
+
+  return ret;
+}
+
+// Load this user's xattrs into `attrs`, tracking the object version.
+int RadosUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y)
+{
+  return store->ctl()->user->get_attrs_by_uid(dpp, get_id(), &attrs, y, &objv_tracker);
+}
+
+// Overlay `new_attrs` onto the cached attrs (existing keys are replaced,
+// others kept) and persist the user record non-exclusively.
+int RadosUser::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y)
+{
+  for(auto& it : new_attrs) {
+         attrs[it.first] = it.second;
+  }
+  return store_user(dpp, y, false);
+}
+
+// Synchronously read this user's storage stats (and optionally the
+// timestamps of the last stats sync/update).
+int RadosUser::read_stats(const DoutPrefixProvider *dpp,
+                             optional_yield y, RGWStorageStats* stats,
+                            ceph::real_time* last_stats_sync,
+                            ceph::real_time* last_stats_update)
+{
+  return store->ctl()->user->read_stats(dpp, get_id(), stats, y, last_stats_sync, last_stats_update);
+}
+
+// Asynchronous variant of read_stats(); `cb` is invoked with the result.
+int RadosUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb)
+{
+  return store->svc()->user->read_stats_async(dpp, get_id(), cb);
+}
+
+// Flush any pending stats updates for this user to the backing store.
+int RadosUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  return store->svc()->user->complete_flush_stats(dpp, get_id(), y);
+}
+
+// Read usage-log entries for this user across all buckets (empty bucket
+// name selects every bucket) in the epoch range [start_epoch, end_epoch].
+int RadosUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+                              uint32_t max_entries, bool* is_truncated,
+                              RGWUsageIter& usage_iter,
+                              map<rgw_user_bucket, rgw_usage_log_entry>& usage)
+{
+  std::string bucket_name;
+  return store->getRados()->read_usage(dpp, get_id(), bucket_name, start_epoch,
+                                      end_epoch, max_entries, is_truncated,
+                                      usage_iter, usage);
+}
+
+// Delete usage-log entries for this user (all buckets) in the given
+// epoch range.
+int RadosUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
+{
+  std::string bucket_name;
+
+  return store->getRados()->trim_usage(dpp, get_id(), bucket_name, start_epoch, end_epoch);
+}
+
+// (Re)load this user's info, attrs, and object version from the store.
+int RadosUser::load_user(const DoutPrefixProvider* dpp, optional_yield y)
+{
+    return store->ctl()->user->get_info_by_uid(dpp, info.user_id, &info, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker).set_attrs(&attrs));
+}
+
+// Persist this user's info and attrs.  `exclusive` makes the write fail
+// if the record already exists; `old_info`, when given, lets the store
+// reconcile against the previous record.
+int RadosUser::store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info)
+{
+    return store->ctl()->user->store_info(dpp, info, y,
+                                         RGWUserCtl::PutParams().set_objv_tracker(&objv_tracker)
+                                         .set_exclusive(exclusive)
+                                         .set_attrs(&attrs)
+                                         .set_old_info(old_info))
+;
+}
+
+// Remove this user's record from the store, guarded by the object version.
+int RadosUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y)
+{
+    return store->ctl()->user->remove_info(dpp, info, y,
+                                         RGWUserCtl::RemoveParams().set_objv_tracker(&objv_tracker));
+}
+
+// Verify an MFA token.  `mfa_str` is "<serial> <pin>" (space separated);
+// the serial must belong to one of the user's registered MFA devices and
+// the pin must validate against the cls MFA service.  Sets *verified only
+// on success.  Returns -EINVAL for a malformed string, -EACCES on failure.
+int RadosUser::verify_mfa(const std::string& mfa_str, bool* verified,
+                         const DoutPrefixProvider* dpp, optional_yield y)
+{
+  vector<string> params;
+  get_str_vec(mfa_str, " ", params);
+
+  if (params.size() != 2) {
+    ldpp_dout(dpp, 5) << "NOTICE: invalid mfa string provided: " << mfa_str << dendl;
+    return -EINVAL;
+  }
+
+  string& serial = params[0];
+  string& pin = params[1];
+
+  auto i = info.mfa_ids.find(serial);
+  if (i == info.mfa_ids.end()) {
+    ldpp_dout(dpp, 5) << "NOTICE: user does not have mfa device with serial=" << serial << dendl;
+    return -EACCES;
+  }
+
+  int ret = store->svc()->cls->mfa.check_mfa(dpp, info.user_id, serial, pin, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 20) << "NOTICE: failed to check MFA, serial=" << serial << dendl;
+    return -EACCES;
+  }
+
+  *verified = true;
+
+  return 0;
+}
+
+RadosBucket::~RadosBucket() {}
+
+// Delete this bucket.  Steps, in order: refresh bucket info; list (versioned,
+// unordered) and — if delete_children — delete every object, else fail with
+// -ENOTEMPTY; abort incomplete multiparts; drop any lifecycle config; sync
+// user stats (best effort); delete the bucket itself; remove pubsub
+// notification definitions (best effort); unlink the bucket from its owner;
+// and finally, if requested, forward the removal to the metadata master.
+int RadosBucket::remove_bucket(const DoutPrefixProvider* dpp,
+                              bool delete_children,
+                              bool forward_to_master,
+                              req_info* req_info,
+                              optional_yield y)
+{
+  int ret;
+
+  // Refresh info
+  ret = load_bucket(dpp, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ListParams params;
+  params.list_versions = true;
+  params.allow_unordered = true;
+
+  ListResults results;
+
+  do {
+    results.objs.clear();
+
+    ret = list(dpp, params, 1000, results, y);
+    if (ret < 0) {
+      return ret;
+    }
+
+    if (!results.objs.empty() && !delete_children) {
+      ldpp_dout(dpp, -1) << "ERROR: could not remove non-empty bucket " << info.bucket.name <<
+       dendl;
+      return -ENOTEMPTY;
+    }
+
+    for (const auto& obj : results.objs) {
+      rgw_obj_key key(obj.key);
+      /* xxx dang */
+      ret = rgw_remove_object(dpp, store, this, key);
+      if (ret < 0 && ret != -ENOENT) {
+       return ret;
+      }
+    }
+  } while(results.is_truncated);
+
+  ret = abort_multiparts(dpp, store->ctx());
+  if (ret < 0) {
+    return ret;
+  }
+
+  // remove lifecycle config, if any (XXX note could be made generic)
+  (void) store->getRados()->get_lc()->remove_bucket_config(
+    this, get_attrs());
+
+  ret = store->ctl()->bucket->sync_user_stats(dpp, info.owner, info, y, nullptr);
+  if (ret < 0) {
+     ldout(store->ctx(), 1) << "WARNING: failed sync user stats before bucket delete. ret=" <<  ret << dendl;
+  }
+
+  RGWObjVersionTracker ot;
+
+  // if we deleted children above we will force delete, as any that
+  // remain is detritus from a prior bug
+  ret = store->getRados()->delete_bucket(info, ot, y, dpp, !delete_children);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: could not remove bucket " <<
+      info.bucket.name << dendl;
+    return ret;
+  }
+
+  // if bucket has notification definitions associated with it
+  // they should be removed (note that any pending notifications on the bucket are still going to be sent)
+  RGWPubSub ps(store, info.owner.tenant);
+  RGWPubSub::Bucket ps_bucket(&ps, info.bucket);
+  const auto ps_ret = ps_bucket.remove_notifications(dpp, y);
+  if (ps_ret < 0 && ps_ret != -ENOENT) {
+    ldpp_dout(dpp, -1) << "ERROR: unable to remove notifications from bucket. ret=" << ps_ret << dendl;
+  }
+
+  ret = store->ctl()->bucket->unlink_bucket(info.owner, info.bucket, y, dpp, false);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: unable to remove user bucket information" << dendl;
+  }
+
+  if (forward_to_master) {
+    bufferlist in_data;
+    ret = store->forward_request_to_master(dpp, owner, &ot.read_version, in_data, nullptr, *req_info, y);
+    if (ret < 0) {
+      if (ret == -ENOENT) {
+       /* adjust error, we want to return with NoSuchBucket and not
+        * NoSuchKey */
+       ret = -ERR_NO_SUCH_BUCKET;
+      }
+      return ret;
+    }
+  }
+
+  return ret;
+}
+
+int RadosBucket::remove_bucket_bypass_gc(int concurrent_max, bool
+                                        keep_index_consistent,
+                                        optional_yield y, const
+                                        DoutPrefixProvider *dpp)
+{
+  int ret;
+  map<RGWObjCategory, RGWStorageStats> stats;
+  map<string, bool> common_prefixes;
+  RGWObjectCtx obj_ctx(store);
+  CephContext *cct = store->ctx();
+
+  string bucket_ver, master_ver;
+
+  ret = load_bucket(dpp, null_yield);
+  if (ret < 0)
+    return ret;
+
+  const auto& index = info.get_current_index();
+  ret = read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, NULL);
+  if (ret < 0)
+    return ret;
+
+  ret = abort_multiparts(dpp, cct);
+  if (ret < 0) {
+    return ret;
+  }
+
+  rgw::sal::Bucket::ListParams params;
+  rgw::sal::Bucket::ListResults results;
+
+  params.list_versions = true;
+  params.allow_unordered = true;
+
+  std::unique_ptr<rgw::sal::Completions> handles = store->get_completions();
+
+  int max_aio = concurrent_max;
+  results.is_truncated = true;
+
+  while (results.is_truncated) {
+    ret = list(dpp, params, listing_max_entries, results, null_yield);
+    if (ret < 0)
+      return ret;
+
+    std::vector<rgw_bucket_dir_entry>::iterator it = results.objs.begin();
+    for (; it != results.objs.end(); ++it) {
+      RGWObjState *astate = NULL;
+      RGWObjManifest *amanifest = nullptr;
+      std::unique_ptr<rgw::sal::Object> obj = get_object((*it).key);
+
+      ret = store->getRados()->get_obj_state(dpp, &obj_ctx, obj->get_bucket()->get_info(),
+                                            obj.get(), &astate, &amanifest,
+                                            false, y);
+      if (ret == -ENOENT) {
+        ldpp_dout(dpp, 1) << "WARNING: cannot find obj state for obj " << obj << dendl;
+        continue;
+      }
+      if (ret < 0) {
+        ldpp_dout(dpp, -1) << "ERROR: get obj state returned with error " << ret << dendl;
+        return ret;
+      }
+
+      if (amanifest) {
+        RGWObjManifest& manifest = *amanifest;
+        RGWObjManifest::obj_iterator miter = manifest.obj_begin(dpp);
+       std::unique_ptr<rgw::sal::Object> head_obj = get_object(manifest.get_obj().key);
+        rgw_raw_obj raw_head_obj;
+       dynamic_cast<RadosObject*>(head_obj.get())->get_raw_obj(&raw_head_obj);
+
+        for (; miter != manifest.obj_end(dpp) && max_aio--; ++miter) {
+          if (!max_aio) {
+            ret = handles->drain();
+            if (ret < 0) {
+              ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+              return ret;
+            }
+            max_aio = concurrent_max;
+          }
+
+          rgw_raw_obj last_obj = miter.get_location().get_raw_obj(store);
+          if (last_obj == raw_head_obj) {
+            // have the head obj deleted at the end
+            continue;
+          }
+
+          ret = store->delete_raw_obj_aio(dpp, last_obj, handles.get());
+          if (ret < 0) {
+            ldpp_dout(dpp, -1) << "ERROR: delete obj aio failed with " << ret << dendl;
+            return ret;
+          }
+        } // for all shadow objs
+
+       ret = head_obj->delete_obj_aio(dpp, astate, handles.get(), keep_index_consistent, null_yield);
+        if (ret < 0) {
+          ldpp_dout(dpp, -1) << "ERROR: delete obj aio failed with " << ret << dendl;
+          return ret;
+        }
+      }
+
+      if (!max_aio) {
+        ret = handles->drain();
+        if (ret < 0) {
+          ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+          return ret;
+        }
+        max_aio = concurrent_max;
+      }
+      obj_ctx.invalidate(obj->get_obj());
+    } // for all RGW objects in results
+  } // while is_truncated
+
+  ret = handles->drain();
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+    return ret;
+  }
+
+  sync_user_stats(dpp, y);
+  if (ret < 0) {
+     ldpp_dout(dpp, 1) << "WARNING: failed sync user stats before bucket delete. ret=" <<  ret << dendl;
+  }
+
+  RGWObjVersionTracker objv_tracker;
+
+  // this function can only be run if caller wanted children to be
+  // deleted, so we can ignore the check for children as any that
+  // remain are detritus from a prior bug
+  ret = remove_bucket(dpp, true, false, nullptr, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: could not remove bucket " << this << dendl;
+    return ret;
+  }
+
+  return ret;
+}
+
// Load this bucket's metadata from the backing store.  When the bucket
// instance id is not yet known, the read goes through the bucket
// entrypoint object (which resolves the current instance); otherwise the
// instance record is read directly.  On success, mtime/attrs and
// bucket_version are populated, and, when get_stats is set, the cached
// usage entry `ent` is refreshed as well.
int RadosBucket::load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats)
{
  int ret;

  RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj();
  RGWObjVersionTracker ep_ot;
  if (info.bucket.bucket_id.empty()) {
    // no instance id yet: resolve through the entrypoint object
    ret = store->ctl()->bucket->read_bucket_info(info.bucket, &info, y, dpp,
                                     RGWBucketCtl::BucketInstance::GetParams()
                                     .set_mtime(&mtime)
                                     .set_attrs(&attrs)
                                      .set_bectx_params(bectx_params),
                                     &ep_ot);
  } else {
    // instance id known: read the bucket instance record directly
    ret  = store->ctl()->bucket->read_bucket_instance_info(info.bucket, &info, y, dpp,
                                     RGWBucketCtl::BucketInstance::GetParams()
                                     .set_mtime(&mtime)
                                     .set_attrs(&attrs)
                                     .set_bectx_params(bectx_params));
  }
  if (ret != 0) {
    return ret;
  }

  // remember the entrypoint version read above for later updates
  bucket_version = ep_ot.read_version;

  if (get_stats) {
    ret = store->ctl()->bucket->read_bucket_stats(info.bucket, &ent, y, dpp);
  }

  return ret;
}
+
+int RadosBucket::read_stats(const DoutPrefixProvider *dpp,
+                           const bucket_index_layout_generation& idx_layout,
+                           int shard_id, std::string* bucket_ver, std::string* master_ver,
+                           std::map<RGWObjCategory, RGWStorageStats>& stats,
+                           std::string* max_marker, bool* syncstopped)
+{
+  return store->getRados()->get_bucket_stats(dpp, info, idx_layout, shard_id, bucket_ver, master_ver, stats, max_marker, syncstopped);
+}
+
+int RadosBucket::read_stats_async(const DoutPrefixProvider *dpp,
+                                 const bucket_index_layout_generation& idx_layout,
+                                 int shard_id, RGWGetBucketStats_CB* ctx)
+{
+  return store->getRados()->get_bucket_stats_async(dpp, get_info(), idx_layout, shard_id, ctx);
+}
+
+int RadosBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  return store->ctl()->bucket->sync_user_stats(dpp, owner->get_id(), info, y, &ent);
+}
+
// Refresh the cached RGWBucketEnt (`ent`) for this bucket from the
// store, mirroring creation time and placement rule back into `info`.
// NOTE(review): a zero return from update_containers_stats is surfaced
// as -EEXIST — presumably meaning "entry was not updated"; confirm
// against RGWRados::update_containers_stats.
int RadosBucket::update_container_stats(const DoutPrefixProvider* dpp)
{
  int ret;
  map<std::string, RGWBucketEnt> m;

  m[info.bucket.name] = ent;
  ret = store->getRados()->update_containers_stats(m, dpp);
  if (!ret)
    return -EEXIST;
  if (ret < 0)
    return ret;

  map<std::string, RGWBucketEnt>::iterator iter = m.find(info.bucket.name);
  if (iter == m.end())
    return -EINVAL;

  // copy the refreshed stats into the cached entry
  ent.count = iter->second.count;
  ent.size = iter->second.size;
  ent.size_rounded = iter->second.size_rounded;
  ent.creation_time = iter->second.creation_time;
  ent.placement_rule = std::move(iter->second.placement_rule);

  info.creation_time = ent.creation_time;
  info.placement_rule = ent.placement_rule;

  return 0;
}
+
+int RadosBucket::check_bucket_shards(const DoutPrefixProvider* dpp)
+{
+      return store->getRados()->check_bucket_shards(info, info.bucket, get_count(), dpp);
+}
+
+int RadosBucket::link(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint, RGWObjVersionTracker* objv)
+{
+  RGWBucketEntryPoint ep;
+  ep.bucket = info.bucket;
+  ep.owner = new_user->get_id();
+  ep.creation_time = get_creation_time();
+  ep.linked = true;
+  Attrs ep_attrs;
+  rgw_ep_info ep_data{ep, ep_attrs};
+
+  int r = store->ctl()->bucket->link_bucket(new_user->get_id(), info.bucket,
+                                           get_creation_time(), y, dpp, update_entrypoint,
+                                           &ep_data);
+  if (r < 0)
+    return r;
+
+  if (objv)
+    *objv = ep_data.ep_objv;
+
+  return r;
+}
+
+int RadosBucket::unlink(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint)
+{
+  return store->ctl()->bucket->unlink_bucket(new_user->get_id(), info.bucket, y, dpp, update_entrypoint);
+}
+
+int RadosBucket::chown(const DoutPrefixProvider* dpp, User* new_user, User* old_user, optional_yield y, const std::string* marker)
+{
+  std::string obj_marker;
+
+  if (marker == nullptr)
+    marker = &obj_marker;
+
+  int r = this->link(dpp, new_user, y);
+  if (r < 0) {
+    return r;
+  }
+  if (!old_user) {
+    return r;
+  }
+
+  return store->ctl()->bucket->chown(store, this, new_user->get_id(),
+                          old_user->get_display_name(), *marker, y, dpp);
+}
+
+int RadosBucket::put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time _mtime)
+{
+  mtime = _mtime;
+  return store->getRados()->put_bucket_instance_info(info, exclusive, mtime, &attrs, dpp);
+}
+
+/* Make sure to call get_bucket_info() if you need it first */
+bool RadosBucket::is_owner(User* user)
+{
+  return (info.owner.compare(user->get_id()) == 0);
+}
+
+int RadosBucket::check_empty(const DoutPrefixProvider* dpp, optional_yield y)
+{
+  return store->getRados()->check_bucket_empty(dpp, info, y);
+}
+
+int RadosBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size,
+                               optional_yield y, bool check_size_only)
+{
+    return store->getRados()->check_quota(dpp, owner->get_id(), get_key(),
+                                         quota, obj_size, y, check_size_only);
+}
+
+int RadosBucket::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y)
+{
+  for(auto& it : new_attrs) {
+         attrs[it.first] = it.second;
+  }
+  return store->ctl()->bucket->set_bucket_instance_attrs(get_info(),
+                               new_attrs, &get_info().objv_tracker, y, dpp);
+}
+
+int RadosBucket::try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime)
+{
+  return store->getRados()->try_refresh_bucket_info(info, pmtime, dpp, &attrs);
+}
+
+int RadosBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+                              uint32_t max_entries, bool* is_truncated,
+                              RGWUsageIter& usage_iter,
+                              map<rgw_user_bucket, rgw_usage_log_entry>& usage)
+{
+  return store->getRados()->read_usage(dpp, owner->get_id(), get_name(), start_epoch,
+                                      end_epoch, max_entries, is_truncated,
+                                      usage_iter, usage);
+}
+
+int RadosBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
+{
+  return store->getRados()->trim_usage(dpp, owner->get_id(), get_name(), start_epoch, end_epoch);
+}
+
+int RadosBucket::remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink)
+{
+  return store->getRados()->remove_objs_from_index(dpp, info, objs_to_unlink);
+}
+
+int RadosBucket::check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats)
+{
+  return store->getRados()->bucket_check_index(dpp, info, &existing_stats, &calculated_stats);
+}
+
+int RadosBucket::rebuild_index(const DoutPrefixProvider *dpp)
+{
+  return store->getRados()->bucket_rebuild_index(dpp, info);
+}
+
+int RadosBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout)
+{
+  return store->getRados()->cls_obj_set_bucket_tag_timeout(dpp, info, timeout);
+}
+
+int RadosBucket::purge_instance(const DoutPrefixProvider* dpp)
+{
+  int max_shards = (info.layout.current_index.layout.normal.num_shards > 0 ? info.layout.current_index.layout.normal.num_shards : 1);
+  for (int i = 0; i < max_shards; i++) {
+    RGWRados::BucketShard bs(store->getRados());
+    int shard_id = (info.layout.current_index.layout.normal.num_shards > 0  ? i : -1);
+    int ret = bs.init(dpp, info, info.layout.current_index, shard_id);
+    if (ret < 0) {
+      cerr << "ERROR: bs.init(bucket=" << info.bucket << ", shard=" << shard_id
+           << "): " << cpp_strerror(-ret) << std::endl;
+      return ret;
+    }
+    ret = store->getRados()->bi_remove(dpp, bs);
+    if (ret < 0) {
+      cerr << "ERROR: failed to remove bucket index object: "
+           << cpp_strerror(-ret) << std::endl;
+      return ret;
+    }
+  }
+  return 0;
+}
+
+int RadosBucket::set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy &acl, optional_yield y)
+{
+  bufferlist aclbl;
+
+  acls = acl;
+  acl.encode(aclbl);
+  map<string, bufferlist>& attrs = get_attrs();
+
+  attrs[RGW_ATTR_ACL] = aclbl;
+  info.owner = acl.get_owner().get_id();
+
+  int r = store->ctl()->bucket->store_bucket_instance_info(info.bucket,
+                 info, y, dpp,
+                 RGWBucketCtl::BucketInstance::PutParams().set_attrs(&attrs));
+  if (r < 0) {
+    cerr << "ERROR: failed to set bucket owner: " << cpp_strerror(-r) << std::endl;
+    return r;
+  }
+  
+  return 0;
+}
+
+std::unique_ptr<Object> RadosBucket::get_object(const rgw_obj_key& k)
+{
+  return std::make_unique<RadosObject>(this->store, k, this);
+}
+
+int RadosBucket::list(const DoutPrefixProvider* dpp, ListParams& params, int max, ListResults& results, optional_yield y)
+{
+  RGWRados::Bucket target(store->getRados(), get_info());
+  if (params.shard_id >= 0) {
+    target.set_shard_id(params.shard_id);
+  }
+  RGWRados::Bucket::List list_op(&target);
+
+  list_op.params.prefix = params.prefix;
+  list_op.params.delim = params.delim;
+  list_op.params.marker = params.marker;
+  list_op.params.ns = params.ns;
+  list_op.params.end_marker = params.end_marker;
+  list_op.params.ns = params.ns;
+  list_op.params.enforce_ns = params.enforce_ns;
+  list_op.params.access_list_filter = params.access_list_filter;
+  list_op.params.force_check_filter = params.force_check_filter;
+  list_op.params.list_versions = params.list_versions;
+  list_op.params.allow_unordered = params.allow_unordered;
+
+  int ret = list_op.list_objects(dpp, max, &results.objs, &results.common_prefixes, &results.is_truncated, y);
+  if (ret >= 0) {
+    results.next_marker = list_op.get_next_marker();
+    params.marker = results.next_marker;
+  }
+
+  return ret;
+}
+
+std::unique_ptr<MultipartUpload> RadosBucket::get_multipart_upload(
+                                 const std::string& oid,
+                                 std::optional<std::string> upload_id,
+                                 ACLOwner owner, ceph::real_time mtime)
+{
+  return std::make_unique<RadosMultipartUpload>(this->store, this, oid, upload_id,
+                                               std::move(owner), mtime);
+}
+
+int RadosBucket::list_multiparts(const DoutPrefixProvider *dpp,
+                                const string& prefix,
+                                string& marker,
+                                const string& delim,
+                                const int& max_uploads,
+                                vector<std::unique_ptr<MultipartUpload>>& uploads,
+                                map<string, bool> *common_prefixes,
+                                bool *is_truncated)
+{
+  rgw::sal::Bucket::ListParams params;
+  rgw::sal::Bucket::ListResults results;
+  MultipartMetaFilter mp_filter;
+
+  params.prefix = prefix;
+  params.delim = delim;
+  params.marker = marker;
+  params.ns = RGW_OBJ_NS_MULTIPART;
+  params.access_list_filter = &mp_filter;
+
+  int ret = list(dpp, params, max_uploads, results, null_yield);
+
+  if (ret < 0)
+    return ret;
+
+  if (!results.objs.empty()) {
+    for (const rgw_bucket_dir_entry& dentry : results.objs) {
+      rgw_obj_key key(dentry.key);
+      ACLOwner owner(rgw_user(dentry.meta.owner));
+      owner.set_name(dentry.meta.owner_display_name);
+      uploads.push_back(this->get_multipart_upload(key.name,
+                       std::nullopt, std::move(owner)));
+    }
+  }
+  if (common_prefixes) {
+    *common_prefixes = std::move(results.common_prefixes);
+  }
+  *is_truncated = results.is_truncated;
+  marker = params.marker.name;
+
+  return 0;
+}
+
+int RadosBucket::abort_multiparts(const DoutPrefixProvider* dpp,
+                                 CephContext* cct)
+{
+  constexpr int max = 1000;
+  int ret, num_deleted = 0;
+  vector<std::unique_ptr<MultipartUpload>> uploads;
+  string marker;
+  bool is_truncated;
+
+  const std::string empty_delim;
+  const std::string empty_prefix;
+
+  do {
+    ret = list_multiparts(dpp, empty_prefix, marker, empty_delim,
+                         max, uploads, nullptr, &is_truncated);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+       " ERROR : calling list_bucket_multiparts; ret=" << ret <<
+       "; bucket=\"" << this << "\"" << dendl;
+      return ret;
+    }
+    ldpp_dout(dpp, 20) << __func__ <<
+      " INFO: aborting and cleaning up multipart upload(s); bucket=\"" <<
+      this << "\"; uploads.size()=" << uploads.size() <<
+      "; is_truncated=" << is_truncated << dendl;
+
+    if (!uploads.empty()) {
+      for (const auto& upload : uploads) {
+       ret = upload->abort(dpp, cct);
+        if (ret < 0) {
+         // we're doing a best-effort; if something cannot be found,
+         // log it and keep moving forward
+         if (ret != -ENOENT && ret != -ERR_NO_SUCH_UPLOAD) {
+           ldpp_dout(dpp, 0) << __func__ <<
+             " ERROR : failed to abort and clean-up multipart upload \"" <<
+             upload->get_meta() << "\"" << dendl;
+           return ret;
+         } else {
+           ldpp_dout(dpp, 10) << __func__ <<
+             " NOTE : unable to find part(s) of "
+             "aborted multipart upload of \"" << upload->get_meta() <<
+             "\" for cleaning up" << dendl;
+         }
+        }
+        num_deleted++;
+      }
+      if (num_deleted) {
+        ldpp_dout(dpp, 0) << __func__ <<
+         " WARNING : aborted " << num_deleted <<
+         " incomplete multipart uploads" << dendl;
+      }
+    }
+  } while (is_truncated);
+
+  return 0;
+}
+
+std::unique_ptr<User> RadosStore::get_user(const rgw_user &u)
+{
+  return std::make_unique<RadosUser>(this, u);
+}
+
+std::string RadosStore::get_cluster_id(const DoutPrefixProvider* dpp,  optional_yield y)
+{
+  return getRados()->get_cluster_fsid(dpp, y);
+}
+
+int RadosStore::get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user)
+{
+  RGWUserInfo uinfo;
+  User* u;
+  RGWObjVersionTracker objv_tracker;
+
+  int r = ctl()->user->get_info_by_access_key(dpp, key, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
+  if (r < 0)
+    return r;
+
+  u = new RadosUser(this, uinfo);
+  if (!u)
+    return -ENOMEM;
+
+  u->get_version_tracker() = objv_tracker;
+
+  user->reset(u);
+  return 0;
+}
+
+int RadosStore::get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user)
+{
+  RGWUserInfo uinfo;
+  User* u;
+  RGWObjVersionTracker objv_tracker;
+
+  int r = ctl()->user->get_info_by_email(dpp, email, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
+  if (r < 0)
+    return r;
+
+  u = new RadosUser(this, uinfo);
+  if (!u)
+    return -ENOMEM;
+
+  u->get_version_tracker() = objv_tracker;
+
+  user->reset(u);
+  return 0;
+}
+
+int RadosStore::get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user)
+{
+  RGWUserInfo uinfo;
+  User* u;
+  RGWObjVersionTracker objv_tracker;
+
+  int r = ctl()->user->get_info_by_swift(dpp, user_str, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
+  if (r < 0)
+    return r;
+
+  u = new RadosUser(this, uinfo);
+  if (!u)
+    return -ENOMEM;
+
+  u->get_version_tracker() = objv_tracker;
+
+  user->reset(u);
+  return 0;
+}
+
+std::unique_ptr<Object> RadosStore::get_object(const rgw_obj_key& k)
+{
+  return std::make_unique<RadosObject>(this, k);
+}
+
+int RadosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y)
+{
+  int ret;
+  Bucket* bp;
+
+  bp = new RadosBucket(this, b, u);
+  ret = bp->load_bucket(dpp, y);
+  if (ret < 0) {
+    delete bp;
+    return ret;
+  }
+
+  bucket->reset(bp);
+  return 0;
+}
+
+int RadosStore::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket)
+{
+  Bucket* bp;
+
+  bp = new RadosBucket(this, i, u);
+  /* Don't need to fetch the bucket info, use the provided one */
+
+  bucket->reset(bp);
+  return 0;
+}
+
+int RadosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr<Bucket>* bucket, optional_yield y)
+{
+  rgw_bucket b;
+
+  b.tenant = tenant;
+  b.name = name;
+
+  return get_bucket(dpp, u, b, bucket, y);
+}
+
+bool RadosStore::is_meta_master()
+{
+  return svc()->zone->is_meta_master();
+}
+
// Forward a client request to the metadata master zone.  On the master
// itself this is a no-op returning 0.  On success, the response body is
// optionally parsed into `jp` when the caller supplies a parser.
int RadosStore::forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
                                            bufferlist& in_data,
                                            JSONParser* jp, req_info& info,
                                            optional_yield y)
{
  if (is_meta_master()) {
    /* We're master, don't forward */
    return 0;
  }

  if (!svc()->zone->get_master_conn()) {
    ldpp_dout(dpp, 0) << "rest connection is invalid" << dendl;
    return -EINVAL;
  }
  ldpp_dout(dpp, 0) << "sending request to master zonegroup" << dendl;
  bufferlist response;
  std::string uid_str = user->get_id().to_str();
#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
  int ret = svc()->zone->get_master_conn()->forward(dpp, rgw_user(uid_str), info,
                                                    objv, MAX_REST_RESPONSE,
                                                   &in_data, &response, y);
  if (ret < 0)
    return ret;

  ldpp_dout(dpp, 20) << "response: " << response.c_str() << dendl;
  if (jp && !jp->parse(response.c_str(), response.length())) {
    ldpp_dout(dpp, 0) << "failed parsing response from master zonegroup" << dendl;
    return -EINVAL;
  }

  return 0;
}
+
// Forward an IAM request (signed with `key`) to the metadata master
// zone.  On the master itself this is a no-op returning 0.  The XML
// response is un-escaped (&quot; -> ") before being handed to `parser`.
int RadosStore::forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
					     bufferlist& in_data,
					     RGWXMLDecoder::XMLParser* parser, req_info& info,
					     optional_yield y)
{
  if (is_meta_master()) {
    /* We're master, don't forward */
    return 0;
  }

  if (!svc()->zone->get_master_conn()) {
    ldpp_dout(dpp, 0) << "rest connection is invalid" << dendl;
    return -EINVAL;
  }
  ldpp_dout(dpp, 0) << "sending request to master zonegroup" << dendl;
  bufferlist response;
#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
  int ret = svc()->zone->get_master_conn()->forward_iam_request(dpp, key, info,
                                                    objv, MAX_REST_RESPONSE,
                                                                                       &in_data, &response, y);
  if (ret < 0)
    return ret;

  ldpp_dout(dpp, 20) << "response: " << response.c_str() << dendl;

  // the master's XML response arrives with quotes HTML-escaped; restore
  // them so the XML parser sees the literal characters
  std::string r = response.c_str();
  std::string str_to_search = "&quot;";
  std::string str_to_replace = "\"";
  boost::replace_all(r, str_to_search, str_to_replace);
  ldpp_dout(dpp, 20) << "r: " << r.c_str() << dendl;

  if (parser && !parser->parse(r.c_str(), r.length(), 1)) {
    ldpp_dout(dpp, 0) << "ERROR: failed to parse response from master zonegroup" << dendl;
    return -EIO;
  }

  return 0;
}
+
+std::string RadosStore::zone_unique_id(uint64_t unique_num)
+{
+  return svc()->zone_utils->unique_id(unique_num);
+}
+
+std::string RadosStore::zone_unique_trans_id(const uint64_t unique_num)
+{
+  return svc()->zone_utils->unique_trans_id(unique_num);
+}
+
+int RadosStore::get_zonegroup(const std::string& id,
+                             std::unique_ptr<ZoneGroup>* zonegroup)
+{
+  ZoneGroup* zg;
+  RGWZoneGroup rzg;
+  int r = svc()->zone->get_zonegroup(id, rzg);
+  if (r < 0)
+    return r;
+
+  zg = new RadosZoneGroup(this, rzg);
+  if (!zg)
+    return -ENOMEM;
+
+  zonegroup->reset(zg);
+  return 0;
+}
+
+int RadosStore::list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids)
+{
+  return svc()->zone->list_zones(dpp, zone_ids);
+}
+
+int RadosStore::cluster_stat(RGWClusterStat& stats)
+{
+  rados_cluster_stat_t rados_stats;
+  int ret;
+
+  ret = rados->get_rados_handle()->cluster_stat(rados_stats);
+  if (ret < 0)
+    return ret;
+
+  stats.kb = rados_stats.kb;
+  stats.kb_used = rados_stats.kb_used;
+  stats.kb_avail = rados_stats.kb_avail;
+  stats.num_objects = rados_stats.num_objects;
+
+  return ret;
+}
+
+std::unique_ptr<Lifecycle> RadosStore::get_lifecycle(void)
+{
+  return std::make_unique<RadosLifecycle>(this);
+}
+
+std::unique_ptr<Completions> RadosStore::get_completions(void)
+{
+  return std::make_unique<RadosCompletions>();
+}
+
// Build a notification for a request-driven event.  Note that `s` is
// passed twice: once as the first ctor argument (presumably serving as
// the log prefix provider — confirm against RadosNotification's ctor)
// and once as the request state.
std::unique_ptr<Notification> RadosStore::get_notification(
  rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, rgw::notify::EventType event_type, const std::string* object_name)
{
  return std::make_unique<RadosNotification>(s, this, obj, src_obj, s, event_type, object_name);
}
+
// Build a notification from explicit bucket/user/request fields — the
// variant used when no full req_state is available.
std::unique_ptr<Notification> RadosStore::get_notification(const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj, rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y)
{
  return std::make_unique<RadosNotification>(dpp, this, obj, src_obj, event_type, _bucket, _user_id, _user_tenant, _req_id, y);
}
+
+int RadosStore::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
+{
+  return rados->delete_raw_obj(dpp, obj);
+}
+
+int RadosStore::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, Completions* aio)
+{
+  RadosCompletions* raio = static_cast<RadosCompletions*>(aio);
+
+  return rados->delete_raw_obj_aio(dpp, obj, raio->handles);
+}
+
+void RadosStore::get_raw_obj(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj* raw_obj)
+{
+    rados->obj_to_raw(placement_rule, obj, raw_obj);
+}
+
+int RadosStore::get_raw_chunk_size(const DoutPrefixProvider* dpp, const rgw_raw_obj& obj, uint64_t* chunk_size)
+{
+  return rados->get_max_chunk_size(obj.pool, chunk_size, dpp);
+}
+
+int RadosStore::initialize(CephContext *cct, const DoutPrefixProvider *dpp)
+{
+  std::unique_ptr<ZoneGroup> zg =
+    std::make_unique<RadosZoneGroup>(this, svc()->zone->get_zonegroup());
+  zone = make_unique<RadosZone>(this, std::move(zg));
+  return 0;
+}
+
+int RadosStore::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
+{
+    return rados->log_usage(dpp, usage_info);
+}
+
// Append an ops-log record `bl` to object `oid` in the zone's log pool.
// If the pool does not exist yet (-ENOENT), it is created and the
// append is retried once.
int RadosStore::log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl)
{
  rgw_raw_obj obj(svc()->zone->get_zone_params().log_pool, oid);

  int ret = rados->append_async(dpp, obj, bl.length(), bl);
  if (ret == -ENOENT) {
    // log pool missing: create it, then retry the append once
    ret = rados->create_pool(dpp, svc()->zone->get_zone_params().log_pool);
    if (ret < 0)
      return ret;
    // retry
    ret = rados->append_async(dpp, obj, bl.length(), bl);
  }

  return ret;
}
+
+int RadosStore::register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
+                                          const map<std::string, std::string>& meta)
+{
+  return rados->register_to_service_map(dpp, daemon_type, meta);
+}
+
+void RadosStore::get_quota(RGWQuota& quota)
+{
+    quota.bucket_quota = svc()->quota->get_bucket_quota();
+    quota.user_quota = svc()->quota->get_user_quota();
+}
+
+void RadosStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit)
+{
+  bucket_ratelimit = svc()->zone->get_current_period().get_config().bucket_ratelimit;
+  user_ratelimit = svc()->zone->get_current_period().get_config().user_ratelimit;
+  anon_ratelimit = svc()->zone->get_current_period().get_config().anon_ratelimit;
+}
+
+int RadosStore::set_buckets_enabled(const DoutPrefixProvider* dpp, vector<rgw_bucket>& buckets, bool enabled)
+{
+    return rados->set_buckets_enabled(buckets, enabled, dpp);
+}
+
+int RadosStore::get_sync_policy_handler(const DoutPrefixProvider* dpp,
+                                          std::optional<rgw_zone_id> zone,
+                                          std::optional<rgw_bucket> bucket,
+                                          RGWBucketSyncPolicyHandlerRef* phandler,
+                                          optional_yield y)
+{
+  return ctl()->bucket->get_sync_policy_handler(zone, bucket, phandler, y, dpp);
+}
+
+RGWDataSyncStatusManager* RadosStore::get_data_sync_manager(const rgw_zone_id& source_zone)
+{
+  return rados->get_data_sync_manager(source_zone);
+}
+
// Read usage-log entries store-wide, resumable via `usage_iter`.  The
// default-constructed (empty) uid and bucket name presumably act as
// wildcards in RGWRados::read_usage — confirm there.
int RadosStore::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
                                 uint32_t max_entries, bool* is_truncated,
                                 RGWUsageIter& usage_iter,
                                 map<rgw_user_bucket, rgw_usage_log_entry>& usage)
{
  rgw_user uid;
  std::string bucket_name;

  return rados->read_usage(dpp, uid, bucket_name, start_epoch, end_epoch, max_entries,
                          is_truncated, usage_iter, usage);
}
+
// Discard usage-log entries store-wide between the given epochs.  The
// default-constructed (empty) uid and bucket name presumably act as
// wildcards in RGWRados::trim_usage — confirm there.
int RadosStore::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
{
  rgw_user uid;
  std::string bucket_name;

  return rados->trim_usage(dpp, uid, bucket_name, start_epoch, end_epoch);
}
+
+int RadosStore::get_config_key_val(std::string name, bufferlist* bl)
+{
+  return svc()->config_key->get(name, true, bl);
+}
+
+int RadosStore::meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle)
+{
+  return ctl()->meta.mgr->list_keys_init(dpp, section, marker, phandle);
+}
+
+int RadosStore::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, list<std::string>& keys, bool* truncated)
+{
+  return ctl()->meta.mgr->list_keys_next(dpp, handle, max, keys, truncated);
+}
+
+void RadosStore::meta_list_keys_complete(void* handle)
+{
+  ctl()->meta.mgr->list_keys_complete(handle);
+}
+
+std::string RadosStore::meta_get_marker(void* handle)
+{
+  return ctl()->meta.mgr->get_marker(handle);
+}
+
+int RadosStore::meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y)
+{
+  return ctl()->meta.mgr->remove(metadata_key, y, dpp);
+}
+
+void RadosStore::finalize(void)
+{
+  if (rados)
+    rados->finalize();
+}
+
// Register the /admin REST resources served by this driver with `mgr`.
// Each register_resource() call transfers ownership of the new manager.
void RadosStore::register_admin_apis(RGWRESTMgr* mgr)
{
  mgr->register_resource("user", new RGWRESTMgr_User);
  mgr->register_resource("bucket", new RGWRESTMgr_Bucket);
  /*Registering resource for /admin/metadata */
  mgr->register_resource("metadata", new RGWRESTMgr_Metadata);
  mgr->register_resource("log", new RGWRESTMgr_Log);
  /* XXX These may become global when cbodley is done with his zone work */
  mgr->register_resource("config", new RGWRESTMgr_Config);
  mgr->register_resource("realm", new RGWRESTMgr_Realm);
  mgr->register_resource("ratelimit", new RGWRESTMgr_Ratelimit);
}
+
+std::unique_ptr<LuaManager> RadosStore::get_lua_manager()
+{
+  return std::make_unique<RadosLuaManager>(this);
+}
+
+std::unique_ptr<RGWRole> RadosStore::get_role(std::string name,
+                                             std::string tenant,
+                                             std::string path,
+                                             std::string trust_policy,
+                                             std::string max_session_duration_str,
+                std::multimap<std::string,std::string> tags)
+{
+  return std::make_unique<RadosRole>(this, name, tenant, path, trust_policy, max_session_duration_str, tags);
+}
+
+std::unique_ptr<RGWRole> RadosStore::get_role(std::string id)
+{
+  return std::make_unique<RadosRole>(this, id);
+}
+
+std::unique_ptr<RGWRole> RadosStore::get_role(const RGWRoleInfo& info)
+{
+  return std::make_unique<RadosRole>(this, info);
+}
+
// Enumerate roles in `tenant` whose path matches `path_prefix` (all
// roles when the prefix is empty), loading each one's info.
// NOTE(review): the substr() below strips only role_path_oid_prefix
// from each oid even though the listing prefix also contains the
// tenant — confirm how tenant-qualified oids are laid out.
int RadosStore::get_roles(const DoutPrefixProvider *dpp,
			  optional_yield y,
			  const std::string& path_prefix,
			  const std::string& tenant,
			  vector<std::unique_ptr<RGWRole>>& roles)
{
  auto pool = svc()->zone->get_zone_params().roles_pool;
  std::string prefix;

  // List all roles if path prefix is empty
  if (! path_prefix.empty()) {
    prefix = tenant + RGWRole::role_path_oid_prefix + path_prefix;
  } else {
    prefix = tenant + RGWRole::role_path_oid_prefix;
  }

  //Get the filtered objects
  list<std::string> result;
  bool is_truncated;
  RGWListRawObjsCtx ctx;
  do {
    list<std::string> oids;
    int r = rados->list_raw_objects(dpp, pool, prefix, 1000, ctx, oids, &is_truncated);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: listing filtered objects failed: "
                  << prefix << ": " << cpp_strerror(-r) << dendl;
      return r;
    }
    for (const auto& iter : oids) {
      result.push_back(iter.substr(RGWRole::role_path_oid_prefix.size()));
    }
  } while (is_truncated);

  for (const auto& it : result) {
    //Find the role oid prefix from the end
    size_t pos = it.rfind(RGWRole::role_oid_prefix);
    if (pos == std::string::npos) {
        continue;
    }
    // Split the result into path and info_oid + id
    std::string path = it.substr(0, pos);

    /*Make sure that prefix is part of path (False results could've been returned)
      because of the role info oid + id appended to the path)*/
    if(path_prefix.empty() || path.find(path_prefix) != std::string::npos) {
      //Get id from info oid prefix + id
      std::string id = it.substr(pos + RGWRole::role_oid_prefix.length());

      std::unique_ptr<rgw::sal::RGWRole> role = get_role(id);
      int ret = role->read_info(dpp, y);
      if (ret < 0) {
        return ret;
      }
      roles.push_back(std::move(role));
    }
  }

  return 0;
}
+
+// Create an empty RADOS-backed OIDC provider handle for this store.
+std::unique_ptr<RGWOIDCProvider> RadosStore::get_oidc_provider()
+{
+  return std::make_unique<RadosOIDCProvider>(this);
+}
+
+int RadosStore::get_oidc_providers(const DoutPrefixProvider *dpp,
+                                  const std::string& tenant,
+                                  vector<std::unique_ptr<RGWOIDCProvider>>& providers)
+{
+  std::string prefix = tenant + RGWOIDCProvider::oidc_url_oid_prefix;
+  auto pool = svc()->zone->get_zone_params().oidc_pool;
+
+  //Get the filtered objects
+  list<std::string> result;
+  bool is_truncated;
+  RGWListRawObjsCtx ctx;
+  do {
+    list<std::string> oids;
+    int r = rados->list_raw_objects(dpp, pool, prefix, 1000, ctx, oids, &is_truncated);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: listing filtered objects failed: OIDC pool: "
+                  << pool.name << ": " << prefix << ": " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    for (const auto& iter : oids) {
+      std::unique_ptr<rgw::sal::RGWOIDCProvider> provider = get_oidc_provider();
+      bufferlist bl;
+
+      r = rgw_get_system_obj(svc()->sysobj, pool, iter, bl, nullptr, nullptr, null_yield, dpp);
+      if (r < 0) {
+        return r;
+      }
+
+      try {
+        using ceph::decode;
+        auto iter = bl.cbegin();
+        decode(*provider, iter);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to decode oidc provider info from pool: "
+         << pool.name << ": " << iter << dendl;
+        return -EIO;
+      }
+
+      providers.push_back(std::move(provider));
+    }
+  } while (is_truncated);
+
+  return 0;
+}
+
+// Create a writer for append uploads, resuming at byte 'position'.  A write
+// throttle sized by rgw_put_obj_min_window_size is handed to the writer.
+std::unique_ptr<Writer> RadosStore::get_append_writer(const DoutPrefixProvider *dpp,
+                                 optional_yield y,
+                                 std::unique_ptr<rgw::sal::Object> _head_obj,
+                                 const rgw_user& owner,
+                                 const rgw_placement_rule *ptail_placement_rule,
+                                 const std::string& unique_tag,
+                                 uint64_t position,
+                                 uint64_t *cur_accounted_size)
+{
+  auto aio = rgw::make_throttle(ctx()->_conf->rgw_put_obj_min_window_size, y);
+  return std::make_unique<RadosAppendWriter>(dpp, y,
+                                std::move(_head_obj),
+                                this, std::move(aio), owner,
+                                ptail_placement_rule,
+                                unique_tag, position,
+                                cur_accounted_size);
+}
+
+// Create a writer for atomic (whole-object) uploads.  Mirrors
+// get_append_writer(), but parameterized by olh_epoch instead of a resume
+// position.
+std::unique_ptr<Writer> RadosStore::get_atomic_writer(const DoutPrefixProvider *dpp,
+                                 optional_yield y,
+                                 std::unique_ptr<rgw::sal::Object> _head_obj,
+                                 const rgw_user& owner,
+                                 const rgw_placement_rule *ptail_placement_rule,
+                                 uint64_t olh_epoch,
+                                 const std::string& unique_tag)
+{
+  auto aio = rgw::make_throttle(ctx()->_conf->rgw_put_obj_min_window_size, y);
+  return std::make_unique<RadosAtomicWriter>(dpp, y,
+                                std::move(_head_obj),
+                                this, std::move(aio), owner,
+                                ptail_placement_rule,
+                                olh_epoch, unique_tag);
+}
+
+// Look up the compression type configured for 'rule' in the zone params.
+const std::string& RadosStore::get_compression_type(const rgw_placement_rule& rule)
+{
+  return svc()->zone->get_zone_params().get_compression_type(rule);
+}
+
+// Check whether 'rule' names a placement known to the zone params.
+bool RadosStore::valid_placement(const rgw_placement_rule& rule)
+{
+  return svc()->zone->get_zone_params().valid_placement(rule);
+}
+
+// Open an IoCtx on the pool holding the head of 'obj'; thin RGWRados wrapper.
+int RadosStore::get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx* ioctx)
+{
+  return rados->get_obj_head_ioctx(dpp, bucket_info, obj, ioctx);
+}
+
+RadosObject::~RadosObject()
+{
+  // Only free the object context when this object owns it; a borrowed
+  // context belongs to whoever handed it to us.
+  if (rados_ctx_owned) {
+    delete rados_ctx;
+  }
+}
+
+// Fetch the object's state from RGWRados and cache a copy in 'state',
+// while preserving the locally-set obj key, atomic, and prefetch flags.
+// '*pstate' points at RGWRados-owned state on success.
+int RadosObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **pstate, optional_yield y, bool follow_olh)
+{
+  int ret = store->getRados()->get_obj_state(dpp, rados_ctx, bucket->get_info(), this, pstate, &manifest, follow_olh, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  /* Don't overwrite obj, atomic, or prefetch */
+  rgw_obj obj = get_obj();
+  bool is_atomic = state.is_atomic;
+  bool prefetch_data = state.prefetch_data;
+
+  state = **pstate;
+
+  state.obj = obj;
+  state.is_atomic = is_atomic;
+  state.prefetch_data = prefetch_data;
+  return ret;
+}
+
+// Wire the given read op's output params to this object's cached attrs,
+// size and mtime, then prepare it (which performs the actual read).
+int RadosObject::read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Read &read_op, optional_yield y, rgw_obj* target_obj)
+{
+  read_op.params.attrs = &attrs;
+  read_op.params.target_obj = target_obj;
+  read_op.params.obj_size = &state.size;
+  read_op.params.lastmod = &state.mtime;
+
+  return read_op.prepare(y, dpp);
+}
+
+// Set and/or delete xattrs on the stored object.  Either map may be null:
+// a null 'setattrs' sets nothing (an empty map is passed instead), and a
+// null 'delattrs' deletes nothing.
+int RadosObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y)
+{
+  Attrs empty;
+  return store->getRados()->set_attrs(dpp, rados_ctx,
+                       bucket->get_info(),
+                       this,
+                       setattrs ? *setattrs : empty,
+                       delattrs ? delattrs : nullptr,
+                       y);
+}
+
+// Read the object's attrs (and size/mtime) into the local cache via a
+// one-shot RGWRados read op.
+int RadosObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj)
+{
+  RGWRados::Object op_target(store->getRados(), bucket, *rados_ctx, this);
+  RGWRados::Object::Read read_op(&op_target);
+
+  return read_attrs(dpp, read_op, y, target_obj);
+}
+
+// Read-modify-write a single xattr: fetch current attrs (which may redirect
+// 'target' to the actual object, e.g. an OLH target), set the new value,
+// and write all attrs back.  state.obj is temporarily pointed at 'target'
+// for the write and restored afterwards.
+int RadosObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp)
+{
+  rgw_obj target = get_obj();
+  rgw_obj save = get_obj();
+  int r = get_obj_attrs(y, dpp, &target);
+  if (r < 0) {
+    return r;
+  }
+
+  /* Temporarily set target */
+  state.obj = target;
+  set_atomic();
+  attrs[attr_name] = attr_val;
+  r = set_obj_attrs(dpp, &attrs, nullptr, y);
+  /* Restore target */
+  state.obj = save;
+
+  return r;
+}
+
+// Remove one xattr from the object.  Implemented as a set_obj_attrs() call
+// with only the delete map populated (the value is irrelevant).
+int RadosObject::delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y)
+{
+  set_atomic();
+
+  Attrs to_remove;
+  to_remove[attr_name] = bufferlist();
+  return set_obj_attrs(dpp, nullptr, &to_remove, y);
+}
+
+bool RadosObject::is_expired() {
+  auto iter = attrs.find(RGW_ATTR_DELETE_AT);
+  if (iter != attrs.end()) {
+    utime_t delete_at;
+    try {
+      auto bufit = iter->second.cbegin();
+      decode(delete_at, bufit);
+    } catch (buffer::error& err) {
+      ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode " RGW_ATTR_DELETE_AT " attr" << dendl;
+      return false;
+    }
+
+    if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Assign a random version (instance) id to this object's key.
+void RadosObject::gen_rand_obj_instance_name()
+{
+  store->getRados()->gen_rand_obj_instance_name(&state.obj.key);
+}
+
+// Re-point this object's key at the logical object corresponding to the
+// given raw RADOS object (within this object's bucket).
+void RadosObject::raw_obj_to_obj(const rgw_raw_obj& raw_obj)
+{
+  rgw_obj tobj = get_obj();
+  RGWSI_Tier_RADOS::raw_obj_to_obj(get_bucket()->get_key(), raw_obj, &tobj);
+  set_key(tobj.key);
+}
+
+// Resolve this object to its raw RADOS object under the bucket's placement.
+void RadosObject::get_raw_obj(rgw_raw_obj* raw_obj)
+{
+  store->getRados()->obj_to_raw((bucket->get_info()).placement_rule, get_obj(), raw_obj);
+}
+
+// Read up to 'count' omap key/value pairs after 'marker' from this
+// object's backing raw object.  '*pmore' (if non-null) reports truncation.
+int RadosObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+                                 std::map<std::string, bufferlist> *m,
+                                 bool* pmore, optional_yield y)
+{
+  rgw_raw_obj raw_obj;
+  get_raw_obj(&raw_obj);
+  auto sysobj = store->svc()->sysobj->get_obj(raw_obj);
+
+  return sysobj.omap().get_vals(dpp, marker, count, m, pmore, y);
+}
+
+// Read the entire omap of this object's backing raw object into 'm'.
+int RadosObject::omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
+                                optional_yield y)
+{
+  rgw_raw_obj raw_obj;
+  get_raw_obj(&raw_obj);
+  auto sysobj = store->svc()->sysobj->get_obj(raw_obj);
+
+  return sysobj.omap().get_all(dpp, m, y);
+}
+
+// Fetch specific omap keys from the raw object named 'oid' via a direct
+// librados call on the head pool's IoCtx.
+// NOTE(review): 'head_obj' is computed but only 'oid' is used for the omap
+// read — presumably the caller passes the matching raw oid; confirm.
+int RadosObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+                                         const std::set<std::string>& keys,
+                                         Attrs* vals)
+{
+  int ret;
+  rgw_raw_obj head_obj;
+  librados::IoCtx cur_ioctx;
+  rgw_obj obj = get_obj();
+
+  store->getRados()->obj_to_raw(bucket->get_placement_rule(), obj, &head_obj);
+  ret = store->get_obj_head_ioctx(dpp, bucket->get_info(), obj, &cur_ioctx);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return cur_ioctx.omap_get_vals_by_keys(oid, keys, vals);
+}
+
+// Write one omap key/value on this object's backing raw object.  When
+// 'must_exist' is set, the write fails if the raw object does not exist.
+int RadosObject::omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
+                                       bool must_exist, optional_yield y)
+{
+  rgw_raw_obj raw_meta_obj;
+  rgw_obj obj = get_obj();
+
+  store->getRados()->obj_to_raw(bucket->get_placement_rule(), obj, &raw_meta_obj);
+
+  auto sysobj = store->svc()->sysobj->get_obj(raw_meta_obj);
+
+  return sysobj.omap().set_must_exist(must_exist).set(dpp, key, val, y);
+}
+
+// Create a RADOS-lock-based multipart serializer for this object.
+std::unique_ptr<MPSerializer> RadosObject::get_serializer(const DoutPrefixProvider *dpp, const std::string& lock_name)
+{
+  return std::make_unique<MPRadosSerializer>(dpp, store, this, lock_name);
+}
+
+// Transition this object to another placement/storage class in-cluster;
+// thin wrapper over RGWRados::transition_obj().
+int RadosObject::transition(Bucket* bucket,
+                           const rgw_placement_rule& placement_rule,
+                           const real_time& mtime,
+                           uint64_t olh_epoch,
+                           const DoutPrefixProvider* dpp,
+                           optional_yield y)
+{
+  return store->getRados()->transition_obj(*rados_ctx, bucket, *this, placement_rule, mtime, olh_epoch, dpp, y);
+}
+
+// Transition this object to a cloud (S3) tier endpoint described by 'tier'.
+// Transfers the data over a REST connection; if 'update_object' is set,
+// additionally rewrites the local head as a cloud-tier stub via
+// write_cloud_tier() — unless the object's mtime changed meanwhile, in
+// which case the update is abandoned with -ECANCELED.
+int RadosObject::transition_to_cloud(Bucket* bucket,
+                          rgw::sal::PlacementTier* tier,
+                          rgw_bucket_dir_entry& o,
+                          std::set<std::string>& cloud_targets,
+                          CephContext* cct,
+                          bool update_object,
+                          const DoutPrefixProvider* dpp,
+                          optional_yield y)
+{
+  /* init */
+  rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier);
+  string id = "cloudid";
+  string endpoint = rtier->get_rt().t.s3.endpoint;
+  RGWAccessKey key = rtier->get_rt().t.s3.key;
+  string region = rtier->get_rt().t.s3.region;
+  HostStyle host_style = rtier->get_rt().t.s3.host_style;
+  string bucket_name = rtier->get_rt().t.s3.target_path;
+  const rgw::sal::ZoneGroup& zonegroup = store->get_zone()->get_zonegroup();
+
+  // Default target bucket name when the tier config leaves it unset.
+  if (bucket_name.empty()) {
+    bucket_name = "rgwx-" + zonegroup.get_name() + "-" + tier->get_storage_class() +
+                    "-cloud-bucket";
+    boost::algorithm::to_lower(bucket_name);
+  }
+
+  /* Create RGW REST connection */
+  S3RESTConn conn(cct, id, { endpoint }, key, zonegroup.get_id(), region, host_style);
+
+  RGWLCCloudTierCtx tier_ctx(cct, dpp, o, store, bucket->get_info(),
+                            this, conn, bucket_name,
+                            rtier->get_rt().t.s3.target_storage_class);
+  tier_ctx.acl_mappings = rtier->get_rt().t.s3.acl_mappings;
+  tier_ctx.multipart_min_part_size = rtier->get_rt().t.s3.multipart_min_part_size;
+  tier_ctx.multipart_sync_threshold = rtier->get_rt().t.s3.multipart_sync_threshold;
+  tier_ctx.storage_class = tier->get_storage_class();
+
+  ldpp_dout(dpp, 0) << "Transitioning object(" << o.key << ") to the cloud endpoint(" << endpoint << ")" << dendl;
+
+  /* Transition object to cloud end point */
+  int ret = rgw_cloud_tier_transfer_object(tier_ctx, cloud_targets);
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to transfer object(" << o.key << ") to the cloud endpoint(" << endpoint << ") ret=" << ret << dendl;
+    return ret;
+  }
+
+  if (update_object) {
+    real_time read_mtime;
+
+    std::unique_ptr<rgw::sal::Object::ReadOp> read_op(get_read_op());
+    read_op->params.lastmod = &read_mtime;
+
+    ret = read_op->prepare(null_yield, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: Updating tier object(" << o.key << ") failed ret=" << ret << dendl;
+      return ret;
+    }
+
+    // Re-check mtime to detect a concurrent overwrite during the transfer.
+    if (read_mtime != tier_ctx.o.meta.mtime) {
+      /* raced */
+      ldpp_dout(dpp, 0) << "ERROR: Updating tier object(" << o.key << ") failed ret=" << -ECANCELED << dendl;
+      return -ECANCELED;
+    }
+
+    rgw_placement_rule target_placement;
+    target_placement.inherit_from(tier_ctx.bucket_info.placement_rule);
+    target_placement.storage_class = tier->get_storage_class();
+
+    ret = write_cloud_tier(dpp, null_yield, tier_ctx.o.versioned_epoch,
+                          tier, tier_ctx.is_multipart_upload,
+                          target_placement, tier_ctx.obj);
+
+  }
+
+  return ret;
+}
+
+int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 uint64_t olh_epoch,
+                                 PlacementTier* tier,
+                                 bool is_multipart_upload,
+                                 rgw_placement_rule& target_placement,
+                                 Object* head_obj)
+{
+  rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier);
+  map<string, bufferlist> attrs = get_attrs();
+  RGWRados::Object op_target(store->getRados(), bucket, *rados_ctx, this);
+  RGWRados::Object::Write obj_op(&op_target);
+
+  obj_op.meta.modify_tail = true;
+  obj_op.meta.flags = PUT_OBJ_CREATE;
+  obj_op.meta.category = RGWObjCategory::CloudTiered;
+  obj_op.meta.delete_at = real_time();
+  bufferlist blo;
+  obj_op.meta.data = &blo;
+  obj_op.meta.if_match = NULL;
+  obj_op.meta.if_nomatch = NULL;
+  obj_op.meta.user_data = NULL;
+  obj_op.meta.zones_trace = NULL;
+  obj_op.meta.delete_at = real_time();
+  obj_op.meta.olh_epoch = olh_epoch;
+
+  RGWObjManifest *pmanifest;
+  RGWObjManifest manifest;
+
+  pmanifest = &manifest;
+  RGWObjTier tier_config;
+  tier_config.name = tier->get_storage_class();
+  tier_config.tier_placement = rtier->get_rt();
+  tier_config.is_multipart_upload = is_multipart_upload;
+
+  pmanifest->set_tier_type("cloud-s3");
+  pmanifest->set_tier_config(tier_config);
+
+  /* check if its necessary */
+  pmanifest->set_head(target_placement, head_obj->get_obj(), 0);
+  pmanifest->set_tail_placement(target_placement, head_obj->get_obj().bucket);
+  pmanifest->set_obj_size(0);
+  obj_op.meta.manifest = pmanifest;
+
+  /* update storage class */
+  bufferlist bl;
+  bl.append(tier->get_storage_class());
+  attrs[RGW_ATTR_STORAGE_CLASS] = bl;
+
+  attrs.erase(RGW_ATTR_ID_TAG);
+  attrs.erase(RGW_ATTR_TAIL_TAG);
+
+  return obj_op.write_meta(dpp, 0, 0, attrs, y);
+}
+
+// Query the max write chunk size (and optional alignment) for this object
+// under 'placement_rule'; thin RGWRados wrapper.
+int RadosObject::get_max_chunk_size(const DoutPrefixProvider* dpp, rgw_placement_rule placement_rule, uint64_t* max_chunk_size, uint64_t* alignment)
+{
+  return store->getRados()->get_max_chunk_size(placement_rule, get_obj(), max_chunk_size, dpp, alignment);
+}
+
+// Compute the largest size <= 'size' that satisfies 'alignment'; thin
+// RGWRados wrapper.
+void RadosObject::get_max_aligned_size(uint64_t size, uint64_t alignment,
+                                    uint64_t* max_size)
+{
+  store->getRados()->get_max_aligned_size(size, alignment, max_size);
+}
+
+// Two placement rules "match" for this object when they are identical, or
+// when both resolve to the same data pool.  Failure to resolve either pool
+// counts as a mismatch.
+bool RadosObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2)
+{
+  if (r1 == r2) {
+    return true;
+  }
+
+  const rgw_obj o = get_obj();
+  rgw_pool pool1;
+  rgw_pool pool2;
+
+  if (!store->getRados()->get_obj_data_pool(r1, o, &pool1) ||
+      !store->getRados()->get_obj_data_pool(r2, o, &pool2)) {
+    return false;
+  }
+
+  return pool1 == pool2;
+}
+
+// Dump the object's physical layout as JSON: head raw obj, manifest, and
+// one "obj" record per manifest stripe with its raw location, offsets and
+// (clamped) stripe size.
+int RadosObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f)
+{
+  int ret;
+  RGWObjManifest *amanifest{nullptr};
+  rgw_raw_obj head_obj;
+
+  RGWRados::Object op_target(store->getRados(), get_bucket(), *rados_ctx, this);
+  RGWRados::Object::Read parent_op(&op_target);
+  uint64_t obj_size;
+
+  parent_op.params.obj_size = &obj_size;
+  parent_op.params.attrs = &get_attrs();
+
+  ret = parent_op.prepare(y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  head_obj = parent_op.state.head_obj;
+
+  ret = op_target.get_manifest(dpp, &amanifest, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ::encode_json("head", head_obj, f);
+  ::encode_json("manifest", *amanifest, f);
+  f->open_array_section("data_location");
+  for (auto miter = amanifest->obj_begin(dpp); miter != amanifest->obj_end(dpp); ++miter) {
+    f->open_object_section("obj");
+    rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(store);
+    uint64_t ofs = miter.get_ofs();
+    // Clamp the last stripe to the remaining object size.
+    uint64_t left = amanifest->get_obj_size() - ofs;
+    ::encode_json("ofs", miter.get_ofs(), f);
+    ::encode_json("loc", raw_loc, f);
+    ::encode_json("loc_ofs", miter.location_ofs(), f);
+    uint64_t loc_size = miter.get_stripe_size();
+    if (loc_size > left) {
+      loc_size = left;
+    }
+    ::encode_json("loc_size", loc_size, f);
+    f->close_section();
+  }
+  f->close_section();
+
+  return 0;
+}
+
+// Create a read op bound to this object and its object context.
+std::unique_ptr<Object::ReadOp> RadosObject::get_read_op()
+{
+  return std::make_unique<RadosObject::RadosReadOp>(this, rados_ctx);
+}
+
+// Bind the SAL read op to an RGWRados read op over the source object.
+RadosObject::RadosReadOp::RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx) :
+       source(_source),
+       rctx(_rctx),
+       op_target(_source->store->getRados(),
+                 _source->get_bucket(),
+                 *static_cast<RGWObjectCtx *>(rctx),
+                 _source),
+       parent_op(&op_target)
+{ }
+
+// Copy the SAL-level read conditions/params into the RGWRados read op and
+// prepare it.  On success, the source object's key and size are refreshed
+// from the prepared state (e.g. after OLH resolution).
+int RadosObject::RadosReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp)
+{
+  uint64_t obj_size;
+
+  parent_op.conds.mod_ptr = params.mod_ptr;
+  parent_op.conds.unmod_ptr = params.unmod_ptr;
+  parent_op.conds.high_precision_time = params.high_precision_time;
+  parent_op.conds.mod_zone_id = params.mod_zone_id;
+  parent_op.conds.mod_pg_ver = params.mod_pg_ver;
+  parent_op.conds.if_match = params.if_match;
+  parent_op.conds.if_nomatch = params.if_nomatch;
+  parent_op.params.lastmod = params.lastmod;
+  parent_op.params.target_obj = params.target_obj;
+  parent_op.params.obj_size = &obj_size;
+  parent_op.params.attrs = &source->get_attrs();
+
+  int ret = parent_op.prepare(y, dpp);
+  if (ret < 0)
+    return ret;
+
+  source->set_key(parent_op.state.obj.key);
+  source->set_obj_size(obj_size);
+
+  return ret;
+}
+
+// Read bytes [ofs, end] into 'bl'; delegates to the prepared RGWRados op.
+int RadosObject::RadosReadOp::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider* dpp)
+{
+  return parent_op.read(ofs, end, bl, y, dpp);
+}
+
+// Fetch one xattr from the prepared read op.
+int RadosObject::RadosReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y)
+{
+  return parent_op.get_attr(dpp, name, dest, y);
+}
+
+// Create a delete op bound to this object.
+std::unique_ptr<Object::DeleteOp> RadosObject::get_delete_op()
+{
+  return std::make_unique<RadosObject::RadosDeleteOp>(this);
+}
+
+// Bind the SAL delete op to an RGWRados delete op over the source object.
+RadosObject::RadosDeleteOp::RadosDeleteOp(RadosObject *_source) :
+       source(_source),
+       op_target(_source->store->getRados(),
+                 _source->get_bucket(),
+                 _source->get_ctx(),
+                 _source),
+       parent_op(&op_target)
+{ }
+
+// Copy the SAL-level delete params into the RGWRados delete op, execute it,
+// and propagate the delete-marker / version-id results back to the caller.
+int RadosObject::RadosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional_yield y)
+{
+  parent_op.params.bucket_owner = params.bucket_owner.get_id();
+  parent_op.params.versioning_status = params.versioning_status;
+  parent_op.params.obj_owner = params.obj_owner;
+  parent_op.params.olh_epoch = params.olh_epoch;
+  parent_op.params.marker_version_id = params.marker_version_id;
+  parent_op.params.bilog_flags = params.bilog_flags;
+  parent_op.params.remove_objs = params.remove_objs;
+  parent_op.params.expiration_time = params.expiration_time;
+  parent_op.params.unmod_since = params.unmod_since;
+  parent_op.params.mtime = params.mtime;
+  parent_op.params.high_precision_time = params.high_precision_time;
+  parent_op.params.zones_trace = params.zones_trace;
+  parent_op.params.abortmp = params.abortmp;
+  parent_op.params.parts_accounted_size = params.parts_accounted_size;
+
+  int ret = parent_op.delete_obj(y, dpp);
+  if (ret < 0)
+    return ret;
+
+  result.delete_marker = parent_op.result.delete_marker;
+  result.version_id = parent_op.result.version_id;
+
+  return ret;
+}
+
+// Delete this object directly (no SAL delete op).  When
+// 'prevent_versioning' is set, the bucket's versioning status is ignored
+// so the object is removed rather than getting a delete marker.
+int RadosObject::delete_object(const DoutPrefixProvider* dpp,
+                              optional_yield y,
+                              bool prevent_versioning)
+{
+  RGWRados::Object del_target(store->getRados(), bucket, *rados_ctx, this);
+  RGWRados::Object::Delete del_op(&del_target);
+
+  del_op.params.bucket_owner = bucket->get_info().owner;
+  del_op.params.versioning_status = prevent_versioning ? 0 : bucket->get_info().versioning_status();
+
+  return del_op.delete_obj(y, dpp);
+}
+
+// Asynchronously delete this object, collecting completions into 'aio'
+// (which must be a RadosCompletions instance).
+int RadosObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate,
+                                  Completions* aio, bool keep_index_consistent,
+                                  optional_yield y)
+{
+  RadosCompletions* raio = static_cast<RadosCompletions*>(aio);
+
+  return store->getRados()->delete_obj_aio(dpp, get_obj(), bucket->get_info(), astate,
+                                          raio->handles, keep_index_consistent, y);
+}
+
+// Server-side copy of this object to 'dest_object'.  All conditional,
+// attribute, versioning and progress parameters are forwarded verbatim to
+// RGWRados::copy_obj(); an unset 'delete_at' becomes a zero real_time.
+int RadosObject::copy_object(User* user,
+                               req_info* info,
+                               const rgw_zone_id& source_zone,
+                               rgw::sal::Object* dest_object,
+                               rgw::sal::Bucket* dest_bucket,
+                               rgw::sal::Bucket* src_bucket,
+                               const rgw_placement_rule& dest_placement,
+                               ceph::real_time* src_mtime,
+                               ceph::real_time* mtime,
+                               const ceph::real_time* mod_ptr,
+                               const ceph::real_time* unmod_ptr,
+                               bool high_precision_time,
+                               const char* if_match,
+                               const char* if_nomatch,
+                               AttrsMod attrs_mod,
+                               bool copy_if_newer,
+                               Attrs& attrs,
+                               RGWObjCategory category,
+                               uint64_t olh_epoch,
+                               boost::optional<ceph::real_time> delete_at,
+                               std::string* version_id,
+                               std::string* tag,
+                               std::string* etag,
+                               void (*progress_cb)(off_t, void *),
+                               void* progress_data,
+                               const DoutPrefixProvider* dpp,
+                               optional_yield y)
+{
+  return store->getRados()->copy_obj(*rados_ctx,
+                                    user->get_id(),
+                                    info,
+                                    source_zone,
+                                    dest_object,
+                                    this,
+                                    dest_bucket,
+                                    src_bucket,
+                                    dest_placement,
+                                    src_mtime,
+                                    mtime,
+                                    mod_ptr,
+                                    unmod_ptr,
+                                    high_precision_time,
+                                    if_match,
+                                    if_nomatch,
+                                    static_cast<RGWRados::AttrsMod>(attrs_mod),
+                                    copy_if_newer,
+                                    attrs,
+                                    category,
+                                    olh_epoch,
+                                    (delete_at ? *delete_at : real_time()),
+                                    version_id,
+                                    tag,
+                                    etag,
+                                    progress_cb,
+                                    progress_data,
+                                    dpp,
+                                    y);
+}
+
+// Stream bytes [ofs, end] through 'cb'; delegates to the RGWRados read op.
+int RadosObject::RadosReadOp::iterate(const DoutPrefixProvider* dpp, int64_t ofs, int64_t end, RGWGetDataCB* cb, optional_yield y)
+{
+  return parent_op.iterate(dpp, ofs, end, cb, y);
+}
+
+// Restore this object from the Swift versioning archive; 'restored' is set
+// when a version was actually brought back.
+int RadosObject::swift_versioning_restore(bool& restored,
+                                         const DoutPrefixProvider* dpp)
+{
+  return store->getRados()->swift_versioning_restore(*rados_ctx,
+                                                    bucket->get_owner()->get_id(),
+                                                    bucket,
+                                                    this,
+                                                    restored,
+                                                    dpp);
+}
+
+// Copy the current version of this object into the Swift versioning
+// archive before it is overwritten/deleted.
+int RadosObject::swift_versioning_copy(const DoutPrefixProvider* dpp, optional_yield y)
+{
+  return store->getRados()->swift_versioning_copy(*rados_ctx,
+                                        bucket->get_info().owner,
+                                        bucket,
+                                        this,
+                                        dpp,
+                                        y);
+}
+
+// Abort a multipart upload: delete every uploaded part (directly for parts
+// without a manifest, via the GC chain otherwise), record index entries to
+// remove, then delete the upload's meta object.  -ENOENT anywhere maps to
+// -ERR_NO_SUCH_UPLOAD.
+int RadosMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct)
+{
+  std::unique_ptr<rgw::sal::Object> meta_obj = get_meta_obj();
+  meta_obj->set_in_extra_data(true);
+  meta_obj->set_hash_source(mp_obj.get_key());
+  cls_rgw_obj_chain chain;
+  list<rgw_obj_index_key> remove_objs;
+  bool truncated;
+  int marker = 0;
+  int ret;
+  uint64_t parts_accounted_size = 0;
+
+  // Walk the part list in pages of 1000.
+  do {
+    ret = list_parts(dpp, cct, 1000, marker, &marker, &truncated);
+    if (ret < 0) {
+      ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " <<
+       ret << dendl;
+      return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
+    }
+
+    for (auto part_it = parts.begin();
+        part_it != parts.end();
+        ++part_it) {
+      RadosMultipartPart* obj_part = dynamic_cast<RadosMultipartPart*>(part_it->second.get());
+      if (obj_part->info.manifest.empty()) {
+       // No manifest: the part is a single object; delete it directly.
+       std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(
+                                   rgw_obj_key(obj_part->oid, std::string(), RGW_OBJ_NS_MULTIPART));
+       obj->set_hash_source(mp_obj.get_key());
+       ret = obj->delete_object(dpp, null_yield);
+        if (ret < 0 && ret != -ENOENT)
+          return ret;
+      } else {
+       // Manifest present: queue the part's rados objects on the GC chain
+       // and mark its head object's index entry for removal.
+       auto target = meta_obj->get_obj();
+       store->getRados()->update_gc_chain(dpp, target, obj_part->info.manifest, &chain);
+        RGWObjManifest::obj_iterator oiter = obj_part->info.manifest.obj_begin(dpp);
+        if (oiter != obj_part->info.manifest.obj_end(dpp)) {
+         std::unique_ptr<rgw::sal::Object> head = bucket->get_object(rgw_obj_key());
+          rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store);
+         dynamic_cast<rgw::sal::RadosObject*>(head.get())->raw_obj_to_obj(raw_head);
+
+          rgw_obj_index_key key;
+          head->get_key().get_index_key(&key);
+          remove_objs.push_back(key);
+        }
+      }
+      parts_accounted_size += obj_part->info.accounted_size;
+    }
+  } while (truncated);
+
+  if (store->getRados()->get_gc() == nullptr) {
+    //Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified)
+    store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id());
+  } else {
+    /* use upload id as tag and do it synchronously */
+    auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id());
+    if (ret < 0 && leftover_chain) {
+      ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl;
+      if (ret == -ENOENT) {
+        return -ERR_NO_SUCH_UPLOAD;
+      }
+      //Delete objects inline if send chain to gc fails
+      store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id());
+    }
+  }
+
+  std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = meta_obj->get_delete_op();
+  del_op->params.bucket_owner = bucket->get_acl_owner();
+  del_op->params.versioning_status = 0;
+  if (!remove_objs.empty()) {
+    del_op->params.remove_objs = &remove_objs;
+  }
+  
+  del_op->params.abortmp = true;
+  del_op->params.parts_accounted_size = parts_accounted_size;
+
+  // and also remove the metadata obj
+  ret = del_op->delete_obj(dpp, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " <<
+      ret << dendl;
+  }
+  return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
+}
+
+// Handle for this upload's meta object (lives in the multipart namespace).
+std::unique_ptr<rgw::sal::Object> RadosMultipartUpload::get_meta_obj()
+{
+  return bucket->get_object(rgw_obj_key(get_meta(), string(), mp_ns));
+}
+
+// Initialize a new multipart upload: generate a random v2 upload id, and
+// exclusively create the upload's meta object carrying the destination
+// placement.  Retries with a fresh id on -EEXIST collisions.
+int RadosMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs)
+{
+  int ret;
+  std::string oid = mp_obj.get_key();
+  RGWObjectCtx obj_ctx(store);
+
+  do {
+    char buf[33];
+    string tmp_obj_name;
+    std::unique_ptr<rgw::sal::Object> obj;
+    gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+    std::string upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */
+    upload_id.append(buf);
+
+    mp_obj.init(oid, upload_id);
+    tmp_obj_name = mp_obj.get_meta();
+
+    obj = bucket->get_object(rgw_obj_key(tmp_obj_name, string(), mp_ns));
+    // the meta object will be indexed with 0 size, we c
+    obj->set_in_extra_data(true);
+    obj->set_hash_source(oid);
+
+    RGWRados::Object op_target(store->getRados(),
+                              obj->get_bucket(),
+                              obj_ctx, obj.get());
+    RGWRados::Object::Write obj_op(&op_target);
+
+    op_target.set_versioning_disabled(true); /* no versioning for multipart meta */
+    obj_op.meta.owner = owner.get_id();
+    obj_op.meta.category = RGWObjCategory::MultiMeta;
+    // Exclusive create so an id collision surfaces as -EEXIST and we retry.
+    obj_op.meta.flags = PUT_OBJ_CREATE_EXCL;
+    obj_op.meta.mtime = &mtime;
+
+    multipart_upload_info upload_info;
+    upload_info.dest_placement = dest_placement;
+
+    bufferlist bl;
+    encode(upload_info, bl);
+    obj_op.meta.data = &bl;
+
+    ret = obj_op.write_meta(dpp, bl.length(), 0, attrs, y);
+  } while (ret == -EEXIST);
+
+  return ret;
+}
+
+int RadosMultipartUpload::list_parts(const DoutPrefixProvider *dpp, CephContext *cct,
+                                    int num_parts, int marker,
+                                    int *next_marker, bool *truncated,
+                                    bool assume_unsorted)
+{
+  map<string, bufferlist> parts_map;
+  map<string, bufferlist>::iterator iter;
+
+  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(
+                     rgw_obj_key(get_meta(), std::string(), RGW_OBJ_NS_MULTIPART));
+  obj->set_in_extra_data(true);
+
+  bool sorted_omap = is_v2_upload_id(get_upload_id()) && !assume_unsorted;
+
+  parts.clear();
+
+  int ret;
+  if (sorted_omap) {
+    string p;
+    p = "part.";
+    char buf[32];
+
+    snprintf(buf, sizeof(buf), "%08d", marker);
+    p.append(buf);
+
+    ret = obj->omap_get_vals(dpp, p, num_parts + 1, &parts_map,
+                                 nullptr, null_yield);
+  } else {
+    ret = obj->omap_get_all(dpp, &parts_map, null_yield);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+
+  int i;
+  int last_num = 0;
+
+  uint32_t expected_next = marker + 1;
+
+  for (i = 0, iter = parts_map.begin();
+       (i < num_parts || !sorted_omap) && iter != parts_map.end();
+       ++iter, ++i) {
+    bufferlist& bl = iter->second;
+    auto bli = bl.cbegin();
+    std::unique_ptr<RadosMultipartPart> part = std::make_unique<RadosMultipartPart>();
+    try {
+      decode(part->info, bli);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: could not part info, caught buffer::error" <<
+       dendl;
+      return -EIO;
+    }
+    if (sorted_omap) {
+      if (part->info.num != expected_next) {
+        /* ouch, we expected a specific part num here, but we got a
+         * different one. Either a part is missing, or it could be a
+         * case of mixed rgw versions working on the same upload,
+         * where one gateway doesn't support correctly sorted omap
+         * keys for multipart upload just assume data is unsorted.
+         */
+        return list_parts(dpp, cct, num_parts, marker, next_marker, truncated, true);
+      }
+      expected_next++;
+    }
+    if (sorted_omap ||
+      (int)part->info.num > marker) {
+      last_num = part->info.num;
+      parts[part->info.num] = std::move(part);
+    }
+  }
+
+  if (sorted_omap) {
+    if (truncated) {
+      *truncated = (iter != parts_map.end());
+    }
+  } else {
+    /* rebuild a map with only num_parts entries */
+    std::map<uint32_t, std::unique_ptr<MultipartPart>> new_parts;
+    std::map<uint32_t, std::unique_ptr<MultipartPart>>::iterator piter;
+    for (i = 0, piter = parts.begin();
+        i < num_parts && piter != parts.end();
+        ++i, ++piter) {
+      last_num = piter->first;
+      new_parts[piter->first] = std::move(piter->second);
+    }
+
+    if (truncated) {
+      *truncated = (piter != parts.end());
+    }
+
+    parts.swap(new_parts);
+  }
+
+  if (next_marker) {
+    *next_marker = last_num;
+  }
+
+  return 0;
+}
+
+// Complete a multipart upload: verify the client-supplied part list
+// against the stored parts, stitch the per-part manifests into a single
+// manifest, aggregate compression metadata, compute the S3-style
+// "ETag-of-ETags", and write the final head object.  Part numbers, sizes
+// and etags must all match or the matching ERR_* code is returned.
+int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp,
+                                  optional_yield y, CephContext* cct,
+                                  map<int, string>& part_etags,
+                                  list<rgw_obj_index_key>& remove_objs,
+                                  uint64_t& accounted_size, bool& compressed,
+                                  RGWCompressionInfo& cs_info, off_t& ofs,
+                                  std::string& tag, ACLOwner& owner,
+                                  uint64_t olh_epoch,
+                                  rgw::sal::Object* target_obj)
+{
+  char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+  std::string etag;
+  bufferlist etag_bl;
+  MD5 hash;
+  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  bool truncated;
+  int ret;
+
+  int total_parts = 0;
+  int handled_parts = 0;
+  int max_parts = 1000;
+  int marker = 0;
+  uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size;
+  auto etags_iter = part_etags.begin();
+  rgw::sal::Attrs attrs = target_obj->get_attrs();
+
+  // walk the stored parts in pages of max_parts, in step with the
+  // client-supplied etag list
+  do {
+    ret = list_parts(dpp, cct, max_parts, marker, &marker, &truncated);
+    if (ret == -ENOENT) {
+      ret = -ERR_NO_SUCH_UPLOAD;
+    }
+    if (ret < 0)
+      return ret;
+
+    total_parts += parts.size();
+    if (!truncated && total_parts != (int)part_etags.size()) {
+      ldpp_dout(dpp, 0) << "NOTICE: total parts mismatch: have: " << total_parts
+                      << " expected: " << part_etags.size() << dendl;
+      ret = -ERR_INVALID_PART;
+      return ret;
+    }
+
+    for (auto obj_iter = parts.begin(); etags_iter != part_etags.end() && obj_iter != parts.end(); ++etags_iter, ++obj_iter, ++handled_parts) {
+      RadosMultipartPart* part = dynamic_cast<rgw::sal::RadosMultipartPart*>(obj_iter->second.get());
+      // every part except the last must meet the configured minimum size
+      uint64_t part_size = part->get_size();
+      if (handled_parts < (int)part_etags.size() - 1 &&
+          part_size < min_part_size) {
+        ret = -ERR_TOO_SMALL;
+        return ret;
+      }
+
+      char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+      if (etags_iter->first != (int)obj_iter->first) {
+        ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: "
+                        << etags_iter->first << " next uploaded: "
+                        << obj_iter->first << dendl;
+        ret = -ERR_INVALID_PART;
+        return ret;
+      }
+      string part_etag = rgw_string_unquote(etags_iter->second);
+      if (part_etag.compare(part->get_etag()) != 0) {
+        ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: " << etags_iter->first
+                        << " etag: " << etags_iter->second << dendl;
+        ret = -ERR_INVALID_PART;
+        return ret;
+      }
+
+      // the final etag is the MD5 over the concatenated *binary* part
+      // etags, later suffixed with "-<part count>"
+      hex_to_buf(part->get_etag().c_str(), petag,
+               CEPH_CRYPTO_MD5_DIGESTSIZE);
+      hash.Update((const unsigned char *)petag, sizeof(petag));
+
+      RGWUploadPartInfo& obj_part = part->info;
+
+      /* update manifest for part */
+      string oid = mp_obj.get_part(part->info.num);
+      rgw_obj src_obj;
+      src_obj.init_ns(bucket->get_key(), oid, mp_ns);
+
+      if (obj_part.manifest.empty()) {
+        ldpp_dout(dpp, 0) << "ERROR: empty manifest for object part: obj="
+                        << src_obj << dendl;
+        ret = -ERR_INVALID_PART;
+        return ret;
+      } else {
+        manifest.append(dpp, obj_part.manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params());
+      }
+
+      // all parts must share one compression type; per-part block offsets
+      // are rebased onto the assembled object
+      bool part_compressed = (obj_part.cs_info.compression_type != "none");
+      if ((handled_parts > 0) &&
+          ((part_compressed != compressed) ||
+            (cs_info.compression_type != obj_part.cs_info.compression_type))) {
+          ldpp_dout(dpp, 0) << "ERROR: compression type was changed during multipart upload ("
+                           << cs_info.compression_type << ">>" << obj_part.cs_info.compression_type << ")" << dendl;
+          ret = -ERR_INVALID_PART;
+          return ret; 
+      }
+      
+      if (part_compressed) {
+        int64_t new_ofs; // offset in compression data for new part
+        if (cs_info.blocks.size() > 0)
+          new_ofs = cs_info.blocks.back().new_ofs + cs_info.blocks.back().len;
+        else
+          new_ofs = 0;
+        for (const auto& block : obj_part.cs_info.blocks) {
+          compression_block cb;
+          cb.old_ofs = block.old_ofs + cs_info.orig_size;
+          cb.new_ofs = new_ofs;
+          cb.len = block.len;
+          cs_info.blocks.push_back(cb);
+          new_ofs = cb.new_ofs + cb.len;
+        } 
+        if (!compressed)
+          cs_info.compression_type = obj_part.cs_info.compression_type;
+        cs_info.orig_size += obj_part.cs_info.orig_size;
+        compressed = true;
+      }
+
+      // schedule the part object for removal from the bucket index once
+      // the head object is written
+      rgw_obj_index_key remove_key;
+      src_obj.key.get_index_key(&remove_key);
+
+      remove_objs.push_back(remove_key);
+
+      ofs += obj_part.size;
+      accounted_size += obj_part.accounted_size;
+    }
+  } while (truncated);
+  hash.Final((unsigned char *)final_etag);
+
+  buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+  snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
+          sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+           "-%lld", (long long)part_etags.size());
+  etag = final_etag_str;
+  ldpp_dout(dpp, 10) << "calculated etag: " << etag << dendl;
+
+  etag_bl.append(etag);
+
+  attrs[RGW_ATTR_ETAG] = etag_bl;
+
+  if (compressed) {
+    // write compression attribute to full object
+    bufferlist tmp;
+    encode(cs_info, tmp);
+    attrs[RGW_ATTR_COMPRESSION] = tmp;
+  }
+
+  target_obj->set_atomic();
+
+  // write the head object carrying the stitched manifest and attrs
+  RGWRados::Object op_target(store->getRados(),
+                            target_obj->get_bucket(),
+                            dynamic_cast<RadosObject*>(target_obj)->get_ctx(),
+                            target_obj);
+  RGWRados::Object::Write obj_op(&op_target);
+
+  obj_op.meta.manifest = &manifest;
+  obj_op.meta.remove_objs = &remove_objs;
+
+  obj_op.meta.ptag = &tag; /* use req_id as operation tag */
+  obj_op.meta.owner = owner.get_id();
+  obj_op.meta.flags = PUT_OBJ_CREATE;
+  obj_op.meta.modify_tail = true;
+  obj_op.meta.completeMultipart = true;
+  obj_op.meta.olh_epoch = olh_epoch;
+
+  ret = obj_op.write_meta(dpp, ofs, accounted_size, attrs, y);
+  if (ret < 0)
+    return ret;
+
+  return ret;
+}
+
+// Return the upload's placement rule and/or the meta object's attributes.
+// The placement rule is cached in 'placement' after the first read; attrs
+// always come from the meta object's xattrs (filled by ReadOp::prepare).
+// Returns -ERR_NO_SUCH_UPLOAD when the meta object is gone.
+int RadosMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs)
+{
+  if (!rule && !attrs) {
+    return 0;
+  }
+
+  if (rule) {
+    if (!placement.empty()) {
+      *rule = &placement;
+      if (!attrs) {
+       /* Don't need attrs, done */
+       return 0;
+      }
+    } else {
+      // *rule == nullptr marks "placement still needs to be read below"
+      *rule = nullptr;
+    }
+  }
+
+  /* We need either attributes or placement, so we need a read */
+  std::unique_ptr<rgw::sal::Object> meta_obj;
+  meta_obj = get_meta_obj();
+  meta_obj->set_in_extra_data(true);
+
+  multipart_upload_info upload_info;
+  bufferlist headbl;
+
+  /* Read the obj head which contains the multipart_upload_info */
+  std::unique_ptr<rgw::sal::Object::ReadOp> read_op = meta_obj->get_read_op();
+  meta_obj->set_prefetch_data();
+
+  int ret = read_op->prepare(y, dpp);
+  if (ret < 0) {
+    if (ret == -ENOENT) {
+      return -ERR_NO_SUCH_UPLOAD;
+    }
+    return ret;
+  }
+
+  extract_span_context(meta_obj->get_attrs(), trace_ctx);
+
+  if (attrs) {
+    /* Attrs are filled in by prepare */
+    *attrs = meta_obj->get_attrs();
+    if (!rule || *rule != nullptr) {
+      /* placement was cached; don't actually read */
+      return 0;
+    }
+  }
+
+  /* Now read the placement from the head */
+  ret = read_op->read(0, store->ctx()->_conf->rgw_max_chunk_size, headbl, y, dpp);
+  if (ret < 0) {
+    if (ret == -ENOENT) {
+      return -ERR_NO_SUCH_UPLOAD;
+    }
+    return ret;
+  }
+
+  if (headbl.length() <= 0) {
+    return -ERR_NO_SUCH_UPLOAD;
+  }
+
+  /* Decode multipart_upload_info */
+  auto hiter = headbl.cbegin();
+  try {
+    decode(upload_info, hiter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode multipart upload info" << dendl;
+    return -EIO;
+  }
+  // cache the decoded placement for subsequent calls
+  placement = upload_info.dest_placement;
+  *rule = &placement;
+
+  return 0;
+}
+
+std::unique_ptr<Writer> RadosMultipartUpload::get_writer(
+                                 const DoutPrefixProvider *dpp,
+                                 optional_yield y,
+                                 std::unique_ptr<rgw::sal::Object> _head_obj,
+                                 const rgw_user& owner,
+                                 const rgw_placement_rule *ptail_placement_rule,
+                                 uint64_t part_num,
+                                 const std::string& part_num_str)
+{
+  auto aio = rgw::make_throttle(store->ctx()->_conf->rgw_put_obj_min_window_size, y);
+  return std::make_unique<RadosMultipartWriter>(dpp, y, this,
+                                std::move(_head_obj), store, std::move(aio), owner,
+                                ptail_placement_rule, part_num, part_num_str);
+}
+
+// Build a serializer that locks a multipart upload's raw object.  The
+// ioctx is opened on the data pool of the object's placement rule.
+MPRadosSerializer::MPRadosSerializer(const DoutPrefixProvider *dpp, RadosStore* store, RadosObject* obj, const std::string& lock_name) :
+  lock(lock_name)
+{
+  rgw_pool meta_pool;
+  rgw_raw_obj raw_obj;
+
+  obj->get_raw_obj(&raw_obj);
+  oid = raw_obj.oid;
+  store->getRados()->get_obj_data_pool(obj->get_bucket()->get_placement_rule(),
+                                      obj->get_obj(), &meta_pool);
+  // NOTE(review): the return value of open_pool_ctx() is ignored; on
+  // failure ioctx stays unopened -- confirm callers tolerate this.
+  store->getRados()->open_pool_ctx(dpp, meta_pool, ioctx, true);
+}
+
+// Attempt to take the exclusive lock; the object must already exist.
+// Sets 'locked' only on success and returns the rados result code.
+int MPRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y)
+{
+  op.assert_exists();
+  lock.set_duration(dur);
+  lock.lock_exclusive(&op);
+  const int ret = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (ret == 0) {
+    locked = true;
+  }
+  return ret;
+}
+
+// Serializer over a lifecycle shard object in the lc pool; the cookie
+// identifies this lock holder.
+LCRadosSerializer::LCRadosSerializer(RadosStore* store, const std::string& _oid, const std::string& lock_name, const std::string& cookie) :
+  StoreLCSerializer(_oid),
+  lock(lock_name)
+{
+  lock.set_cookie(cookie);
+  ioctx = &store->getRados()->lc_pool_ctx;
+}
+
+// Take an exclusive lock on the lifecycle shard object.  The call is
+// synchronous; 'dpp' and 'y' are accepted for interface parity but unused.
+int LCRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y)
+{
+  lock.set_duration(dur);
+  return lock.lock_exclusive(ioctx, oid);
+}
+
+int RadosLifecycle::get_entry(const std::string& oid, const std::string& marker,
+                             std::unique_ptr<LCEntry>* entry)
+{
+  cls_rgw_lc_entry cls_entry;
+  int ret = cls_rgw_lc_get_entry(*store->getRados()->get_lc_pool_ctx(), oid, marker, cls_entry);
+  if (ret)
+    return ret;
+
+  LCEntry* e;
+  e = new StoreLCEntry(cls_entry.bucket, cls_entry.start_time, cls_entry.status);
+  if (!e)
+    return -ENOMEM;
+
+  entry->reset(e);
+  return 0;
+}
+
+int RadosLifecycle::get_next_entry(const std::string& oid, const std::string& marker,
+                                  std::unique_ptr<LCEntry>* entry)
+{
+  cls_rgw_lc_entry cls_entry;
+  int ret = cls_rgw_lc_get_next_entry(*store->getRados()->get_lc_pool_ctx(), oid, marker,
+                                     cls_entry);
+
+  if (ret)
+    return ret;
+
+  LCEntry* e;
+  e = new StoreLCEntry(cls_entry.bucket, cls_entry.start_time, cls_entry.status);
+  if (!e)
+    return -ENOMEM;
+
+  entry->reset(e);
+  return 0;
+}
+
+// Translate the SAL entry into its cls representation and persist it on
+// shard object 'oid'.
+int RadosLifecycle::set_entry(const std::string& oid, LCEntry& entry)
+{
+  cls_rgw_lc_entry cls_entry;
+  cls_entry.bucket = entry.get_bucket();
+  cls_entry.status = entry.get_status();
+  cls_entry.start_time = entry.get_start_time();
+
+  return cls_rgw_lc_set_entry(*store->getRados()->get_lc_pool_ctx(), oid, cls_entry);
+}
+
+int RadosLifecycle::list_entries(const std::string& oid, const std::string& marker,
+                                uint32_t max_entries, std::vector<std::unique_ptr<LCEntry>>& entries)
+{
+  entries.clear();
+
+  vector<cls_rgw_lc_entry> cls_entries;
+  int ret = cls_rgw_lc_list(*store->getRados()->get_lc_pool_ctx(), oid, marker, max_entries, cls_entries);
+
+  if (ret < 0)
+    return ret;
+
+  for (auto& entry : cls_entries) {
+    entries.push_back(std::make_unique<StoreLCEntry>(entry.bucket, oid,
+                               entry.start_time, entry.status));
+  }
+
+  return ret;
+}
+
+// Remove the lifecycle entry matching 'entry' from shard object 'oid'.
+int RadosLifecycle::rm_entry(const std::string& oid, LCEntry& entry)
+{
+  cls_rgw_lc_entry cls_entry;
+  cls_entry.bucket = entry.get_bucket();
+  cls_entry.status = entry.get_status();
+  cls_entry.start_time = entry.get_start_time();
+
+  return cls_rgw_lc_rm_entry(*store->getRados()->get_lc_pool_ctx(), oid, cls_entry);
+}
+
+// Read the lifecycle head (processing cursor) for shard object 'oid'.
+// Fills *head on success; returns the cls error code otherwise.
+int RadosLifecycle::get_head(const std::string& oid, std::unique_ptr<LCHead>* head)
+{
+  cls_rgw_lc_obj_head cls_head;
+  int ret = cls_rgw_lc_get_head(*store->getRados()->get_lc_pool_ctx(), oid, cls_head);
+  if (ret)
+    return ret;
+
+  // operator new throws on failure, so the previous null check after
+  // 'new StoreLCHead' was dead code; make_unique expresses ownership.
+  *head = std::make_unique<StoreLCHead>(cls_head.start_date,
+                                        cls_head.shard_rollover_date,
+                                        cls_head.marker);
+  return 0;
+}
+
+// Persist the lifecycle head (processing cursor) on shard object 'oid'.
+int RadosLifecycle::put_head(const std::string& oid, LCHead& head)
+{
+  cls_rgw_lc_obj_head cls_head;
+  cls_head.start_date = head.get_start_date();
+  cls_head.shard_rollover_date = head.get_shard_rollover_date();
+  cls_head.marker = head.get_marker();
+
+  return cls_rgw_lc_put_head(*store->getRados()->get_lc_pool_ctx(), oid, cls_head);
+}
+
+std::unique_ptr<LCSerializer> RadosLifecycle::get_serializer(const std::string& lock_name,
+                                                            const std::string& oid,
+                                                            const std::string& cookie)
+{
+  return std::make_unique<LCRadosSerializer>(store, oid, lock_name, cookie);
+}
+
+// Reserve a notification slot for this event via the notify subsystem.
+int RadosNotification::publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags)
+{
+  const int r = rgw::notify::publish_reserve(dpp, event_type, res, obj_tags);
+  return r;
+}
+
+// Commit the previously reserved notification with the final object state.
+int RadosNotification::publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+                                    const ceph::real_time& mtime, const std::string& etag, const std::string& version)
+{
+  const int r = rgw::notify::publish_commit(obj, size, mtime, etag, version, event_type, res, dpp);
+  return r;
+}
+
+// Delegate setup to the underlying atomic object processor.
+int RadosAtomicWriter::prepare(optional_yield y)
+{
+  const int r = processor.prepare(y);
+  return r;
+}
+
+// Forward one data chunk to the processor, preserving move semantics.
+int RadosAtomicWriter::process(bufferlist&& data, uint64_t offset)
+{
+  const int r = processor.process(std::move(data), offset);
+  return r;
+}
+
+int RadosAtomicWriter::complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled,
+                       optional_yield y)
+{
+  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+                           if_match, if_nomatch, user_data, zones_trace, canceled, y);
+}
+
+// Delegate setup to the underlying append object processor.
+int RadosAppendWriter::prepare(optional_yield y)
+{
+  const int r = processor.prepare(y);
+  return r;
+}
+
+// Forward one data chunk to the processor, preserving move semantics.
+int RadosAppendWriter::process(bufferlist&& data, uint64_t offset)
+{
+  const int r = processor.process(std::move(data), offset);
+  return r;
+}
+
+int RadosAppendWriter::complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled,
+                       optional_yield y)
+{
+  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+                           if_match, if_nomatch, user_data, zones_trace, canceled, y);
+}
+
+// Delegate setup to the underlying multipart object processor.
+int RadosMultipartWriter::prepare(optional_yield y)
+{
+  const int r = processor.prepare(y);
+  return r;
+}
+
+// Forward one data chunk to the processor, preserving move semantics.
+int RadosMultipartWriter::process(bufferlist&& data, uint64_t offset)
+{
+  const int r = processor.process(std::move(data), offset);
+  return r;
+}
+
+int RadosMultipartWriter::complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled,
+                       optional_yield y)
+{
+  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+                           if_match, if_nomatch, user_data, zones_trace, canceled, y);
+}
+
+// Return an endpoint for the zonegroup: its own first endpoint when set,
+// otherwise the master zone's first endpoint, otherwise an empty string.
+const std::string& RadosZoneGroup::get_endpoint() const
+{
+  if (!group.endpoints.empty()) {
+    return group.endpoints.front();
+  }
+
+  // fall back to the master zone's endpoint list
+  auto z = group.zones.find(group.master_zone);
+  if (z != group.zones.end() && !z->second.endpoints.empty()) {
+    return z->second.endpoints.front();
+  }
+
+  return empty;
+}
+
+// Membership test: does the zonegroup define this placement target?
+bool RadosZoneGroup::placement_target_exists(std::string& target) const
+{
+  return group.placement_targets.count(target) != 0;
+}
+
+// Collect the name of every placement target defined on this zonegroup.
+int RadosZoneGroup::get_placement_target_names(std::set<std::string>& names) const
+{
+  for (const auto& [id, target] : group.placement_targets) {
+    names.emplace(target.name);
+  }
+  return 0;
+}
+
+int RadosZoneGroup::get_placement_tier(const rgw_placement_rule& rule,
+                                      std::unique_ptr<PlacementTier>* tier)
+{
+  std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
+  titer = group.placement_targets.find(rule.name);
+  if (titer == group.placement_targets.end()) {
+    return -ENOENT;
+  }
+
+  const auto& target_rule = titer->second;
+  std::map<std::string, RGWZoneGroupPlacementTier>::const_iterator ttier;
+  ttier = target_rule.tier_targets.find(rule.storage_class);
+  if (ttier == target_rule.tier_targets.end()) {
+    // not found
+    return -ENOENT;
+  }
+
+  PlacementTier* t;
+  t = new RadosPlacementTier(store, ttier->second);
+  if (!t)
+    return -ENOMEM;
+
+  tier->reset(t);
+  return 0;
+}
+
+// Look up a zone of this zonegroup by id; fills *zone on success.
+int RadosZoneGroup::get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone)
+{
+  RGWZone* rz = store->svc()->zone->find_zone(id);
+  if (!rz)
+    return -ENOENT;
+
+  // make_unique instead of a raw 'new' handed to reset()
+  *zone = std::make_unique<RadosZone>(store, clone(), *rz);
+  return 0;
+}
+
+// Look up a zone of this zonegroup by name (name -> id -> RGWZone);
+// fills *zone on success.
+int RadosZoneGroup::get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone)
+{
+  rgw_zone_id id;
+  int ret = store->svc()->zone->find_zone_id_by_name(name, &id);
+  if (ret < 0)
+    return ret;
+
+  RGWZone* rz = store->svc()->zone->find_zone(id.id);
+  if (!rz)
+    return -ENOENT;
+
+  // make_unique instead of a raw 'new' handed to reset()
+  *zone = std::make_unique<RadosZone>(store, clone(), *rz);
+  return 0;
+}
+
+// Append the id of every zone in this zonegroup to 'zone_ids'.
+int RadosZoneGroup::list_zones(std::list<std::string>& zone_ids)
+{
+  for (const auto& [name, zone] : group.zones) {
+    zone_ids.push_back(zone.id);
+  }
+  return 0;
+}
+
+// Copy this zone handle.  A local zone carries no cached RGWZone; a
+// remote zone copies its cached RGWZone along.
+std::unique_ptr<Zone> RadosZone::clone()
+{
+  return local_zone
+    ? std::make_unique<RadosZone>(store, group->clone())
+    : std::make_unique<RadosZone>(store, group->clone(), rgw_zone);
+}
+
+// Zone id: from the zone service for the local zone, otherwise from the
+// cached RGWZone.
+const std::string& RadosZone::get_id()
+{
+  return local_zone ? store->svc()->zone->zone_id().id : rgw_zone.id;
+}
+
+// Zone name: from the zone service for the local zone, otherwise from the
+// cached RGWZone.
+const std::string& RadosZone::get_name() const
+{
+  return local_zone ? store->svc()->zone->zone_name() : rgw_zone.name;
+}
+
+// A remote zone is writeable unless flagged read-only; the local zone
+// defers to the zone service.
+bool RadosZone::is_writeable()
+{
+  return local_zone ? store->svc()->zone->zone_is_writeable()
+                    : !rgw_zone.read_only;
+}
+
+// Fill *endpoint with the endpoint clients should be redirected to.  The
+// local zone delegates to the zone service; a remote zone copies its
+// redirect_zone field.  Returns true when an endpoint was provided.
+bool RadosZone::get_redirect_endpoint(std::string* endpoint)
+{
+  if (local_zone)
+    return store->svc()->zone->get_redirect_zone_endpoint(endpoint);
+
+  // Bug fix: the original assigned to the local pointer parameter
+  // ('endpoint = &rgw_zone.redirect_zone'), so the caller's string was
+  // never set.  Copy the value through the pointer instead.
+  *endpoint = rgw_zone.redirect_zone;
+  return true;
+}
+
+// Delegated to the zone service, which tracks all zonegroup APIs.
+bool RadosZone::has_zonegroup_api(const std::string& api) const
+{
+  const bool found = store->svc()->zone->has_zonegroup_api(api);
+  return found;
+}
+
+// Current period id, as tracked by the zone service.
+const std::string& RadosZone::get_current_period_id()
+{
+  const auto& period_id = store->svc()->zone->get_current_period_id();
+  return period_id;
+}
+
+// System access key from the local zone's parameters.
+const RGWAccessKey& RadosZone::get_system_key()
+{
+  const auto& zone_params = store->svc()->zone->get_zone_params();
+  return zone_params.system_key;
+}
+
+// Name of the realm this zone belongs to.
+const std::string& RadosZone::get_realm_name()
+{
+  const auto& realm = store->svc()->zone->get_realm();
+  return realm.get_name();
+}
+
+// Id of the realm this zone belongs to.
+const std::string& RadosZone::get_realm_id()
+{
+  const auto& realm = store->svc()->zone->get_realm();
+  return realm.get_id();
+}
+
+// Tier type of the zone (e.g. cloud tier), from the zone service for the
+// local zone or from the cached RGWZone for a remote one.
+const std::string_view RadosZone::get_tier_type()
+{
+  if (local_zone)
+    return store->svc()->zone->get_zone().tier_type;
+
+  // Bug fix: the original returned rgw_zone.id here — the zone's id, not
+  // its tier type.
+  return rgw_zone.tier_type;
+}
+
+// Sync policy handler for this zone, resolved by zone id.
+RGWBucketSyncPolicyHandlerRef RadosZone::get_sync_policy_handler()
+{
+  const auto& zone_id = get_id();
+  return store->svc()->zone->get_sync_policy_handler(zone_id);
+}
+
+// Cache the log pool used for Lua script storage.  The zone service may
+// not be available yet (early startup), in which case the pool is left
+// empty and the script operations below degrade to logged no-ops.
+RadosLuaManager::RadosLuaManager(RadosStore* _s) : 
+  store(_s),
+  pool((store->svc() && store->svc()->zone) ? store->svc()->zone->get_zone_params().log_pool : rgw_pool())
+{ }
+
+int RadosLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script)
+{
+  if (pool.empty()) {
+    ldpp_dout(dpp, 10) << "WARNING: missing pool when reading lua script " << dendl;
+    return 0;
+  }
+  bufferlist bl;
+
+  int r = rgw_get_system_obj(store->svc()->sysobj, pool, key, bl, nullptr, nullptr, y, dpp);
+  if (r < 0) {
+    return r;
+  }
+
+  auto iter = bl.cbegin();
+  try {
+    ceph::decode(script, iter);
+  } catch (buffer::error& err) {
+    return -EIO;
+  }
+
+  return 0;
+}
+
+int RadosLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script)
+{
+  if (pool.empty()) {
+    ldpp_dout(dpp, 10) << "WARNING: missing pool when writing lua script " << dendl;
+    return 0;
+  }
+  bufferlist bl;
+  ceph::encode(script, bl);
+
+  int r = rgw_put_system_obj(dpp, store->svc()->sysobj, pool, key, bl, false, nullptr, real_time(), y);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+// Delete the Lua script stored under 'key'.  A missing pool is a no-op,
+// and deleting a non-existent script (-ENOENT) is not an error.
+int RadosLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key)
+{
+  if (pool.empty()) {
+    ldpp_dout(dpp, 10) << "WARNING: missing pool when deleting lua script " << dendl;
+    return 0;
+  }
+
+  const int r = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, key, nullptr, y);
+  if (r < 0 && r != -ENOENT) {
+    return r;
+  }
+  return 0;
+}
+
+// Omap-backed object (stored in the lc pool, see add/remove/list below)
+// whose keys form the allow-list of installable Lua packages.
+const std::string PACKAGE_LIST_OBJECT_NAME = "lua_package_allowlist";
+
+// Add 'package_name' to the allow-list: stored as an omap key with an
+// empty value on the allow-list object.
+int RadosLuaManager::add_package(const DoutPrefixProvider *dpp, optional_yield y, const std::string& package_name)
+{
+  std::map<std::string, bufferlist> new_package;
+  new_package.emplace(package_name, bufferlist{});
+
+  librados::ObjectWriteOperation op;
+  op.omap_set(new_package);
+  const int ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
+      PACKAGE_LIST_OBJECT_NAME, &op, y);
+  return (ret < 0) ? ret : 0;
+}
+
+// Remove a package from the allow-list.  A name containing a space is an
+// exact "name version" key and is removed directly; a bare name removes
+// every stored version of that package.
+int RadosLuaManager::remove_package(const DoutPrefixProvider *dpp, optional_yield y, const std::string& package_name)
+{
+  librados::ObjectWriteOperation op;
+  size_t pos = package_name.find(" ");
+  if (pos != package_name.npos) {
+    // remove a specific version of the package
+    op.omap_rm_keys(std::set<std::string>({package_name}));
+    auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
+        PACKAGE_LIST_OBJECT_NAME, &op, y);
+    if (ret < 0) {
+        return ret;
+    }
+    return 0;
+  }
+  // otherwise, remove any existing versions of the package
+  rgw::lua::packages_t packages;
+  auto ret = list_packages(dpp, y, packages);
+  if (ret < 0 && ret != -ENOENT) {
+    return ret;
+  }
+  for(const auto& package : packages) {
+    const std::string package_no_version = package.substr(0, package.find(" "));
+    if (package_no_version.compare(package_name) == 0) {
+        // NOTE(review): the same ObjectWriteOperation is reused for every
+        // matching key -- confirm librados resets an op after operate().
+        op.omap_rm_keys(std::set<std::string>({package}));
+        ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
+            PACKAGE_LIST_OBJECT_NAME, &op, y);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+  }
+  return 0;
+}
+
+int RadosLuaManager::list_packages(const DoutPrefixProvider *dpp, optional_yield y, rgw::lua::packages_t& packages)
+{
+  constexpr auto max_chunk = 1024U;
+  std::string start_after;
+  bool more = true;
+  int rval;
+  while (more) {
+    librados::ObjectReadOperation op;
+    rgw::lua::packages_t packages_chunk;
+    op.omap_get_keys2(start_after, max_chunk, &packages_chunk, &more, &rval);
+    const auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
+      PACKAGE_LIST_OBJECT_NAME, &op, nullptr, y);
+
+    if (ret < 0) {
+      return ret;
+    }
+
+    packages.merge(packages_chunk);
+  }
+
+  return 0;
+}
+
+// Persist this provider in the oidc pool under a per-tenant, per-URL oid.
+int RadosOIDCProvider::store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y)
+{
+  const std::string oid = tenant + get_url_oid_prefix() + url;
+
+  bufferlist bl;
+  using ceph::encode;
+  encode(*this, bl);
+
+  auto sysobj = store->svc()->sysobj;
+  return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().oidc_pool, oid, bl, exclusive, nullptr, real_time(), y);
+}
+
+int RadosOIDCProvider::read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant)
+{
+  auto sysobj = store->svc()->sysobj;
+  auto& pool = store->svc()->zone->get_zone_params().oidc_pool;
+  std::string oid = tenant + get_url_oid_prefix() + url;
+  bufferlist bl;
+
+  int ret = rgw_get_system_obj(sysobj, pool, oid, bl, nullptr, nullptr, null_yield, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  try {
+    using ceph::decode;
+    auto iter = bl.cbegin();
+    decode(*this, iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode oidc provider info from pool: " << pool.name <<
+                  ": " << url << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Delete this provider: parse tenant and URL out of the stored ARN,
+// verify the ARN's tenant matches this provider's tenant, then remove
+// the per-tenant URL object from the oidc pool.
+int RadosOIDCProvider::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  auto& pool = store->svc()->zone->get_zone_params().oidc_pool;
+
+  std::string url, tenant;
+  auto ret = get_tenant_url_from_arn(tenant, url);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to parse arn" << dendl;
+    return -EINVAL;
+  }
+
+  // refuse to delete when the ARN names a different tenant
+  if (this->tenant != tenant) {
+    ldpp_dout(dpp, 0) << "ERROR: tenant in arn doesn't match that of user " << this->tenant << ", "
+                  << tenant << ": " << dendl;
+    return -EINVAL;
+  }
+
+  // Delete url
+  std::string oid = tenant + get_url_oid_prefix() + url;
+  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+  if (ret < 0) {
+    // error is logged and handed back to the caller unchanged
+    ldpp_dout(dpp, 0) << "ERROR: deleting oidc url from pool: " << pool.name << ": "
+                  << provider_url << ": " << cpp_strerror(-ret) << dendl;
+  }
+
+  return ret;
+}
+
+int RadosRole::store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+  using ceph::encode;
+  std::string oid;
+
+  oid = info.id;
+
+  bufferlist bl;
+  encode(this->info, bl);
+
+  if (!this->info.tags.empty()) {
+    bufferlist bl_tags;
+    encode(this->info.tags, bl_tags);
+    map<string, bufferlist> attrs;
+    attrs.emplace("tagging", bl_tags);
+
+    RGWSI_MBSObj_PutParams params(bl, &attrs, info.mtime, exclusive);
+    std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
+    ctx->init(store->svc()->role->get_be_handler());
+    return store->svc()->role->svc.meta_be->put(ctx.get(), oid, params, &info.objv_tracker, y, dpp);
+  } else {
+    RGWSI_MBSObj_PutParams params(bl, nullptr, info.mtime, exclusive);
+    std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
+    ctx->init(store->svc()->role->get_be_handler());
+    return store->svc()->role->svc.meta_be->put(ctx.get(), oid, params, &info.objv_tracker, y, dpp);
+  }
+}
+
+// Persist the name -> id mapping object for this role in the roles pool.
+int RadosRole::store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+  RGWNameToId nameToId;
+  nameToId.obj_id = info.id;
+
+  bufferlist bl;
+  using ceph::encode;
+  encode(nameToId, bl);
+
+  const std::string oid = info.tenant + get_names_oid_prefix() + info.name;
+  auto sysobj = store->svc()->sysobj;
+  return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, exclusive, &info.objv_tracker, real_time(), y);
+}
+
+// Write the path index marker for this role.  The object name encodes
+// tenant/path/id and the payload is intentionally empty -- existence of
+// the object is the index entry.
+int RadosRole::store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+  const std::string oid = info.tenant + get_path_oid_prefix() + info.path
+      + get_info_oid_prefix() + info.id;
+
+  bufferlist empty_bl;
+  return rgw_put_system_obj(dpp, store->svc()->sysobj,
+                            store->svc()->zone->get_zone_params().roles_pool,
+                            oid, empty_bl, exclusive, &info.objv_tracker,
+                            real_time(), y);
+}
+
+int RadosRole::read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y)
+{
+  auto sysobj = store->svc()->sysobj;
+  std::string oid = info.tenant + get_names_oid_prefix() + role_name;
+  bufferlist bl;
+
+  int ret = rgw_get_system_obj(sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, nullptr, nullptr, null_yield, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  RGWNameToId nameToId;
+  try {
+    auto iter = bl.cbegin();
+    using ceph::decode;
+    decode(nameToId, iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode role from Role pool: " << role_name << dendl;
+    return -EIO;
+  }
+  role_id = nameToId.obj_id;
+  return 0;
+}
+
+int RadosRole::read_name(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  auto sysobj = store->svc()->sysobj;
+  std::string oid = info.tenant + get_names_oid_prefix() + info.name;
+  bufferlist bl;
+
+  int ret = rgw_get_system_obj(sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, nullptr, nullptr, null_yield, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed reading role name from Role pool: " << info.name <<
+      ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  RGWNameToId nameToId;
+  try {
+    using ceph::decode;
+    auto iter = bl.cbegin();
+    decode(nameToId, iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode role name from Role pool: " << info.name << dendl;
+    return -EIO;
+  }
+  info.id = nameToId.obj_id;
+  return 0;
+}
+
+// Load the full RGWRoleInfo (and its attrs) for info.id from the role
+// metadata backend, then decode the optional "tagging" attr into
+// info.tags.  Returns 0 on success, negative errno otherwise.
+int RadosRole::read_info(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  std::string oid;
+
+  // The meta backend is keyed by the bare role id.
+  oid = info.id;
+  ldpp_dout(dpp, 20) << "INFO: oid in read_info is: " << oid << dendl;
+
+  bufferlist bl;
+
+  // get() also populates info.attrs and info.mtime via the params struct.
+  RGWSI_MBSObj_GetParams params(&bl, &info.attrs, &info.mtime);
+  std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
+  ctx->init(store->svc()->role->get_be_handler());
+  int ret = store->svc()->role->svc.meta_be->get(ctx.get(), oid, params, &info.objv_tracker, y, dpp, true);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed reading role info from Role pool: " << info.id << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  try {
+    using ceph::decode;
+    auto iter = bl.cbegin();
+    decode(this->info, iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode role info from Role pool: " << info.id << dendl;
+    return -EIO;
+  }
+
+  // Tags are stored out-of-band in the "tagging" attr (see store_info());
+  // absence of the attr simply means the role has no tags.
+  auto it = info.attrs.find("tagging");
+  if (it != info.attrs.end()) {
+    bufferlist bl_tags = it->second;
+    try {
+      using ceph::decode;
+      auto iter = bl_tags.cbegin();
+      decode(info.tags, iter);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to decode attrs" << info.id << dendl;
+      return -EIO;
+    }
+  }
+
+  return 0;
+}
+
+int RadosRole::create(const DoutPrefixProvider *dpp, bool exclusive, const std::string& role_id, optional_yield y)
+{
+  int ret;
+
+  if (! validate_input(dpp)) {
+    return -EINVAL;
+  }
+
+  if (!role_id.empty()) {
+    info.id = role_id;
+  }
+
+  /* check to see the name is not used */
+  ret = read_id(dpp, info.name, info.tenant, info.id, y);
+  if (exclusive && ret == 0) {
+    ldpp_dout(dpp, 0) << "ERROR: name " << info.name << " already in use for role id "
+                    << info.id << dendl;
+    return -EEXIST;
+  } else if ( ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 0) << "failed reading role id  " << info.id << ": "
+                  << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  if (info.id.empty()) {
+    /* create unique id */
+    uuid_d new_uuid;
+    char uuid_str[37];
+    new_uuid.generate_random();
+    new_uuid.print(uuid_str);
+    info.id = uuid_str;
+  }
+
+  //arn
+  info.arn = role_arn_prefix + info.tenant + ":role" + info.path + info.name;
+
+  // Creation time
+  real_clock::time_point t = real_clock::now();
+
+  struct timeval tv;
+  real_clock::to_timeval(t, tv);
+
+  char buf[30];
+  struct tm result;
+  gmtime_r(&tv.tv_sec, &result);
+  strftime(buf,30,"%Y-%m-%dT%H:%M:%S", &result);
+  sprintf(buf + strlen(buf),".%dZ",(int)tv.tv_usec/1000);
+  info.creation_date.assign(buf, strlen(buf));
+
+  auto& pool = store->svc()->zone->get_zone_params().roles_pool;
+  ret = store_info(dpp, exclusive, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR:  storing role info in Role pool: "
+                  << info.id << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  ret = store_name(dpp, exclusive, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: storing role name in Role pool: "
+                  << info.name << ": " << cpp_strerror(-ret) << dendl;
+
+    //Delete the role info that was stored in the previous call
+    std::string oid = get_info_oid_prefix() + info.id;
+    int info_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+    if (info_ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: cleanup of role id from Role pool: "
+                  << info.id << ": " << cpp_strerror(-info_ret) << dendl;
+    }
+    return ret;
+  }
+
+  ret = store_path(dpp, exclusive, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: storing role path in Role pool: "
+                  << info.path << ": " << cpp_strerror(-ret) << dendl;
+    //Delete the role info that was stored in the previous call
+    std::string oid = get_info_oid_prefix() + info.id;
+    int info_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+    if (info_ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: cleanup of role id from Role pool: "
+                  << info.id << ": " << cpp_strerror(-info_ret) << dendl;
+    }
+    //Delete role name that was stored in previous call
+    oid = info.tenant + get_names_oid_prefix() + info.name;
+    int name_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+    if (name_ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: cleanup of role name from Role pool: "
+                  << info.name << ": " << cpp_strerror(-name_ret) << dendl;
+    }
+    return ret;
+  }
+  return 0;
+}
+
+// Delete a role: refuse while permission policies are still attached,
+// then remove the info, name-index and path-index objects.  The deletes
+// are deliberately best-effort (each failure is logged but does not stop
+// the chain); the value returned is that of the LAST delete attempted.
+int RadosRole::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  auto& pool = store->svc()->zone->get_zone_params().roles_pool;
+
+  // Resolve info.id from the name index, then load the full info so
+  // perm_policy_map / path are populated for the checks and oids below.
+  int ret = read_name(dpp, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = read_info(dpp, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // A role with attached permission policies cannot be deleted.
+  if (! info.perm_policy_map.empty()) {
+    return -ERR_DELETE_CONFLICT;
+  }
+
+  // Delete id
+  std::string oid = get_info_oid_prefix() + info.id;
+  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: deleting role id from Role pool: "
+                  << info.id << ": " << cpp_strerror(-ret) << dendl;
+  }
+
+  // Delete name
+  oid = info.tenant + get_names_oid_prefix() + info.name;
+  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: deleting role name from Role pool: "
+                  << info.name << ": " << cpp_strerror(-ret) << dendl;
+  }
+
+  // Delete path
+  oid = info.tenant + get_path_oid_prefix() + info.path + get_info_oid_prefix() + info.id;
+  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: deleting role path from Role pool: "
+                  << info.path << ": " << cpp_strerror(-ret) << dendl;
+  }
+  return ret;
+}
+
+} // namespace rgw::sal
+
+extern "C" {
+
+// Factory entry point used by the SAL driver loader.
+//
+// Plain `new` throws std::bad_alloc on failure rather than returning
+// nullptr, so the former null checks were dead code.  The store is held
+// in a unique_ptr so it is not leaked if the RGWRados allocation throws.
+void* newRadosStore(void)
+{
+  std::unique_ptr<rgw::sal::RadosStore> store(new rgw::sal::RadosStore());
+  RGWRados* rados = new RGWRados();
+
+  // Wire up the bidirectional store <-> rados association before
+  // handing ownership to the caller.
+  store->setRados(rados);
+  rados->set_store(store.get());
+
+  return store.release();
+}
+
+}
diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h
new file mode 100644 (file)
index 0000000..499e099
--- /dev/null
@@ -0,0 +1,959 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal_store.h"
+#include "rgw_rados.h"
+#include "rgw_notify.h"
+#include "rgw_oidc_provider.h"
+#include "rgw_role.h"
+#include "rgw_multi.h"
+#include "rgw_putobj_processor.h"
+#include "services/svc_tier_rados.h"
+#include "cls/lock/cls_lock_client.h"
+
+namespace rgw { namespace sal {
+
+class RadosMultipartUpload;
+
+/* Container for outstanding librados AIO completion handles; drain()
+ * (defined in the .cc) waits out every queued completion. */
+class RadosCompletions : public Completions {
+  public:
+    std::list<librados::AioCompletion*> handles;
+    RadosCompletions() {}
+    ~RadosCompletions() = default;
+    virtual int drain() override;
+};
+
+/* Thin wrapper exposing a zonegroup placement-tier record through the
+ * SAL PlacementTier interface; holds its own copy of the tier entry. */
+class RadosPlacementTier: public StorePlacementTier {
+  RadosStore* store;
+  RGWZoneGroupPlacementTier tier;
+public:
+  RadosPlacementTier(RadosStore* _store, const RGWZoneGroupPlacementTier& _tier) : store(_store), tier(_tier) {}
+  virtual ~RadosPlacementTier() = default;
+
+  // NOTE(review): these three accessors are presumably overriding
+  // StorePlacementTier virtuals but lack 'override' -- confirm against
+  // rgw_sal_store.h.
+  virtual const std::string& get_tier_type() { return tier.tier_type; }
+  virtual const std::string& get_storage_class() { return tier.storage_class; }
+  virtual bool retain_head_object() { return tier.retain_head_object; }
+  // Direct access to the wrapped tier record (RadosStore-internal use).
+  RGWZoneGroupPlacementTier& get_rt() { return tier; }
+};
+
+/* SAL ZoneGroup implementation backed by an immutable copy of an
+ * RGWZoneGroup record. */
+class RadosZoneGroup : public StoreZoneGroup {
+  RadosStore* store;
+  const RGWZoneGroup group;
+  // Kept empty; presumably returned by reference when a field is unset
+  // (see get_endpoint() in the .cc) -- TODO confirm.
+  std::string empty;
+public:
+  RadosZoneGroup(RadosStore* _store, const RGWZoneGroup& _group) : store(_store), group(_group) {}
+  virtual ~RadosZoneGroup() = default;
+
+  virtual const std::string& get_id() const override { return group.get_id(); };
+  virtual const std::string& get_name() const override { return group.get_name(); };
+  virtual int equals(const std::string& other_zonegroup) const override {
+    return group.equals(other_zonegroup);
+  };
+  /** Get the endpoint from zonegroup, or from master zone if not set */
+  virtual const std::string& get_endpoint() const override;
+  virtual bool placement_target_exists(std::string& target) const override;
+  virtual bool is_master_zonegroup() const override {
+    return group.is_master_zonegroup();
+  };
+  virtual const std::string& get_api_name() const override { return group.api_name; };
+  virtual int get_placement_target_names(std::set<std::string>& names) const override;
+  virtual const std::string& get_default_placement_name() const override {
+    return group.default_placement.name; };
+  virtual int get_hostnames(std::list<std::string>& names) const override {
+    names = group.hostnames;
+    return 0;
+  };
+  virtual int get_s3website_hostnames(std::list<std::string>& names) const override {
+    names = group.hostnames_s3website;
+    return 0;
+  };
+  virtual int get_zone_count() const override {
+    return group.zones.size();
+  }
+  // NOTE(review): unlike its siblings this is not marked 'override' --
+  // confirm it matches the base-class signature.
+  virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr<PlacementTier>* tier);
+  virtual int get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone) override;
+  virtual int get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone) override;
+  virtual int list_zones(std::list<std::string>& zone_ids) override;
+  // Deep copy (the wrapped RGWZoneGroup is copied along with it).
+  virtual std::unique_ptr<ZoneGroup> clone() override {
+    return std::make_unique<RadosZoneGroup>(store, group);
+  }
+  const RGWZoneGroup& get_group() const { return group; }
+};
+
+/* SAL Zone implementation.  Two construction modes: without an explicit
+ * RGWZone the object represents the store's own (local) zone and sets
+ * local_zone = true; with one, it wraps that specific zone record. */
+class RadosZone : public StoreZone {
+  protected:
+    RadosStore* store;
+    std::unique_ptr<ZoneGroup> group;
+    RGWZone rgw_zone;
+    // True only for the local-zone constructor below.
+    bool local_zone{false};
+  public:
+    RadosZone(RadosStore* _store, std::unique_ptr<ZoneGroup> _zg) : store(_store), group(std::move(_zg)), local_zone(true) {}
+    RadosZone(RadosStore* _store, std::unique_ptr<ZoneGroup> _zg, RGWZone& z) : store(_store), group(std::move(_zg)), rgw_zone(z) {}
+    ~RadosZone() = default;
+
+    virtual std::unique_ptr<Zone> clone() override;
+    virtual ZoneGroup& get_zonegroup() override { return *(group.get()); }
+    virtual const std::string& get_id() override;
+    virtual const std::string& get_name() const override;
+    virtual bool is_writeable() override;
+    virtual bool get_redirect_endpoint(std::string* endpoint) override;
+    virtual bool has_zonegroup_api(const std::string& api) const override;
+    virtual const std::string& get_current_period_id() override;
+    virtual const RGWAccessKey& get_system_key() override;
+    virtual const std::string& get_realm_name() override;
+    virtual const std::string& get_realm_id() override;
+    virtual const std::string_view get_tier_type() override;
+    virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() override;
+};
+
+class RadosStore : public StoreDriver {
+  private:
+    RGWRados* rados;
+    RGWUserCtl* user_ctl;
+    std::string luarocks_path;
+    std::unique_ptr<RadosZone> zone;
+
+  public:
+    RadosStore()
+      : rados(nullptr) {
+      }
+    ~RadosStore() {
+      delete rados;
+    }
+
+    virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) override;
+    virtual const std::string get_name() const override {
+      return "rados";
+    }
+    virtual std::string get_cluster_id(const DoutPrefixProvider* dpp,  optional_yield y) override;
+    virtual std::unique_ptr<User> get_user(const rgw_user& u) override;
+    virtual int get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user) override;
+    virtual int get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user) override;
+    virtual int get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user) override;
+    virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+    virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+    virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket) override;
+    virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string&name, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+    virtual bool is_meta_master() override;
+    virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
+                                         bufferlist& in_data, JSONParser* jp, req_info& info,
+                                         optional_yield y) override;
+    virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
+                                            bufferlist& in_data,
+                                            RGWXMLDecoder::XMLParser* parser, req_info& info,
+                                            optional_yield y) override;
+    virtual Zone* get_zone() { return zone.get(); }
+    virtual std::string zone_unique_id(uint64_t unique_num) override;
+    virtual std::string zone_unique_trans_id(const uint64_t unique_num) override;
+    virtual int get_zonegroup(const std::string& id, std::unique_ptr<ZoneGroup>* zonegroup) override;
+    virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids) override;
+    virtual int cluster_stat(RGWClusterStat& stats) override;
+    virtual std::unique_ptr<Lifecycle> get_lifecycle(void) override;
+    virtual std::unique_ptr<Completions> get_completions(void) override;
+    virtual std::unique_ptr<Notification> get_notification(rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, rgw::notify::EventType event_type, const std::string* object_name=nullptr) override;
+    virtual std::unique_ptr<Notification> get_notification(
+    const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj, 
+    rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant,
+    std::string& _req_id, optional_yield y) override;
+    virtual RGWLC* get_rgwlc(void) override { return rados->get_lc(); }
+    virtual RGWCoroutinesManagerRegistry* get_cr_registry() override { return rados->get_cr_registry(); }
+
+    virtual int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info) override;
+    virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) override;
+    virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
+                               const std::map<std::string, std::string>& meta) override;
+    virtual void get_quota(RGWQuota& quota) override;
+    virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) override;
+    virtual int set_buckets_enabled(const DoutPrefixProvider* dpp, std::vector<rgw_bucket>& buckets, bool enabled) override;
+    virtual int get_sync_policy_handler(const DoutPrefixProvider* dpp,
+                                       std::optional<rgw_zone_id> zone,
+                                       std::optional<rgw_bucket> bucket,
+                                       RGWBucketSyncPolicyHandlerRef* phandler,
+                                       optional_yield y) override;
+    virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override;
+    virtual void wakeup_meta_sync_shards(std::set<int>& shard_ids) override { rados->wakeup_meta_sync_shards(shard_ids); }
+    virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, boost::container::flat_map<int, boost::container::flat_set<rgw_data_notify_entry>>& shard_ids) override { rados->wakeup_data_sync_shards(dpp, source_zone, shard_ids); }
+    virtual int clear_usage(const DoutPrefixProvider *dpp) override { return rados->clear_usage(dpp); }
+    virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+                              uint32_t max_entries, bool* is_truncated,
+                              RGWUsageIter& usage_iter,
+                              std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+    virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+    virtual int get_config_key_val(std::string name, bufferlist* bl) override;
+    virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) override;
+    virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list<std::string>& keys, bool* truncated) override;
+    virtual void meta_list_keys_complete(void* handle) override;
+    virtual std::string meta_get_marker(void* handle) override;
+    virtual int meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y) override;
+    virtual const RGWSyncModuleInstanceRef& get_sync_module() { return rados->get_sync_module(); }
+    virtual std::string get_host_id() { return rados->host_id; }
+    virtual std::unique_ptr<LuaManager> get_lua_manager() override;
+    virtual std::unique_ptr<RGWRole> get_role(std::string name,
+                                             std::string tenant,
+                                             std::string path="",
+                                             std::string trust_policy="",
+                                             std::string max_session_duration_str="",
+                std::multimap<std::string,std::string> tags={}) override;
+    virtual std::unique_ptr<RGWRole> get_role(std::string id) override;
+    virtual std::unique_ptr<RGWRole> get_role(const RGWRoleInfo& info) override;
+    virtual int get_roles(const DoutPrefixProvider *dpp,
+                         optional_yield y,
+                         const std::string& path_prefix,
+                         const std::string& tenant,
+                         std::vector<std::unique_ptr<RGWRole>>& roles) override;
+    virtual std::unique_ptr<RGWOIDCProvider> get_oidc_provider() override;
+    virtual int get_oidc_providers(const DoutPrefixProvider *dpp,
+                                  const std::string& tenant,
+                                  std::vector<std::unique_ptr<RGWOIDCProvider>>& providers) override;
+    virtual std::unique_ptr<Writer> get_append_writer(const DoutPrefixProvider *dpp,
+                                 optional_yield y,
+                                 std::unique_ptr<rgw::sal::Object> _head_obj,
+                                 const rgw_user& owner,
+                                 const rgw_placement_rule *ptail_placement_rule,
+                                 const std::string& unique_tag,
+                                 uint64_t position,
+                                 uint64_t *cur_accounted_size) override;
+    virtual std::unique_ptr<Writer> get_atomic_writer(const DoutPrefixProvider *dpp,
+                                 optional_yield y,
+                                 std::unique_ptr<rgw::sal::Object> _head_obj,
+                                 const rgw_user& owner,
+                                 const rgw_placement_rule *ptail_placement_rule,
+                                 uint64_t olh_epoch,
+                                 const std::string& unique_tag) override;
+    virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override;
+    virtual bool valid_placement(const rgw_placement_rule& rule) override;
+
+    virtual void finalize(void) override;
+
+    virtual CephContext* ctx(void) override { return rados->ctx(); }
+
+    virtual const std::string& get_luarocks_path() const override {
+      return luarocks_path;
+    }
+
+    virtual void set_luarocks_path(const std::string& path) override {
+      luarocks_path = path;
+    }
+    virtual void register_admin_apis(RGWRESTMgr* mgr) override;
+
+    /* Unique to RadosStore */
+    int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                          librados::IoCtx* ioctx);
+    int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj);
+    int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, Completions* aio);
+    void get_raw_obj(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj* raw_obj);
+    int get_raw_chunk_size(const DoutPrefixProvider* dpp, const rgw_raw_obj& obj, uint64_t* chunk_size);
+
+    void setRados(RGWRados * st) { rados = st; }
+    RGWRados* getRados(void) { return rados; }
+
+    RGWServices* svc() { return &rados->svc; }
+    const RGWServices* svc() const { return &rados->svc; }
+    RGWCtl* ctl() { return &rados->ctl; }
+    const RGWCtl* ctl() const { return &rados->ctl; }
+
+    void setUserCtl(RGWUserCtl *_ctl) { user_ctl = _ctl; }
+};
+
+/* SAL User implementation backed by the RADOS store's user metadata
+ * and stats machinery. */
+class RadosUser : public StoreUser {
+  private:
+    RadosStore* store;
+
+  public:
+    RadosUser(RadosStore *_st, const rgw_user& _u) : StoreUser(_u), store(_st) { }
+    RadosUser(RadosStore *_st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) { }
+    RadosUser(RadosStore *_st) : store(_st) { }
+    // Copy ctor takes a non-const ref (used by clone() below).
+    RadosUser(RadosUser& _o) = default;
+
+    virtual std::unique_ptr<User> clone() override {
+      return std::unique_ptr<User>(new RadosUser(*this));
+    }
+    int list_buckets(const DoutPrefixProvider* dpp, const std::string& marker, const std::string& end_marker,
+                    uint64_t max, bool need_stats, BucketList& buckets,
+                    optional_yield y) override;
+    virtual int create_bucket(const DoutPrefixProvider* dpp,
+                            const rgw_bucket& b,
+                            const std::string& zonegroup_id,
+                            rgw_placement_rule& placement_rule,
+                            std::string& swift_ver_location,
+                            const RGWQuotaInfo * pquota_info,
+                            const RGWAccessControlPolicy& policy,
+                           Attrs& attrs,
+                            RGWBucketInfo& info,
+                            obj_version& ep_objv,
+                           bool exclusive,
+                           bool obj_lock_enabled,
+                           bool* existed,
+                           req_info& req_info,
+                           std::unique_ptr<Bucket>* bucket,
+                           optional_yield y) override;
+    virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override;
+    virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) override;
+    virtual int read_stats(const DoutPrefixProvider *dpp,
+                           optional_yield y, RGWStorageStats* stats,
+                          ceph::real_time* last_stats_sync = nullptr,
+                          ceph::real_time* last_stats_update = nullptr) override;
+    virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) override;
+    virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
+    virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+                          bool* is_truncated, RGWUsageIter& usage_iter,
+                          std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+    virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+
+    virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override;
+    virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) override;
+    virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override;
+    virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) override;
+
+    // RadosBucket reaches into store/user internals directly.
+    friend class RadosBucket;
+};
+
+class RadosObject : public StoreObject {
+  private:
+    RadosStore* store;
+    RGWAccessControlPolicy acls;
+    RGWObjManifest *manifest{nullptr};
+    RGWObjectCtx* rados_ctx;
+    bool rados_ctx_owned;
+
+  public:
+
+    struct RadosReadOp : public ReadOp {
+    private:
+      RadosObject* source;
+      RGWObjectCtx* rctx;
+      RGWRados::Object op_target;
+      RGWRados::Object::Read parent_op;
+
+    public:
+      RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx);
+
+      virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override;
+
+      /*
+       * Both `read` and `iterate` read up through index `end`
+       * *inclusive*. The number of bytes that could be returned is
+       * `end - ofs + 1`.
+       */
+      virtual int read(int64_t ofs, int64_t end,
+                      bufferlist& bl, optional_yield y,
+                      const DoutPrefixProvider* dpp) override;
+      virtual int iterate(const DoutPrefixProvider* dpp,
+                         int64_t ofs, int64_t end,
+                         RGWGetDataCB* cb, optional_yield y) override;
+
+        virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) override;
+    };
+
+    struct RadosDeleteOp : public DeleteOp {
+    private:
+      RadosObject* source;
+      RGWRados::Object op_target;
+      RGWRados::Object::Delete parent_op;
+
+    public:
+      RadosDeleteOp(RadosObject* _source);
+
+      virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override;
+    };
+
+    RadosObject(RadosStore *_st, const rgw_obj_key& _k)
+      : StoreObject(_k),
+       store(_st),
+        acls(),
+       rados_ctx(new RGWObjectCtx(dynamic_cast<Driver*>(store))),
+       rados_ctx_owned(true) {
+    }
+    RadosObject(RadosStore *_st, const rgw_obj_key& _k, Bucket* _b)
+      : StoreObject(_k, _b),
+       store(_st),
+        acls(),
+       rados_ctx(new RGWObjectCtx(dynamic_cast<Driver*>(store))) ,
+       rados_ctx_owned(true) {
+    }
+    RadosObject(RadosObject& _o) : StoreObject(_o) {
+      store = _o.store;
+      acls = _o.acls;
+      manifest = _o.manifest;
+      rados_ctx = _o.rados_ctx;
+      rados_ctx_owned = false;
+    }
+
+    virtual ~RadosObject();
+
+    virtual void invalidate() override {
+      StoreObject::invalidate();
+      rados_ctx->invalidate(get_obj());
+    }
+    virtual int delete_object(const DoutPrefixProvider* dpp,
+                             optional_yield y, bool prevent_versioning) override;
+    virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* aio,
+                              bool keep_index_consistent, optional_yield y) override;
+    virtual int copy_object(User* user,
+               req_info* info, const rgw_zone_id& source_zone,
+               rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket,
+               rgw::sal::Bucket* src_bucket,
+               const rgw_placement_rule& dest_placement,
+               ceph::real_time* src_mtime, ceph::real_time* mtime,
+               const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr,
+               bool high_precision_time,
+               const char* if_match, const char* if_nomatch,
+               AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs,
+               RGWObjCategory category, uint64_t olh_epoch,
+              boost::optional<ceph::real_time> delete_at,
+               std::string* version_id, std::string* tag, std::string* etag,
+               void (*progress_cb)(off_t, void *), void* progress_data,
+               const DoutPrefixProvider* dpp, optional_yield y) override;
+    // ACL handling: the policy is cached in the `acls` member; set_acl only
+    // updates the cache and always reports success (0).
+    virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+    virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; }
+    // Flag setters: each mirrors the flag into the per-request RADOS object
+    // context first, then records it in the generic StoreObject state.
+    virtual void set_atomic() override {
+      rados_ctx->set_atomic(state.obj);
+      StoreObject::set_atomic();
+    }
+    virtual void set_prefetch_data() override {
+      rados_ctx->set_prefetch_data(state.obj);
+      StoreObject::set_prefetch_data();
+    }
+    virtual void set_compressed() override {
+      rados_ctx->set_compressed(state.obj);
+      StoreObject::set_compressed();
+    }
+
+    // Object state / attribute access (implementations live in the .cc file).
+    virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) override;
+    virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) override;
+    virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override;
+    virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override;
+    virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) override;
+    virtual bool is_expired() override;
+    virtual void gen_rand_obj_instance_name() override;
+    // Translate this object into its raw RADOS object coordinates.
+    void get_raw_obj(rgw_raw_obj* raw_obj);
+    // Deep copy via the copy constructor; returned as the abstract Object type.
+    virtual std::unique_ptr<Object> clone() override {
+      return std::unique_ptr<Object>(new RadosObject(*this));
+    }
+    virtual std::unique_ptr<MPSerializer> get_serializer(const DoutPrefixProvider *dpp,
+                                                        const std::string& lock_name) override;
+    // Lifecycle transition between placement tiers.
+    virtual int transition(Bucket* bucket,
+                          const rgw_placement_rule& placement_rule,
+                          const real_time& mtime,
+                          uint64_t olh_epoch,
+                          const DoutPrefixProvider* dpp,
+                          optional_yield y) override;
+    virtual int transition_to_cloud(Bucket* bucket,
+                          rgw::sal::PlacementTier* tier,
+                          rgw_bucket_dir_entry& o,
+                          std::set<std::string>& cloud_targets,
+                          CephContext* cct,
+                          bool update_object,
+                          const DoutPrefixProvider* dpp,
+                          optional_yield y) override;
+    virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override;
+    virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override;
+
+    /* Swift versioning */
+    virtual int swift_versioning_restore(bool& restored,
+                                        const DoutPrefixProvider* dpp) override;
+    virtual int swift_versioning_copy(const DoutPrefixProvider* dpp,
+                                     optional_yield y) override;
+
+    /* OPs */
+    virtual std::unique_ptr<ReadOp> get_read_op() override;
+    virtual std::unique_ptr<DeleteOp> get_delete_op() override;
+
+    /* OMAP */
+    virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+                             std::map<std::string, bufferlist> *m,
+                             bool* pmore, optional_yield y) override;
+    virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
+                            optional_yield y) override;
+    virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+                             const std::set<std::string>& keys,
+                             Attrs* vals) override;
+    virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
+                                   bool must_exist, optional_yield y) override;
+
+    /* Internal to RadosStore */
+    int get_max_chunk_size(const DoutPrefixProvider* dpp,
+                          rgw_placement_rule placement_rule,
+                          uint64_t* max_chunk_size,
+                          uint64_t* alignment = nullptr);
+    void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t* max_size);
+    void raw_obj_to_obj(const rgw_raw_obj& raw_obj);
+    int write_cloud_tier(const DoutPrefixProvider* dpp,
+                          optional_yield y,
+                          uint64_t olh_epoch,
+                          rgw::sal::PlacementTier* tier,
+                          bool is_multipart_upload,
+                          rgw_placement_rule& target_placement,
+                          Object* head_obj);
+    // Non-owning accessors for RadosStore-internal state.
+    RGWObjManifest* get_manifest() { return manifest; }
+    RGWObjectCtx& get_ctx() { return *rados_ctx; }
+
+  private:
+    // Shared attribute-read helper used by the attr methods above.
+    int read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Read &read_op, optional_yield y, rgw_obj* target_obj = nullptr);
+};
+
+// RADOS-backed implementation of the SAL Bucket interface. Holds a non-owning
+// pointer to the owning RadosStore and caches the bucket's ACL policy.
+class RadosBucket : public StoreBucket {
+  private:
+    RadosStore* store;
+    RGWAccessControlPolicy acls;
+
+  public:
+    // Constructor family: each variant forwards one combination of identifying
+    // state (user, rgw_bucket, RGWBucketEnt, RGWBucketInfo) to StoreBucket and
+    // records the owning store; the ACL cache always starts empty.
+    RadosBucket(RadosStore *_st)
+      : store(_st),
+        acls() {
+    }
+
+    RadosBucket(RadosStore *_st, User* _u)
+      : StoreBucket(_u),
+       store(_st),
+        acls() {
+    }
+
+    RadosBucket(RadosStore *_st, const rgw_bucket& _b)
+      : StoreBucket(_b),
+       store(_st),
+        acls() {
+    }
+
+    RadosBucket(RadosStore *_st, const RGWBucketEnt& _e)
+      : StoreBucket(_e),
+       store(_st),
+        acls() {
+    }
+
+    RadosBucket(RadosStore *_st, const RGWBucketInfo& _i)
+      : StoreBucket(_i),
+       store(_st),
+        acls() {
+    }
+
+    RadosBucket(RadosStore *_st, const rgw_bucket& _b, User* _u)
+      : StoreBucket(_b, _u),
+       store(_st),
+        acls() {
+    }
+
+    RadosBucket(RadosStore *_st, const RGWBucketEnt& _e, User* _u)
+      : StoreBucket(_e, _u),
+       store(_st),
+        acls() {
+    }
+
+    RadosBucket(RadosStore *_st, const RGWBucketInfo& _i, User* _u)
+      : StoreBucket(_i, _u),
+       store(_st),
+        acls() {
+    }
+
+    virtual ~RadosBucket();
+    virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+    virtual int list(const DoutPrefixProvider* dpp, ListParams&, int, ListResults&, optional_yield y) override;
+    virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) override;
+    virtual int remove_bucket_bypass_gc(int concurrent_max, bool
+                                       keep_index_consistent,
+                                       optional_yield y, const
+                                       DoutPrefixProvider *dpp) override;
+    // get_acl serves from the cached `acls`; set_acl (in the .cc) persists.
+    virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+    virtual int set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy& acl, optional_yield y) override;
+    virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats = false) override;
+    virtual int read_stats(const DoutPrefixProvider *dpp,
+                           const bucket_index_layout_generation& idx_layout,
+                           int shard_id, std::string* bucket_ver, std::string* master_ver,
+                           std::map<RGWObjCategory, RGWStorageStats>& stats,
+                           std::string* max_marker = nullptr,
+                           bool* syncstopped = nullptr) override;
+    virtual int read_stats_async(const DoutPrefixProvider *dpp,
+                                 const bucket_index_layout_generation& idx_layout,
+                                 int shard_id, RGWGetBucketStats_CB* ctx) override;
+    virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
+    virtual int update_container_stats(const DoutPrefixProvider* dpp) override;
+    virtual int check_bucket_shards(const DoutPrefixProvider* dpp) override;
+    virtual int chown(const DoutPrefixProvider* dpp, User* new_user, User* old_user, optional_yield y, const std::string* marker = nullptr) override;
+    virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time mtime) override;
+    virtual bool is_owner(User* user) override;
+    virtual int check_empty(const DoutPrefixProvider* dpp, optional_yield y) override;
+    virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) override;
+    virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& attrs, optional_yield y) override;
+    virtual int try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime) override;
+    virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+                          bool* is_truncated, RGWUsageIter& usage_iter,
+                          std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+    virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+    virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink) override;
+    virtual int check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats) override;
+    virtual int rebuild_index(const DoutPrefixProvider *dpp) override;
+    virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override;
+    virtual int purge_instance(const DoutPrefixProvider* dpp) override;
+    // Deep copy via the copy constructor; returned as the abstract Bucket type.
+    virtual std::unique_ptr<Bucket> clone() override {
+      return std::make_unique<RadosBucket>(*this);
+    }
+    virtual std::unique_ptr<MultipartUpload> get_multipart_upload(
+                               const std::string& oid,
+                               std::optional<std::string> upload_id=std::nullopt,
+                               ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) override;
+    virtual int list_multiparts(const DoutPrefixProvider *dpp,
+                               const std::string& prefix,
+                               std::string& marker,
+                               const std::string& delim,
+                               const int& max_uploads,
+                               std::vector<std::unique_ptr<MultipartUpload>>& uploads,
+                               std::map<std::string, bool> *common_prefixes,
+                               bool *is_truncated) override;
+    virtual int abort_multiparts(const DoutPrefixProvider* dpp,
+                                CephContext* cct) override;
+
+  private:
+    // Bucket/user entrypoint (un)linking; private, used by RadosUser (friend).
+    int link(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint = true, RGWObjVersionTracker* objv = nullptr);
+    int unlink(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint = true);
+    friend class RadosUser;
+};
+
+// One completed part of a RADOS multipart upload. All accessors are thin
+// views over the wrapped RGWUploadPartInfo.
+class RadosMultipartPart : public StoreMultipartPart {
+protected:
+  RGWUploadPartInfo info;
+
+public:
+  RadosMultipartPart() = default;
+  virtual ~RadosMultipartPart() = default;
+
+  // Accessors delegate directly to the corresponding RGWUploadPartInfo field.
+  virtual uint32_t get_num() { return info.num; }
+  virtual uint64_t get_size() { return info.accounted_size; }
+  virtual const std::string& get_etag() { return info.etag; }
+  virtual ceph::real_time& get_mtime() { return info.modified; }
+
+  /* For RadosStore code */
+  RGWObjManifest& get_manifest() { return info.manifest; }
+
+  // RadosMultipartUpload fills `info` directly when listing/completing parts.
+  friend class RadosMultipartUpload;
+};
+
+// RADOS-backed multipart upload. Identity (oid + upload id) lives in mp_obj;
+// owner/mtime are captured at construction.
+class RadosMultipartUpload : public StoreMultipartUpload {
+  RadosStore* store;
+  RGWMPObj mp_obj;
+  ACLOwner owner;
+  ceph::real_time mtime;
+  // NOTE(review): placement and manifest are not set by the constructor —
+  // presumably populated by init()/complete() in the .cc; confirm there.
+  rgw_placement_rule placement;
+  RGWObjManifest manifest;
+
+public:
+  RadosMultipartUpload(RadosStore* _store, Bucket* _bucket, const std::string& oid,
+                       std::optional<std::string> upload_id, ACLOwner owner,
+                       ceph::real_time _mtime)
+      : StoreMultipartUpload(_bucket), store(_store), mp_obj(oid, upload_id),
+        owner(owner), mtime(_mtime) {}
+  virtual ~RadosMultipartUpload() = default;
+
+  // Identity accessors forward to the embedded RGWMPObj.
+  virtual const std::string& get_meta() const override { return mp_obj.get_meta(); }
+  virtual const std::string& get_key() const override { return mp_obj.get_key(); }
+  virtual const std::string& get_upload_id() const override { return mp_obj.get_upload_id(); }
+  virtual const ACLOwner& get_owner() const override { return owner; }
+  virtual ceph::real_time& get_mtime() override { return mtime; }
+  virtual std::unique_ptr<rgw::sal::Object> get_meta_obj() override;
+  virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override;
+  virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+                        int num_parts, int marker,
+                        int* next_marker, bool* truncated,
+                        bool assume_unsorted = false) override;
+  virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override;
+  virtual int complete(const DoutPrefixProvider* dpp,
+                      optional_yield y, CephContext* cct,
+                      std::map<int, std::string>& part_etags,
+                      std::list<rgw_obj_index_key>& remove_objs,
+                      uint64_t& accounted_size, bool& compressed,
+                      RGWCompressionInfo& cs_info, off_t& ofs,
+                      std::string& tag, ACLOwner& owner,
+                      uint64_t olh_epoch,
+                      rgw::sal::Object* target_obj) override;
+  virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override;
+  virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
+                         optional_yield y,
+                         std::unique_ptr<rgw::sal::Object> _head_obj,
+                         const rgw_user& owner,
+                         const rgw_placement_rule *ptail_placement_rule,
+                         uint64_t part_num,
+                         const std::string& part_num_str) override;
+};
+
+// Multipart-upload lock serializer backed by a cls_lock in its own IoCtx
+// (owned by value here, unlike LCRadosSerializer which holds a pointer).
+class MPRadosSerializer : public StoreMPSerializer {
+  librados::IoCtx ioctx;
+  rados::cls::lock::Lock lock;
+  librados::ObjectWriteOperation op;
+
+public:
+  MPRadosSerializer(const DoutPrefixProvider *dpp, RadosStore* store, RadosObject* obj, const std::string& lock_name);
+
+  virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override;
+  // Releases the cls lock on `oid` (inherited from the Store serializer base).
+  virtual int unlock() override {
+    return lock.unlock(&ioctx, oid);
+  }
+};
+
+// Lifecycle lock serializer. Holds a non-owning IoCtx pointer (set by the
+// constructor in the .cc), in contrast to MPRadosSerializer's by-value IoCtx.
+class LCRadosSerializer : public StoreLCSerializer {
+  librados::IoCtx* ioctx;
+  rados::cls::lock::Lock lock;
+
+public:
+  LCRadosSerializer(RadosStore* store, const std::string& oid, const std::string& lock_name, const std::string& cookie);
+
+  virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override;
+  // Releases the cls lock on `oid` (inherited from the serializer base).
+  virtual int unlock() override {
+    return lock.unlock(ioctx, oid);
+  }
+};
+
+// RADOS-backed lifecycle (LC) state store: entry/head persistence plus a
+// factory for the LC lock serializer above.
+class RadosLifecycle : public StoreLifecycle {
+  RadosStore* store;
+
+public:
+  RadosLifecycle(RadosStore* _st) : store(_st) {}
+
+  // Bring in the base-class get_entry overloads alongside the override below.
+  using StoreLifecycle::get_entry;
+  virtual int get_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) override;
+  virtual int get_next_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) override;
+  virtual int set_entry(const std::string& oid, LCEntry& entry) override;
+  virtual int list_entries(const std::string& oid, const std::string& marker,
+                          uint32_t max_entries,
+                          std::vector<std::unique_ptr<LCEntry>>& entries) override;
+  virtual int rm_entry(const std::string& oid, LCEntry& entry) override;
+  virtual int get_head(const std::string& oid, std::unique_ptr<LCHead>* head) override;
+  virtual int put_head(const std::string& oid, LCHead& head) override;
+  virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
+                                                      const std::string& oid,
+                                                      const std::string& cookie) override;
+};
+
+// Bucket-notification implementation that wraps a RADOS notify reservation.
+class RadosNotification : public StoreNotification {
+  RadosStore* store;
+  /* XXX it feels incorrect to me that rgw::notify::reservation_t is
+   * currently RADOS-specific; instead, I think notification types such as
+   * reservation_t should be generally visible, whereas the internal
+   * notification behavior should be made portable (e.g., notification
+   * to non-RADOS message sinks) */
+  rgw::notify::reservation_t res;
+
+  public:
+    // req_state-based construction (normal request path).
+    RadosNotification(const DoutPrefixProvider* _dpp, RadosStore* _store, Object* _obj, Object* _src_obj, req_state* _s, rgw::notify::EventType _type, const std::string* object_name=nullptr) :
+      StoreNotification(_obj, _src_obj, _type), store(_store), res(_dpp, _store, _s, _obj, _src_obj, object_name) { }
+
+    // Construction without a req_state, from explicit bucket/user/request ids.
+    RadosNotification(const DoutPrefixProvider* _dpp, RadosStore* _store, Object* _obj, Object* _src_obj, rgw::notify::EventType _type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y) :
+      StoreNotification(_obj, _src_obj, _type), store(_store), res(_dpp, _store, _obj, _src_obj, _bucket, _user_id, _user_tenant, _req_id, y) {}
+
+    ~RadosNotification() = default;
+
+    rgw::notify::reservation_t& get_reservation(void) {
+      return res;
+    }
+
+    // Two-phase publish: reserve before the op, commit with the result.
+    virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) override;
+    virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+                              const ceph::real_time& mtime, const std::string& etag, const std::string& version) override;
+};
+
+// Writer for whole-object (atomic) PUTs; wraps an AtomicObjectProcessor.
+class RadosAtomicWriter : public StoreWriter {
+protected:
+  rgw::sal::RadosStore* store;
+  std::unique_ptr<Aio> aio;
+  RGWObjectCtx* obj_ctx;
+  rgw::putobj::AtomicObjectProcessor processor;
+
+public:
+  RadosAtomicWriter(const DoutPrefixProvider *dpp,
+                   optional_yield y,
+                   std::unique_ptr<rgw::sal::Object> _head_obj,
+                   RadosStore* _store, std::unique_ptr<Aio> _aio,
+                   const rgw_user& owner,
+                   const rgw_placement_rule *ptail_placement_rule,
+                   uint64_t olh_epoch,
+                   const std::string& unique_tag) :
+                       StoreWriter(dpp, y),
+                       store(_store),
+                       aio(std::move(_aio)),
+                       // NOTE(review): assumes _head_obj is a RadosObject; a
+                       // failed dynamic_cast yields nullptr and the deref is
+                       // UB — confirm callers only pass RadosObject here.
+                       obj_ctx(&dynamic_cast<RadosObject*>(_head_obj.get())->get_ctx()),
+                       processor(&*aio, store,
+                                 ptail_placement_rule, owner, 
+                                 *obj_ctx,
+                                 std::move(_head_obj), olh_epoch, unique_tag,
+                                 dpp, y)
+  {}
+  ~RadosAtomicWriter() = default;
+
+  // prepare to start processing object data
+  virtual int prepare(optional_yield y) override;
+
+  // Process a bufferlist
+  virtual int process(bufferlist&& data, uint64_t offset) override;
+
+  // complete the operation and make its result visible to clients
+  virtual int complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled,
+                       optional_yield y) override;
+};
+
+// Writer for append operations at a given byte position; wraps an
+// AppendObjectProcessor. Structure mirrors RadosAtomicWriter.
+class RadosAppendWriter : public StoreWriter {
+protected:
+  rgw::sal::RadosStore* store;
+  std::unique_ptr<Aio> aio;
+  RGWObjectCtx* obj_ctx;
+  rgw::putobj::AppendObjectProcessor processor;
+
+public:
+  RadosAppendWriter(const DoutPrefixProvider *dpp,
+                   optional_yield y,
+                   std::unique_ptr<rgw::sal::Object> _head_obj,
+                   RadosStore* _store, std::unique_ptr<Aio> _aio,
+                   const rgw_user& owner,
+                   const rgw_placement_rule *ptail_placement_rule,
+                   const std::string& unique_tag,
+                   uint64_t position,
+                   uint64_t *cur_accounted_size) :
+                       StoreWriter(dpp, y),
+                       store(_store),
+                       aio(std::move(_aio)),
+                       // NOTE(review): assumes _head_obj is a RadosObject; a
+                       // failed dynamic_cast would make this deref UB.
+                       obj_ctx(&dynamic_cast<RadosObject*>(_head_obj.get())->get_ctx()),
+                       processor(&*aio, store,
+                                 ptail_placement_rule, owner,
+                                 *obj_ctx,
+                                 std::move(_head_obj), unique_tag, position,
+                                 cur_accounted_size, dpp, y)
+  {}
+  ~RadosAppendWriter() = default;
+
+  // prepare to start processing object data
+  virtual int prepare(optional_yield y) override;
+
+  // Process a bufferlist
+  virtual int process(bufferlist&& data, uint64_t offset) override;
+
+  // complete the operation and make its result visible to clients
+  virtual int complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled,
+                       optional_yield y) override;
+};
+
+// Writer for a single multipart part; wraps a MultipartObjectProcessor keyed
+// by the upload id taken from the owning MultipartUpload.
+class RadosMultipartWriter : public StoreWriter {
+protected:
+  rgw::sal::RadosStore* store;
+  std::unique_ptr<Aio> aio;
+  RGWObjectCtx* obj_ctx;
+  rgw::putobj::MultipartObjectProcessor processor;
+
+public:
+  RadosMultipartWriter(const DoutPrefixProvider *dpp,
+                      optional_yield y, MultipartUpload* upload,
+                      std::unique_ptr<rgw::sal::Object> _head_obj,
+                      RadosStore* _store, std::unique_ptr<Aio> _aio,
+                      const rgw_user& owner,
+                      const rgw_placement_rule *ptail_placement_rule,
+                      uint64_t part_num, const std::string& part_num_str) :
+                       StoreWriter(dpp, y),
+                       store(_store),
+                       aio(std::move(_aio)),
+                       // NOTE(review): assumes _head_obj is a RadosObject; a
+                       // failed dynamic_cast would make this deref UB.
+                       obj_ctx(&dynamic_cast<RadosObject*>(_head_obj.get())->get_ctx()),
+                       processor(&*aio, store,
+                                 ptail_placement_rule, owner,
+                                 *obj_ctx,
+                                 std::move(_head_obj), upload->get_upload_id(),
+                                 part_num, part_num_str, dpp, y)
+  {}
+  ~RadosMultipartWriter() = default;
+
+  // prepare to start processing object data
+  virtual int prepare(optional_yield y) override;
+
+  // Process a bufferlist
+  virtual int process(bufferlist&& data, uint64_t offset) override;
+
+  // complete the operation and make its result visible to clients
+  virtual int complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled,
+                       optional_yield y) override;
+};
+
+// RADOS-backed store for Lua scripts and packages, persisted in `pool`.
+class RadosLuaManager : public StoreLuaManager {
+  RadosStore* const store;
+  rgw_pool pool;
+
+public:
+  RadosLuaManager(RadosStore* _s);
+  virtual ~RadosLuaManager() = default;
+
+  // NOTE(review): unlike every other class in this header, these virtuals are
+  // not marked `override` — confirm they match StoreLuaManager's signatures
+  // and add `override` in a follow-up.
+  virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script);
+  virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script);
+  virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key);
+  virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name);
+  virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name);
+  virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages);
+};
+
+// OIDC provider record persisted via RADOS. encode/decode forward to the
+// RGWOIDCProvider base so the wire format stays identical to the base class.
+class RadosOIDCProvider : public RGWOIDCProvider {
+  RadosStore* store;
+public:
+  RadosOIDCProvider(RadosStore* _store) : store(_store) {}
+  ~RadosOIDCProvider() = default;
+
+  virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) override;
+  virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) override;
+  virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override;
+  void encode(bufferlist& bl) const {
+    RGWOIDCProvider::encode(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    RGWOIDCProvider::decode(bl);
+  }
+};
+
+// IAM role persisted via RADOS; storage/lookup overrides of RGWRole.
+class RadosRole : public RGWRole {
+  RadosStore* store;
+public:
+  // NOTE(review): the by-value std::string/multimap params are copied again
+  // into RGWRole — std::move'ing them into the base ctor call would avoid
+  // the second copy, if RGWRole takes them by value; confirm and follow up.
+  RadosRole(RadosStore* _store, std::string name,
+          std::string tenant,
+          std::string path,
+          std::string trust_policy,
+          std::string max_session_duration,
+          std::multimap<std::string,std::string> tags) : RGWRole(name, tenant, path, trust_policy, max_session_duration, tags), store(_store) {}
+  RadosRole(RadosStore* _store, std::string id) : RGWRole(id), store(_store) {}
+  RadosRole(RadosStore* _store, const RGWRoleInfo& info) : RGWRole(info), store(_store) {}
+  RadosRole(RadosStore* _store) : store(_store) {}
+  ~RadosRole() = default;
+
+  virtual int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
+  virtual int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
+  virtual int store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
+  virtual int read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y) override;
+  virtual int read_name(const DoutPrefixProvider *dpp, optional_yield y) override;
+  virtual int read_info(const DoutPrefixProvider *dpp, optional_yield y) override;
+  virtual int create(const DoutPrefixProvider *dpp, bool exclusive, const std::string& role_id, optional_yield y) override;
+  virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override;
+};
+}} // namespace rgw::sal
+
+WRITE_CLASS_ENCODER(rgw::sal::RadosOIDCProvider)
diff --git a/src/rgw/driver/rados/rgw_service.cc b/src/rgw/driver/rados/rgw_service.cc
new file mode 100644 (file)
index 0000000..4fcb1eb
--- /dev/null
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_service.h"
+
+#include "services/svc_finisher.h"
+#include "services/svc_bi_rados.h"
+#include "services/svc_bilog_rados.h"
+#include "services/svc_bucket_sobj.h"
+#include "services/svc_bucket_sync_sobj.h"
+#include "services/svc_cls.h"
+#include "services/svc_config_key_rados.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_meta.h"
+#include "services/svc_meta_be.h"
+#include "services/svc_meta_be_sobj.h"
+#include "services/svc_meta_be_otp.h"
+#include "services/svc_notify.h"
+#include "services/svc_otp.h"
+#include "services/svc_rados.h"
+#include "services/svc_zone.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_quota.h"
+#include "services/svc_sync_modules.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_sys_obj_cache.h"
+#include "services/svc_sys_obj_core.h"
+#include "services/svc_user_rados.h"
+#include "services/svc_role_rados.h"
+
+#include "common/errno.h"
+
+#include "rgw_bucket.h"
+#include "rgw_datalog.h"
+#include "rgw_metadata.h"
+#include "rgw_otp.h"
+#include "rgw_user.h"
+#include "rgw_role.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Members (unique_ptrs) are default-constructed; real setup happens in init().
+RGWServices_Def::RGWServices_Def() = default;
+// Tear services down in reverse dependency order (shutdown() is idempotent).
+RGWServices_Def::~RGWServices_Def()
+{
+  shutdown();
+}
+
+int RGWServices_Def::init(CephContext *cct,
+                         bool have_cache,
+                          bool raw,
+                         bool run_sync,
+                         optional_yield y,
+                          const DoutPrefixProvider *dpp)
+{
+  finisher = std::make_unique<RGWSI_Finisher>(cct);
+  bucket_sobj = std::make_unique<RGWSI_Bucket_SObj>(cct);
+  bucket_sync_sobj = std::make_unique<RGWSI_Bucket_Sync_SObj>(cct);
+  bi_rados = std::make_unique<RGWSI_BucketIndex_RADOS>(cct);
+  bilog_rados = std::make_unique<RGWSI_BILog_RADOS>(cct);
+  cls = std::make_unique<RGWSI_Cls>(cct);
+  config_key_rados = std::make_unique<RGWSI_ConfigKey_RADOS>(cct);
+  datalog_rados = std::make_unique<RGWDataChangesLog>(cct);
+  mdlog = std::make_unique<RGWSI_MDLog>(cct, run_sync);
+  meta = std::make_unique<RGWSI_Meta>(cct);
+  meta_be_sobj = std::make_unique<RGWSI_MetaBackend_SObj>(cct);
+  meta_be_otp = std::make_unique<RGWSI_MetaBackend_OTP>(cct);
+  notify = std::make_unique<RGWSI_Notify>(cct);
+  otp = std::make_unique<RGWSI_OTP>(cct);
+  rados = std::make_unique<RGWSI_RADOS>(cct);
+  zone = std::make_unique<RGWSI_Zone>(cct);
+  zone_utils = std::make_unique<RGWSI_ZoneUtils>(cct);
+  quota = std::make_unique<RGWSI_Quota>(cct);
+  sync_modules = std::make_unique<RGWSI_SyncModules>(cct);
+  sysobj = std::make_unique<RGWSI_SysObj>(cct);
+  sysobj_core = std::make_unique<RGWSI_SysObj_Core>(cct);
+  user_rados = std::make_unique<RGWSI_User_RADOS>(cct);
+  role_rados = std::make_unique<RGWSI_Role_RADOS>(cct);
+
+  if (have_cache) {
+    sysobj_cache = std::make_unique<RGWSI_SysObj_Cache>(dpp, cct);
+  }
+
+  vector<RGWSI_MetaBackend *> meta_bes{meta_be_sobj.get(), meta_be_otp.get()};
+
+  finisher->init();
+  bi_rados->init(zone.get(), rados.get(), bilog_rados.get(), datalog_rados.get());
+  bilog_rados->init(bi_rados.get());
+  bucket_sobj->init(zone.get(), sysobj.get(), sysobj_cache.get(),
+                    bi_rados.get(), meta.get(), meta_be_sobj.get(),
+                    sync_modules.get(), bucket_sync_sobj.get());
+  bucket_sync_sobj->init(zone.get(),
+                         sysobj.get(),
+                         sysobj_cache.get(),
+                         bucket_sobj.get());
+  cls->init(zone.get(), rados.get());
+  config_key_rados->init(rados.get());
+  mdlog->init(rados.get(), zone.get(), sysobj.get(), cls.get());
+  meta->init(sysobj.get(), mdlog.get(), meta_bes);
+  meta_be_sobj->init(sysobj.get(), mdlog.get());
+  meta_be_otp->init(sysobj.get(), mdlog.get(), cls.get());
+  notify->init(zone.get(), rados.get(), finisher.get());
+  otp->init(zone.get(), meta.get(), meta_be_otp.get());
+  rados->init();
+  zone->init(sysobj.get(), rados.get(), sync_modules.get(), bucket_sync_sobj.get());
+  zone_utils->init(rados.get(), zone.get());
+  quota->init(zone.get());
+  sync_modules->init(zone.get());
+  sysobj_core->core_init(rados.get(), zone.get());
+  if (have_cache) {
+    sysobj_cache->init(rados.get(), zone.get(), notify.get());
+    sysobj->init(rados.get(), sysobj_cache.get());
+  } else {
+    sysobj->init(rados.get(), sysobj_core.get());
+  }
+  user_rados->init(rados.get(), zone.get(), sysobj.get(), sysobj_cache.get(),
+                   meta.get(), meta_be_sobj.get(), sync_modules.get());
+  role_rados->init(zone.get(), meta.get(), meta_be_sobj.get(), sysobj.get());
+
+  can_shutdown = true;
+
+  int r = finisher->start(y, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to start finisher service (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  if (!raw) {
+    r = notify->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start notify service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+  }
+
+  r = rados->start(y, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to start rados service (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  if (!raw) {
+    r = zone->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start zone service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    r = datalog_rados->start(dpp, &zone->get_zone(),
+                            zone->get_zone_params(),
+                            rados->get_rados_handle());
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start datalog_rados service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    r = mdlog->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start mdlog service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    r = sync_modules->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start sync modules service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+  }
+
+  r = cls->start(y, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to start cls service (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = config_key_rados->start(y, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to start config_key service (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = zone_utils->start(y, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to start zone_utils service (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = quota->start(y, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to start quota service (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = sysobj_core->start(y, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj_core service (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  if (have_cache) {
+    r = sysobj_cache->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj_cache service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+  }
+
+  r = sysobj->start(y, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj service (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  if (!raw) {
+    r = meta_be_sobj->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start meta_be_sobj service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    r = meta->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start meta service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    r = bucket_sobj->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start bucket service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    r = bucket_sync_sobj->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start bucket_sync service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    r = user_rados->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start user_rados service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    r = otp->start(y, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to start otp service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    r = role_rados->start(y, dpp);
+    if (r < 0) {
+      ldout(cct, 0) << "ERROR: failed to start role_rados service (" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+  }
+
+  /* cache or core services will be started by sysobj */
+
+  return  0;
+}
+
+/* Shut the services down (roughly reverse dependency order).  A no-op
+ * until init() has wired the services (can_shutdown), and idempotent
+ * thereafter (has_shutdown).  Fixed: notify->shutdown() used to be called
+ * a second time further down; the duplicate has been removed. */
+void RGWServices_Def::shutdown()
+{
+  if (!can_shutdown) {
+    return;
+  }
+
+  if (has_shutdown) {
+    return;
+  }
+
+  role_rados->shutdown();
+  datalog_rados.reset();
+  user_rados->shutdown();
+  sync_modules->shutdown();
+  otp->shutdown();
+  notify->shutdown();
+  meta_be_otp->shutdown();
+  meta_be_sobj->shutdown();
+  meta->shutdown();
+  mdlog->shutdown();
+  config_key_rados->shutdown();
+  cls->shutdown();
+  bilog_rados->shutdown();
+  bi_rados->shutdown();
+  bucket_sync_sobj->shutdown();
+  bucket_sobj->shutdown();
+  finisher->shutdown();
+
+  sysobj->shutdown();
+  sysobj_core->shutdown();
+  if (sysobj_cache) {  // only constructed when have_cache was set
+    sysobj_cache->shutdown();
+  }
+  quota->shutdown();
+  zone_utils->shutdown();
+  zone->shutdown();
+  rados->shutdown();
+
+  has_shutdown = true;
+}
+
+
+/* Initialize the owned RGWServices_Def and cache raw, non-owning pointers
+ * to each service, plus interface-typed aliases (e.g. bi = bi_rados) used
+ * by backend-agnostic callers.  Pointers stay valid until shutdown(). */
+int RGWServices::do_init(CephContext *_cct, bool have_cache, bool raw, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp)
+{
+  cct = _cct;
+
+  int r = _svc.init(cct, have_cache, raw, run_sync, y, dpp);
+  if (r < 0) {
+    return r;
+  }
+
+  // concrete services and their interface aliases
+  finisher = _svc.finisher.get();
+  bi_rados = _svc.bi_rados.get();
+  bi = bi_rados;
+  bilog_rados = _svc.bilog_rados.get();
+  bucket_sobj = _svc.bucket_sobj.get();
+  bucket = bucket_sobj;
+  bucket_sync_sobj = _svc.bucket_sync_sobj.get();
+  bucket_sync = bucket_sync_sobj;
+  cls = _svc.cls.get();
+  config_key_rados = _svc.config_key_rados.get();
+  config_key = config_key_rados;
+  datalog_rados = _svc.datalog_rados.get();
+  mdlog = _svc.mdlog.get();
+  meta = _svc.meta.get();
+  meta_be_sobj = _svc.meta_be_sobj.get();
+  meta_be_otp = _svc.meta_be_otp.get();
+  notify = _svc.notify.get();
+  otp = _svc.otp.get();
+  rados = _svc.rados.get();
+  zone = _svc.zone.get();
+  zone_utils = _svc.zone_utils.get();
+  quota = _svc.quota.get();
+  sync_modules = _svc.sync_modules.get();
+  sysobj = _svc.sysobj.get();
+  cache = _svc.sysobj_cache.get();  // may be null when have_cache is false
+  core = _svc.sysobj_core.get();
+  user = _svc.user_rados.get();
+  role = _svc.role_rados.get();
+
+  return 0;
+}
+
+RGWServiceInstance::~RGWServiceInstance() {}
+
+/* Start the service exactly once.  start_state is advanced to StateStarting
+ * *before* do_start() on purpose, so services with circular references can
+ * call start() on each other without infinite recursion (re-entry hits the
+ * early return).  Returns 0 on success or when already starting/started. */
+int RGWServiceInstance::start(optional_yield y, const DoutPrefixProvider *dpp)
+{
+  if (start_state != StateInit) {
+    return 0;
+  }
+
+  start_state = StateStarting; /* was `;;` - stray semicolon removed */
+
+  int r = do_start(y, dpp);
+  if (r < 0) {
+    return r;
+  }
+
+  start_state = StateStarted;
+
+  return 0;
+}
+
+// Defaulted out-of-line: the unique_ptr members hold types that are only
+// forward-declared in the header, so these must be defined here where the
+// types are complete.
+RGWCtlDef::RGWCtlDef() = default;
+RGWCtlDef::~RGWCtlDef() = default;
+RGWCtlDef::_meta::_meta() = default;
+RGWCtlDef::_meta::~_meta() = default;
+
+
+int RGWCtlDef::init(RGWServices& svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp)
+{
+  meta.mgr.reset(new RGWMetadataManager(svc.meta));
+
+  meta.user.reset(RGWUserMetaHandlerAllocator::alloc(svc.user));
+
+  auto sync_module = svc.sync_modules->get_sync_module();
+  if (sync_module) {
+    meta.bucket.reset(sync_module->alloc_bucket_meta_handler());
+    meta.bucket_instance.reset(sync_module->alloc_bucket_instance_meta_handler(driver));
+  } else {
+    meta.bucket.reset(RGWBucketMetaHandlerAllocator::alloc());
+    meta.bucket_instance.reset(RGWBucketInstanceMetaHandlerAllocator::alloc(driver));
+  }
+
+  meta.otp.reset(RGWOTPMetaHandlerAllocator::alloc());
+  meta.role = std::make_unique<rgw::sal::RGWRoleMetadataHandler>(driver, svc.role);
+
+  user.reset(new RGWUserCtl(svc.zone, svc.user, (RGWUserMetadataHandler *)meta.user.get()));
+  bucket.reset(new RGWBucketCtl(svc.zone,
+                                svc.bucket,
+                                svc.bucket_sync,
+                                svc.bi, svc.user));
+  otp.reset(new RGWOTPCtl(svc.zone, svc.otp));
+
+  RGWBucketMetadataHandlerBase *bucket_meta_handler = static_cast<RGWBucketMetadataHandlerBase *>(meta.bucket.get());
+  RGWBucketInstanceMetadataHandlerBase *bi_meta_handler = static_cast<RGWBucketInstanceMetadataHandlerBase *>(meta.bucket_instance.get());
+
+  bucket_meta_handler->init(svc.bucket, bucket.get());
+  bi_meta_handler->init(svc.zone, svc.bucket, svc.bi);
+
+  RGWOTPMetadataHandlerBase *otp_handler = static_cast<RGWOTPMetadataHandlerBase *>(meta.otp.get());
+  otp_handler->init(svc.zone, svc.meta_be_otp, svc.otp);
+
+  user->init(bucket.get());
+  bucket->init(user.get(),
+               (RGWBucketMetadataHandler *)bucket_meta_handler,
+               (RGWBucketInstanceMetadataHandler *)bi_meta_handler,
+              svc.datalog_rados,
+               dpp);
+
+  otp->init((RGWOTPMetadataHandler *)meta.otp.get());
+
+  return 0;
+}
+
+int RGWCtl::init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp)
+{
+  svc = _svc;
+  cct = svc->cct;
+
+  int r = _ctl.init(*svc, driver, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to start init ctls (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  meta.mgr = _ctl.meta.mgr.get();
+  meta.user = _ctl.meta.user.get();
+  meta.bucket = _ctl.meta.bucket.get();
+  meta.bucket_instance = _ctl.meta.bucket_instance.get();
+  meta.otp = _ctl.meta.otp.get();
+  meta.role = _ctl.meta.role.get();
+
+  user = _ctl.user.get();
+  bucket = _ctl.bucket.get();
+  otp = _ctl.otp.get();
+
+  r = meta.user->attach(meta.mgr);
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: failed to start init meta.user ctl (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = meta.bucket->attach(meta.mgr);
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: failed to start init meta.bucket ctl (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = meta.bucket_instance->attach(meta.mgr);
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: failed to start init meta.bucket_instance ctl (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = meta.otp->attach(meta.mgr);
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: failed to start init otp ctl (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = meta.role->attach(meta.mgr);
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: failed to start init otp ctl (" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_service.h b/src/rgw/driver/rados/rgw_service.h
new file mode 100644 (file)
index 0000000..dc49913
--- /dev/null
@@ -0,0 +1,219 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_SERVICE_H
+#define CEPH_RGW_SERVICE_H
+
+
+#include <string>
+#include <vector>
+#include <memory>
+
+#include "common/async/yield_context.h"
+
+#include "rgw_common.h"
+
+struct RGWServices_Def;
+
+class RGWServiceInstance
+{
+  friend struct RGWServices_Def;
+
+protected:
+  CephContext *cct;
+
+  enum StartState {
+    StateInit = 0,
+    StateStarting = 1,
+    StateStarted = 2,
+  } start_state{StateInit};
+
+  virtual void shutdown() {}
+  virtual int do_start(optional_yield, const DoutPrefixProvider *dpp) {
+    return 0;
+  }
+public:
+  RGWServiceInstance(CephContext *_cct) : cct(_cct) {}
+  virtual ~RGWServiceInstance();
+
+  int start(optional_yield y, const DoutPrefixProvider *dpp);
+  bool is_started() {
+    return (start_state == StateStarted);
+  }
+
+  CephContext *ctx() {
+    return cct;
+  }
+};
+
+class RGWSI_Finisher;
+class RGWSI_Bucket;
+class RGWSI_Bucket_SObj;
+class RGWSI_Bucket_Sync;
+class RGWSI_Bucket_Sync_SObj;
+class RGWSI_BucketIndex;
+class RGWSI_BucketIndex_RADOS;
+class RGWSI_BILog_RADOS;
+class RGWSI_Cls;
+class RGWSI_ConfigKey;
+class RGWSI_ConfigKey_RADOS;
+class RGWSI_MDLog;
+class RGWSI_Meta;
+class RGWSI_MetaBackend;
+class RGWSI_MetaBackend_SObj;
+class RGWSI_MetaBackend_OTP;
+class RGWSI_Notify;
+class RGWSI_OTP;
+class RGWSI_RADOS;
+class RGWSI_Zone;
+class RGWSI_ZoneUtils;
+class RGWSI_Quota;
+class RGWSI_SyncModules;
+class RGWSI_SysObj;
+class RGWSI_SysObj_Core;
+class RGWSI_SysObj_Cache;
+class RGWSI_User;
+class RGWSI_User_RADOS;
+class RGWDataChangesLog;
+class RGWSI_Role_RADOS;
+
+/* Owns every RADOS service instance.  init() constructs, wires and starts
+ * them; shutdown() (also run by the destructor) tears them down. */
+struct RGWServices_Def
+{
+  bool can_shutdown{false};  // set once init() has wired the services
+  bool has_shutdown{false};  // makes shutdown() idempotent
+
+  std::unique_ptr<RGWSI_Finisher> finisher;
+  std::unique_ptr<RGWSI_Bucket_SObj> bucket_sobj;
+  std::unique_ptr<RGWSI_Bucket_Sync_SObj> bucket_sync_sobj;
+  std::unique_ptr<RGWSI_BucketIndex_RADOS> bi_rados;
+  std::unique_ptr<RGWSI_BILog_RADOS> bilog_rados;
+  std::unique_ptr<RGWSI_Cls> cls;
+  std::unique_ptr<RGWSI_ConfigKey_RADOS> config_key_rados;
+  std::unique_ptr<RGWSI_MDLog> mdlog;
+  std::unique_ptr<RGWSI_Meta> meta;
+  std::unique_ptr<RGWSI_MetaBackend_SObj> meta_be_sobj;
+  std::unique_ptr<RGWSI_MetaBackend_OTP> meta_be_otp;
+  std::unique_ptr<RGWSI_Notify> notify;
+  std::unique_ptr<RGWSI_OTP> otp;
+  std::unique_ptr<RGWSI_RADOS> rados;
+  std::unique_ptr<RGWSI_Zone> zone;
+  std::unique_ptr<RGWSI_ZoneUtils> zone_utils;
+  std::unique_ptr<RGWSI_Quota> quota;
+  std::unique_ptr<RGWSI_SyncModules> sync_modules;
+  std::unique_ptr<RGWSI_SysObj> sysobj;
+  std::unique_ptr<RGWSI_SysObj_Core> sysobj_core;
+  std::unique_ptr<RGWSI_SysObj_Cache> sysobj_cache;  // only when have_cache
+  std::unique_ptr<RGWSI_User_RADOS> user_rados;
+  std::unique_ptr<RGWDataChangesLog> datalog_rados;
+  std::unique_ptr<RGWSI_Role_RADOS> role_rados;
+
+  RGWServices_Def();
+  ~RGWServices_Def();
+
+  // raw_storage skips the zone-dependent services; see the .cc for order
+  int init(CephContext *cct, bool have_cache, bool raw_storage, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp);
+  void shutdown();
+};
+
+
+/* Convenience facade over RGWServices_Def: owns _svc and exposes raw,
+ * non-owning pointers to each service, including interface-typed aliases
+ * (e.g. `bi` aliases `bi_rados`) populated by do_init(). */
+struct RGWServices
+{
+  RGWServices_Def _svc;
+
+  CephContext *cct;
+
+  RGWSI_Finisher *finisher{nullptr};
+  RGWSI_Bucket *bucket{nullptr};
+  RGWSI_Bucket_SObj *bucket_sobj{nullptr};
+  RGWSI_Bucket_Sync *bucket_sync{nullptr};
+  RGWSI_Bucket_Sync_SObj *bucket_sync_sobj{nullptr};
+  RGWSI_BucketIndex *bi{nullptr};
+  RGWSI_BucketIndex_RADOS *bi_rados{nullptr};
+  RGWSI_BILog_RADOS *bilog_rados{nullptr};
+  RGWSI_Cls *cls{nullptr};
+  RGWSI_ConfigKey_RADOS *config_key_rados{nullptr};
+  RGWSI_ConfigKey *config_key{nullptr};
+  RGWDataChangesLog *datalog_rados{nullptr};
+  RGWSI_MDLog *mdlog{nullptr};
+  RGWSI_Meta *meta{nullptr};
+  RGWSI_MetaBackend *meta_be_sobj{nullptr};
+  RGWSI_MetaBackend *meta_be_otp{nullptr};
+  RGWSI_Notify *notify{nullptr};
+  RGWSI_OTP *otp{nullptr};
+  RGWSI_RADOS *rados{nullptr};
+  RGWSI_Zone *zone{nullptr};
+  RGWSI_ZoneUtils *zone_utils{nullptr};
+  RGWSI_Quota *quota{nullptr};
+  RGWSI_SyncModules *sync_modules{nullptr};
+  RGWSI_SysObj *sysobj{nullptr};
+  RGWSI_SysObj_Cache *cache{nullptr};  // null unless have_cache
+  RGWSI_SysObj_Core *core{nullptr};
+  RGWSI_User *user{nullptr};
+  RGWSI_Role_RADOS *role{nullptr};
+
+  int do_init(CephContext *cct, bool have_cache, bool raw_storage, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp);
+
+  // normal (zone-aware) initialization
+  int init(CephContext *cct, bool have_cache, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp) {
+    return do_init(cct, have_cache, false, run_sync, y, dpp);
+  }
+
+  // raw-storage initialization: no zone-dependent services, no sync
+  int init_raw(CephContext *cct, bool have_cache, optional_yield y, const DoutPrefixProvider *dpp) {
+    return do_init(cct, have_cache, true, false, y, dpp);
+  }
+  void shutdown() {
+    _svc.shutdown();
+  }
+};
+
+class RGWMetadataManager;
+class RGWMetadataHandler;
+class RGWUserCtl;
+class RGWBucketCtl;
+class RGWOTPCtl;
+
+/* Owns the metadata manager/handlers and the user/bucket/otp control
+ * objects; wired together in init(). */
+struct RGWCtlDef {
+  struct _meta {
+    std::unique_ptr<RGWMetadataManager> mgr;
+    // per-section metadata handlers registered with mgr
+    std::unique_ptr<RGWMetadataHandler> bucket;
+    std::unique_ptr<RGWMetadataHandler> bucket_instance;
+    std::unique_ptr<RGWMetadataHandler> user;
+    std::unique_ptr<RGWMetadataHandler> otp;
+    std::unique_ptr<RGWMetadataHandler> role;
+
+    // out-of-line: unique_ptr members of forward-declared types
+    _meta();
+    ~_meta();
+  } meta;
+
+  std::unique_ptr<RGWUserCtl> user;
+  std::unique_ptr<RGWBucketCtl> bucket;
+  std::unique_ptr<RGWOTPCtl> otp;
+
+  RGWCtlDef();
+  ~RGWCtlDef();
+
+  int init(RGWServices& svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp);
+};
+
+/* Facade over RGWCtlDef: owns _ctl and exposes non-owning pointers to its
+ * members; populated by init(). */
+struct RGWCtl {
+  CephContext *cct{nullptr};
+  RGWServices *svc{nullptr};
+
+  RGWCtlDef _ctl;
+
+  struct _meta {
+    RGWMetadataManager *mgr{nullptr};
+
+    RGWMetadataHandler *bucket{nullptr};
+    RGWMetadataHandler *bucket_instance{nullptr};
+    RGWMetadataHandler *user{nullptr};
+    RGWMetadataHandler *otp{nullptr};
+    RGWMetadataHandler *role{nullptr};
+  } meta;
+
+  RGWUserCtl *user{nullptr};
+  RGWBucketCtl *bucket{nullptr};
+  RGWOTPCtl *otp{nullptr};
+
+  int init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp);
+};
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_sync.cc b/src/rgw/driver/rados/rgw_sync.cc
new file mode 100644 (file)
index 0000000..065d209
--- /dev/null
@@ -0,0 +1,2567 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_cls.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "meta sync: ")
+
+using namespace std;
+
+// Object names (in the zone's log pool) used to persist metadata sync state.
+static string mdlog_sync_status_oid = "mdlog.sync-status";
+static string mdlog_sync_status_shard_prefix = "mdlog.sync-status.shard";
+static string mdlog_sync_full_sync_index_prefix = "meta.full-sync.index";
+
+RGWContinuousLeaseCR::~RGWContinuousLeaseCR() {}
+
+RGWSyncErrorLogger::RGWSyncErrorLogger(rgw::sal::RadosStore* _store, const string &oid_prefix, int _num_shards) : store(_store), num_shards(_num_shards) {
+  for (int i = 0; i < num_shards; i++) {
+    oids.push_back(get_shard_oid(oid_prefix, i));
+  }
+}
+/* Return "<oid_prefix>.<shard_id>".  Rewritten without the variable-length
+ * array (char buf[oid_prefix.size() + 16]) the snprintf version used -
+ * VLAs are a compiler extension, not standard C++. */
+string RGWSyncErrorLogger::get_shard_oid(const string& oid_prefix, int shard_id) {
+  return oid_prefix + "." + std::to_string(shard_id);
+}
+
+/* Build a coroutine that appends one sync-error record to the error log.
+ * The entry is distributed round-robin over the shard objects via an
+ * atomic counter; caller owns the returned coroutine. */
+RGWCoroutine *RGWSyncErrorLogger::log_error_cr(const DoutPrefixProvider *dpp, const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message) {
+  cls_log_entry entry;
+
+  rgw_sync_error_info info(source_zone, error_code, message);
+  bufferlist bl;
+  encode(info, bl);
+  store->svc()->cls->timelog.prepare_entry(entry, real_clock::now(), section, name, bl);
+
+  // round-robin shard selection; counter is presumably atomic for
+  // concurrent callers - TODO confirm against the member declaration
+  uint32_t shard_id = ++counter % num_shards;
+
+
+  return new RGWRadosTimelogAddCR(dpp, store, oids[shard_id], entry);
+}
+
+/* Advance the exponential backoff interval: 0 becomes 1, otherwise the
+ * wait doubles, and the result is capped at max_secs. */
+void RGWSyncBackoff::update_wait_time()
+{
+  cur_wait = (cur_wait == 0) ? 1 : (cur_wait << 1);
+  if (cur_wait >= max_secs) {
+    cur_wait = max_secs;
+  }
+}
+
+// Blocking variant: advance the backoff and sleep() the calling thread.
+void RGWSyncBackoff::backoff_sleep()
+{
+  update_wait_time();
+  sleep(cur_wait);
+}
+
+// Coroutine variant: advance the backoff and suspend `op` for the interval
+// instead of blocking a thread.
+void RGWSyncBackoff::backoff(RGWCoroutine *op)
+{
+  update_wait_time();
+  op->wait(utime_t(cur_wait, 0));
+}
+
+/* Repeatedly run the coroutine from alloc_cr() with exponential backoff
+ * until it succeeds.  -EBUSY/-EAGAIN are always retried; other errors are
+ * fatal only when exit_on_error is set.  On success, an optional finisher
+ * coroutine (alloc_finisher_cr()) runs before completion.
+ * NOTE: `cr` is published under `lock` so an external caller can cancel
+ * the in-flight coroutine. */
+int RGWBackoffControlCR::operate(const DoutPrefixProvider *dpp) {
+  reenter(this) {
+    // retry the operation until it succeeds
+    while (true) {
+      yield {
+       std::lock_guard l{lock};
+        cr = alloc_cr();
+        cr->get();
+        call(cr);
+      }
+      {
+       std::lock_guard l{lock};
+        cr->put();
+        cr = NULL;
+      }
+      if (retcode >= 0) {
+        break;
+      }
+      if (retcode != -EBUSY && retcode != -EAGAIN) {
+        ldout(cct, 0) << "ERROR: RGWBackoffControlCR called coroutine returned " << retcode << dendl;
+        if (exit_on_error) {
+          return set_cr_error(retcode);
+        }
+      }
+      // reset_backoff may be flipped by the child to restart the window
+      if (reset_backoff) {
+        backoff.reset();
+      }
+      yield backoff.backoff(this);
+    }
+
+    // run an optional finisher
+    yield call(alloc_finisher_cr());
+    if (retcode < 0) {
+      ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl;
+      return set_cr_error(retcode);
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// Parse the remote /admin/log (type=metadata) info response.
+void rgw_mdlog_info::decode_json(JSONObj *obj) {
+  // the remote side reports the shard count under the key "num_objects"
+  JSONDecoder::decode_json("num_objects", num_shards, obj);
+  JSONDecoder::decode_json("period", period, obj);
+  JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
+}
+
+// Parse one remote mdlog entry; the wire timestamp is a utime_t that is
+// converted to real_time for local use.
+void rgw_mdlog_entry::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("id", id, obj);
+  JSONDecoder::decode_json("section", section, obj);
+  JSONDecoder::decode_json("name", name, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("timestamp", ut, obj);
+  timestamp = ut.to_real_time();
+  JSONDecoder::decode_json("data", log_data, obj);
+}
+
+// Parse one shard's worth of remote mdlog listing output.
+// (Removed the stray trailing semicolon after the function body.)
+void rgw_mdlog_shard_data::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("marker", marker, obj);
+  JSONDecoder::decode_json("truncated", truncated, obj);
+  JSONDecoder::decode_json("entries", entries, obj);
+}
+
+/* Spawn children from spawn_next() while keeping at most max_concurrent in
+ * flight, funnelling each completion through handle_result(); the last
+ * (most recent) failing status is reported via set_cr_error(). */
+int RGWShardCollectCR::operate(const DoutPrefixProvider *dpp) {
+  reenter(this) {
+    // dispatch loop: keep the window full while work remains
+    while (spawn_next()) {
+      current_running++;
+
+      if (current_running >= max_concurrent) {
+        int child_ret;
+        yield wait_for_child();
+        if (collect_next(&child_ret)) {
+          current_running--;
+          child_ret = handle_result(child_ret);
+          if (child_ret < 0) {
+            status = child_ret;
+          }
+        }
+      }
+    }
+    // drain loop: collect the children still in flight
+    while (current_running > 0) {
+      int child_ret;
+      yield wait_for_child();
+      if (collect_next(&child_ret)) {
+        current_running--;
+        child_ret = handle_result(child_ret);
+        if (child_ret < 0) {
+          status = child_ret;
+        }
+      }
+    }
+    if (status < 0) {
+      return set_cr_error(status);
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+/* Fan-out coroutine that fetches RGWMetadataLogInfo for every shard of the
+ * remote mdlog for `period`, up to READ_MDLOG_MAX_CONCURRENT at a time.
+ * Results land in *mdlog_info keyed by shard id. */
+class RGWReadRemoteMDLogInfoCR : public RGWShardCollectCR {
+  RGWMetaSyncEnv *sync_env;
+
+  const std::string& period;
+  int num_shards;
+  map<int, RGWMetadataLogInfo> *mdlog_info;
+
+  int shard_id;
+// also defined identically in RGWListRemoteMDLogCR below - benign
+#define READ_MDLOG_MAX_CONCURRENT 10
+
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to fetch mdlog status: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+public:
+  // NOTE(review): `period` is a reference member bound to the caller's
+  // string - the caller's argument must outlive this coroutine
+  RGWReadRemoteMDLogInfoCR(RGWMetaSyncEnv *_sync_env,
+                     const std::string& period, int _num_shards,
+                     map<int, RGWMetadataLogInfo> *_mdlog_info) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
+                                                                 sync_env(_sync_env),
+                                                                 period(period), num_shards(_num_shards),
+                                                                 mdlog_info(_mdlog_info), shard_id(0) {}
+  bool spawn_next() override;
+};
+
+/* Fan-out coroutine that lists entries from a set of remote mdlog shards
+ * (starting at each shard's marker) with bounded concurrency; per-shard
+ * results are written to *result. */
+class RGWListRemoteMDLogCR : public RGWShardCollectCR {
+  RGWMetaSyncEnv *sync_env;
+
+  const std::string& period;
+  map<int, string> shards;  // shard id -> start marker (swapped in from caller)
+  int max_entries_per_shard;
+  map<int, rgw_mdlog_shard_data> *result;
+
+  map<int, string>::iterator iter;
+#define READ_MDLOG_MAX_CONCURRENT 10
+
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to list remote mdlog shard: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+public:
+  // NOTE(review): `period` is a reference member bound to the caller's
+  // string; `_shards` is consumed (swapped empty)
+  RGWListRemoteMDLogCR(RGWMetaSyncEnv *_sync_env,
+                     const std::string& period, map<int, string>& _shards,
+                     int _max_entries_per_shard,
+                     map<int, rgw_mdlog_shard_data> *_result) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
+                                                                 sync_env(_sync_env), period(period),
+                                                                 max_entries_per_shard(_max_entries_per_shard),
+                                                                 result(_result) {
+    shards.swap(_shards);
+    iter = shards.begin();
+  }
+  bool spawn_next() override;
+};
+
+/* Fetch the remote mdlog info (shard count, period, realm epoch) from the
+ * master's /admin/log endpoint.  Returns 0 on success or the connection
+ * error. */
+int RGWRemoteMetaLog::read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info)
+{
+  rgw_http_param_pair pairs[] = { { "type", "metadata" },
+                                  { NULL, NULL } };
+
+  int ret = conn->get_json_resource(dpp, "/admin/log", pairs, null_yield, *log_info);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog info" << dendl;
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20) << "remote mdlog, num_shards=" << log_info->num_shards << dendl;
+
+  return 0;
+}
+
+/* Read per-shard status of the master's mdlog for master_period into
+ * *shards_info.  A no-op (returns 0) on the meta master itself. */
+int RGWRemoteMetaLog::read_master_log_shards_info(const DoutPrefixProvider *dpp, const string &master_period, map<int, RGWMetadataLogInfo> *shards_info)
+{
+  if (store->svc()->zone->is_meta_master()) {
+    return 0;
+  }
+
+  rgw_mdlog_info log_info;
+  int ret = read_log_info(dpp, &log_info);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return run(dpp, new RGWReadRemoteMDLogInfoCR(&sync_env, master_period, log_info.num_shards, shards_info));
+}
+
+/* List the next entry from each given master mdlog shard, starting at the
+ * supplied markers (max 1 entry per shard - used for polling).  A no-op on
+ * the meta master. */
+int RGWRemoteMetaLog::read_master_log_shards_next(const DoutPrefixProvider *dpp, const string& period, map<int, string> shard_markers, map<int, rgw_mdlog_shard_data> *result)
+{
+  if (store->svc()->zone->is_meta_master()) {
+    return 0;
+  }
+
+  return run(dpp, new RGWListRemoteMDLogCR(&sync_env, period, shard_markers, 1, result));
+}
+
+/* Set up the remote-metadata-log machinery: master connection, HTTP
+ * manager, error logger, sync environment and trace node. */
+int RGWRemoteMetaLog::init()
+{
+  conn = store->svc()->zone->get_master_conn();
+
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+
+  // NOTE(review): raw new - error_logger is presumably deleted in the
+  // class's teardown path (not visible here); confirm ownership
+  error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);
+
+  init_sync_env(&sync_env);
+
+  tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "meta");
+
+  return 0;
+}
+
+#define CLONE_MAX_ENTRIES 100
+
+/* Initialize metadata sync status tracking: open the log pool, start the
+ * remote log, read the persisted sync status and precompute per-shard
+ * status object names.  A no-op on the meta master; -EIO without a master
+ * connection. */
+int RGWMetaSyncStatusManager::init(const DoutPrefixProvider *dpp)
+{
+  if (store->svc()->zone->is_meta_master()) {
+    return 0;
+  }
+
+  if (!store->svc()->zone->get_master_conn()) {
+    ldpp_dout(dpp, -1) << "no REST connection to master zone" << dendl;
+    return -EIO;
+  }
+
+  int r = rgw_init_ioctx(dpp, store->getRados()->get_rados_handle(), store->svc()->zone->get_zone_params().log_pool, ioctx, true);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to open log pool (" << store->svc()->zone->get_zone_params().log_pool << " ret=" << r << dendl;
+    return r;
+  }
+
+  r = master_log.init();
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to init remote log, r=" << r << dendl;
+    return r;
+  }
+
+  RGWMetaSyncEnv& sync_env = master_log.get_sync_env();
+
+  // ENOENT just means no prior sync run; start from an empty status
+  rgw_meta_sync_status sync_status;
+  r = read_sync_status(dpp, &sync_status);
+  if (r < 0 && r != -ENOENT) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to read sync status, r=" << r << dendl;
+    return r;
+  }
+
+  int num_shards = sync_status.sync_info.num_shards;
+
+  for (int i = 0; i < num_shards; i++) {
+    shard_objs[i] = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env.shard_obj_name(i));
+  }
+
+  std::unique_lock wl{ts_to_shard_lock};
+  for (int i = 0; i < num_shards; i++) {
+    clone_markers.push_back(string());
+    utime_shard ut;
+    ut.shard_id = i;
+    ts_to_shard[ut] = i;
+  }
+
+  return 0;
+}
+
+/* Stash non-owning pointers to everything the metadata sync coroutines
+ * need; all arguments must outlive this environment. */
+void RGWMetaSyncEnv::init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _store, RGWRESTConn *_conn,
+                          RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
+                          RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer) {
+  dpp = _dpp;
+  cct = _cct;
+  store = _store;
+  conn = _conn;
+  async_rados = _async_rados;
+  http_manager = _http_manager;
+  error_logger = _error_logger;
+  sync_tracer = _sync_tracer;
+}
+
+// Name of the single well-known object that stores overall sync status.
+string RGWMetaSyncEnv::status_oid()
+{
+  return string(mdlog_sync_status_oid);
+}
+
+// Build the per-shard sync-status object name: "<prefix>.<shard_id>".
+string RGWMetaSyncEnv::shard_obj_name(int shard_id)
+{
+  // Plain string concatenation replaces the previous snprintf() into a
+  // runtime-sized char array (a non-standard VLA in C++); the produced
+  // name is identical ("%d" == std::to_string for int).
+  return mdlog_sync_status_shard_prefix + "." + std::to_string(shard_id);
+}
+
+// Async rados request that lists up to max_entries mdlog entries from one
+// shard, starting at 'marker'. Runs on the RGWAsyncRadosProcessor thread
+// pool; results are left in the public marker/entries/truncated fields for
+// the calling coroutine to collect.
+class RGWAsyncReadMDLogEntries : public RGWAsyncRadosRequest {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  RGWMetadataLog *mdlog;
+  int shard_id;
+  int max_entries;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override {
+    // zero time bounds: list from the stored marker, not by time window
+    real_time from_time;
+    real_time end_time;
+
+    void *handle;
+
+    mdlog->init_list_entries(shard_id, from_time, end_time, marker, &handle);
+
+    // updates marker/entries/truncated in place for the caller to read
+    int ret = mdlog->list_entries(dpp, handle, max_entries, entries, &marker, &truncated);
+
+    mdlog->complete_list_entries(handle);
+
+    return ret;
+  }
+public:
+  string marker;
+  list<cls_log_entry> entries;
+  bool truncated;
+
+  RGWAsyncReadMDLogEntries(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                           RGWMetadataLog* mdlog, int _shard_id,
+                           std::string _marker, int _max_entries)
+    : RGWAsyncRadosRequest(caller, cn), dpp(dpp), store(_store), mdlog(mdlog),
+      shard_id(_shard_id), max_entries(_max_entries), marker(std::move(_marker)) {}
+};
+
+// Coroutine wrapper around RGWAsyncReadMDLogEntries: queues the async list
+// request on send, then moves the resulting marker/entries/truncated into
+// the caller-supplied out-params on completion.
+class RGWReadMDLogEntriesCR : public RGWSimpleCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  RGWMetadataLog *const mdlog;
+  int shard_id;
+  string marker;
+  string *pmarker;
+  int max_entries;
+  list<cls_log_entry> *entries;
+  bool *truncated;
+
+  RGWAsyncReadMDLogEntries *req{nullptr};
+
+public:
+  RGWReadMDLogEntriesCR(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
+                        int _shard_id, string*_marker, int _max_entries,
+                        list<cls_log_entry> *_entries, bool *_truncated)
+    : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
+      shard_id(_shard_id), pmarker(_marker), max_entries(_max_entries),
+      entries(_entries), truncated(_truncated) {}
+
+  ~RGWReadMDLogEntriesCR() override {
+    if (req) {
+      req->finish();  // drop the async request if we're torn down early
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    // snapshot the caller's marker; it is only written back on completion
+    marker = *pmarker;
+    req = new RGWAsyncReadMDLogEntries(dpp, this, stack->create_completion_notifier(),
+                                       sync_env->store, mdlog, shard_id, marker,
+                                       max_entries);
+    sync_env->async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    *pmarker = std::move(req->marker);
+    *entries = std::move(req->entries);
+    *truncated = req->truncated;
+    return req->get_ret_status();
+  }
+};
+
+
+// Coroutine that fetches one mdlog shard's info (marker, last update) from
+// the master zone via GET /admin/log/?type=metadata&id=<shard>&period=...&info.
+// NOTE(review): 'period' is a reference member bound to the caller's string;
+// the caller must keep it alive for this coroutine's lifetime — confirm.
+class RGWReadRemoteMDLogShardInfoCR : public RGWCoroutine {
+  RGWMetaSyncEnv *env;
+  RGWRESTReadResource *http_op;
+
+  const std::string& period;
+  int shard_id;
+  RGWMetadataLogInfo *shard_info;
+
+public:
+  RGWReadRemoteMDLogShardInfoCR(RGWMetaSyncEnv *env, const std::string& period,
+                                int _shard_id, RGWMetadataLogInfo *_shard_info)
+    : RGWCoroutine(env->store->ctx()), env(env), http_op(NULL),
+      period(period), shard_id(_shard_id), shard_info(_shard_info) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    auto store = env->store;
+    RGWRESTConn *conn = store->svc()->zone->get_master_conn();
+    reenter(this) {
+      // stage 1: issue the async REST read, then block until it completes
+      yield {
+       char buf[16];
+       snprintf(buf, sizeof(buf), "%d", shard_id);
+        rgw_http_param_pair pairs[] = { { "type" , "metadata" },
+                                       { "id", buf },
+                                       { "period", period.c_str() },
+                                       { "info" , NULL },
+                                       { NULL, NULL } };
+
+        string p = "/admin/log/";
+
+        http_op = new RGWRESTReadResource(conn, p, pairs, NULL,
+                                          env->http_manager);
+
+        init_new_io(http_op);
+
+        int ret = http_op->aio_read(dpp);
+        if (ret < 0) {
+          ldpp_dout(env->dpp, 0) << "ERROR: failed to read from " << p << dendl;
+          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+          http_op->put();
+          return set_cr_error(ret);
+        }
+
+        return io_block(0);
+      }
+      // stage 2: decode the response into *shard_info and release the op
+      yield {
+        int ret = http_op->wait(shard_info, null_yield);
+        http_op->put();
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        return set_cr_done();
+      }
+    }
+    return 0;
+  }
+};
+
+RGWCoroutine* create_read_remote_mdlog_shard_info_cr(RGWMetaSyncEnv *env,
+                                                     const std::string& period,
+                                                     int shard_id,
+                                                     RGWMetadataLogInfo* info)
+{
+  return new RGWReadRemoteMDLogShardInfoCR(env, period, shard_id, info);
+}
+
+// Coroutine that lists entries of one remote mdlog shard from the master
+// zone (GET /admin/log/?type=metadata&id=...&period=...&max-entries=...),
+// optionally resuming from 'marker'. The decoded listing lands in *result.
+class RGWListRemoteMDLogShardCR : public RGWSimpleCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  RGWRESTReadResource *http_op;
+
+  const std::string& period;
+  int shard_id;
+  string marker;
+  uint32_t max_entries;
+  rgw_mdlog_shard_data *result;
+
+public:
+  RGWListRemoteMDLogShardCR(RGWMetaSyncEnv *env, const std::string& period,
+                            int _shard_id, const string& _marker, uint32_t _max_entries,
+                            rgw_mdlog_shard_data *_result)
+    : RGWSimpleCoroutine(env->store->ctx()), sync_env(env), http_op(NULL),
+      period(period), shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    RGWRESTConn *conn = sync_env->conn;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%d", shard_id);
+
+    char max_entries_buf[32];
+    snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
+
+    // an empty param name drops the pair, so "marker" is only sent when set
+    const char *marker_key = (marker.empty() ? "" : "marker");
+
+    rgw_http_param_pair pairs[] = { { "type", "metadata" },
+      { "id", buf },
+      { "period", period.c_str() },
+      { "max-entries", max_entries_buf },
+      { marker_key, marker.c_str() },
+      { NULL, NULL } };
+
+    string p = "/admin/log/";
+
+    http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+    init_new_io(http_op);
+
+    int ret = http_op->aio_read(dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+      log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+      http_op->put();
+      return ret;
+    }
+
+    return 0;
+  }
+
+  int request_complete() override {
+    int ret = http_op->wait(result, null_yield);
+    http_op->put();
+    if (ret < 0 && ret != -ENOENT) {  // a missing shard listing is not fatal
+      ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote mdlog shard, ret=" << ret << dendl;
+      return ret;
+    }
+    return 0;
+  }
+};
+
+RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env,
+                                                const std::string& period,
+                                                int shard_id,
+                                                const std::string& marker,
+                                                uint32_t max_entries,
+                                                rgw_mdlog_shard_data *result)
+{
+  return new RGWListRemoteMDLogShardCR(env, period, shard_id, marker,
+                                       max_entries, result);
+}
+
+// Fan out one shard-info fetch per mdlog shard; returns false once every
+// shard's coroutine has been spawned.
+bool RGWReadRemoteMDLogInfoCR::spawn_next() {
+  if (shard_id < num_shards) {
+    spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, period, shard_id, &(*mdlog_info)[shard_id]), false);
+    ++shard_id;
+    return true;
+  }
+  return false;
+}
+
+// Fan out one shard-listing coroutine per (shard, marker) pair; returns
+// false when the shard map has been fully consumed.
+bool RGWListRemoteMDLogCR::spawn_next() {
+  if (iter != shards.end()) {
+    spawn(new RGWListRemoteMDLogShardCR(sync_env, period, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false);
+    ++iter;
+    return true;
+  }
+  return false;
+}
+
+// Coroutine that bootstraps metadata sync status: takes the sync lease,
+// writes the overall status object, snapshots each remote mdlog shard's
+// current position into per-shard markers, then flips the state to
+// "building full sync maps" and drops the lease.
+class RGWInitSyncStatusCoroutine : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+
+  rgw_meta_sync_info status;
+  vector<RGWMetadataLogInfo> shards_info;
+  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+public:
+  RGWInitSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
+                             const rgw_meta_sync_info &status)
+    : RGWCoroutine(_sync_env->store->ctx()), sync_env(_sync_env),
+      status(status), shards_info(status.num_shards),
+      lease_cr(nullptr), lease_stack(nullptr) {}
+
+  ~RGWInitSyncStatusCoroutine() override {
+    if (lease_cr) {
+      lease_cr->abort();  // make sure the lease is released on teardown
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    int ret;
+    reenter(this) {
+      // take the continuous lease on the status object so only one
+      // initializer runs at a time
+      yield {
+        set_status("acquiring sync lock");
+       uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+        string lock_name = "sync_lock";
+       rgw::sal::RadosStore* store = sync_env->store;
+        lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
+                                                rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+                                                lock_name, lock_duration, this));
+        lease_stack.reset(spawn(lease_cr.get(), false));
+      }
+      while (!lease_cr->is_locked()) {
+        if (lease_cr->is_done()) {
+          ldpp_dout(dpp, 5) << "failed to take lease" << dendl;
+          set_status("lease lock failed, early abort");
+          return set_cr_error(lease_cr->get_ret_status());
+        }
+        set_sleeping(true);
+        yield;
+      }
+      // persist the overall sync-info object first
+      yield {
+        set_status("writing sync status");
+       rgw::sal::RadosStore* store = sync_env->store;
+        call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, sync_env->async_rados, store->svc()->sysobj,
+                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+                                                           status));
+      }
+
+      if (retcode < 0) {
+        set_status("failed to write sync status");
+        ldpp_dout(dpp, 0) << "ERROR: failed to write sync status, retcode=" << retcode << dendl;
+        yield lease_cr->go_down();
+        return set_cr_error(retcode);
+      }
+      /* fetch current position in logs */
+      set_status("fetching remote log position");
+      yield {
+        for (int i = 0; i < (int)status.num_shards; i++) {
+          spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, status.period, i,
+                                                  &shards_info[i]), false);
+       }
+      }
+
+      drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
+
+      // write each shard's incremental-sync start marker from the info
+      // fetched above
+      yield {
+        set_status("updating sync status");
+        for (int i = 0; i < (int)status.num_shards; i++) {
+         rgw_meta_sync_marker marker;
+          RGWMetadataLogInfo& info = shards_info[i];
+         marker.next_step_marker = info.marker;
+         marker.timestamp = info.last_update;
+         rgw::sal::RadosStore* store = sync_env->store;
+          spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(dpp,
+                                                                sync_env->async_rados,
+                                                                store->svc()->sysobj,
+                                                                rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->shard_obj_name(i)),
+                                                                marker), true);
+        }
+      }
+      yield {
+        set_status("changing sync state: build full sync maps");
+       status.state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
+       rgw::sal::RadosStore* store = sync_env->store;
+        call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, sync_env->async_rados, store->svc()->sysobj,
+                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+                                                           status));
+      }
+      set_status("drop lock lease");
+      yield lease_cr->go_down();
+      // surface any failure from the spawned marker writes
+      while (collect(&ret, NULL)) {
+       if (ret < 0) {
+         return set_cr_error(ret);
+       }
+        yield;
+      }
+      drain_all();
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Shard-collect coroutine that reads every per-shard sync marker object
+// into 'markers', at most MAX_CONCURRENT_SHARDS reads in flight.
+class RGWReadSyncStatusMarkersCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+  RGWMetaSyncEnv *env;
+  const int num_shards;
+  int shard_id{0};
+  map<uint32_t, rgw_meta_sync_marker>& markers;
+
+  // Called per completed child; decides whether a child failure aborts
+  // the collection.
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to read metadata sync markers: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  RGWReadSyncStatusMarkersCR(RGWMetaSyncEnv *env, int num_shards,
+                             map<uint32_t, rgw_meta_sync_marker>& markers)
+    : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS),
+      env(env), num_shards(num_shards), markers(markers)
+  {}
+  bool spawn_next() override;
+};
+
+// Queue a read of one shard's persisted marker; false when all shards
+// have been queued.
+bool RGWReadSyncStatusMarkersCR::spawn_next()
+{
+  if (shard_id >= num_shards) {
+    return false;
+  }
+  rgw_raw_obj marker_obj{env->store->svc()->zone->get_zone_params().log_pool,
+                         env->shard_obj_name(shard_id)};
+  spawn(new RGWSimpleRadosReadCR<rgw_meta_sync_marker>(env->dpp, env->async_rados,
+                                                       env->store->svc()->sysobj,
+                                                       marker_obj, &markers[shard_id]), false);
+  ++shard_id;
+  return true;
+}
+
+// Coroutine that loads the full persisted sync status: the overall
+// sync-info object plus every per-shard marker, into *sync_status.
+class RGWReadSyncStatusCoroutine : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  rgw_meta_sync_status *sync_status;
+
+public:
+  RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
+                             rgw_meta_sync_status *_status)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status)
+  {}
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Two-step read: sync-info first (fail hard on ENOENT so callers can tell
+// sync was never initialized), then the per-shard markers.
+int RGWReadSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    // read sync info
+    using ReadInfoCR = RGWSimpleRadosReadCR<rgw_meta_sync_info>;
+    yield {
+      bool empty_on_enoent = false; // fail on ENOENT
+      rgw_raw_obj obj{sync_env->store->svc()->zone->get_zone_params().log_pool,
+                      sync_env->status_oid()};
+      call(new ReadInfoCR(dpp, sync_env->async_rados, sync_env->store->svc()->sysobj, obj,
+                          &sync_status->sync_info, empty_on_enoent));
+    }
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "failed to read sync status info with "
+          << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+    // read shard markers
+    using ReadMarkersCR = RGWReadSyncStatusMarkersCR;
+    yield call(new ReadMarkersCR(sync_env, sync_status->sync_info.num_shards,
+                                 sync_status->sync_markers));
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "failed to read sync status markers with "
+          << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// Full-sync coroutine: under the sync lease, enumerates every metadata
+// section on the master, lists all keys per section (paged), and appends
+// "section:key" entries to a sharded omap index that incremental sync
+// later consumes. Finally updates each shard marker's total_entries.
+class RGWFetchAllMetaCR : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+
+  int num_shards;
+
+
+  int ret_status;
+
+  list<string> sections;
+  list<string>::iterator sections_iter;
+
+  // JSON shape of one page of GET /admin/metadata/<section> output
+  struct meta_list_result {
+    list<string> keys;
+    string marker;
+    uint64_t count{0};
+    bool truncated{false};
+
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("keys", keys, obj);
+      JSONDecoder::decode_json("marker", marker, obj);
+      JSONDecoder::decode_json("count", count, obj);
+      JSONDecoder::decode_json("truncated", truncated, obj);
+    }
+  } result;
+  list<string>::iterator iter;
+
+  std::unique_ptr<RGWShardedOmapCRManager> entries_index;
+
+  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+  bool lost_lock;
+  bool failed;
+
+  string marker;
+
+  map<uint32_t, rgw_meta_sync_marker>& markers;
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWFetchAllMetaCR(RGWMetaSyncEnv *_sync_env, int _num_shards,
+                    map<uint32_t, rgw_meta_sync_marker>& _markers,
+                    RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+                                                     num_shards(_num_shards),
+                                                     ret_status(0), lease_cr(nullptr), lease_stack(nullptr),
+                                                      lost_lock(false), failed(false), markers(_markers) {
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "fetch_all_meta");
+  }
+
+  ~RGWFetchAllMetaCR() override {
+  }
+
+  // Move 'name' (if present) from all_sections to the back of 'sections'.
+  void append_section_from_set(set<string>& all_sections, const string& name) {
+    set<string>::iterator iter = all_sections.find(name);
+    if (iter != all_sections.end()) {
+      sections.emplace_back(std::move(*iter));
+      all_sections.erase(iter);
+    }
+  }
+  /*
+   * meta sync should go in the following order: user, bucket.instance, bucket
+   * then whatever other sections exist (if any)
+   */
+  void rearrange_sections() {
+    set<string> all_sections;
+    std::move(sections.begin(), sections.end(),
+              std::inserter(all_sections, all_sections.end()));
+    sections.clear();
+
+    append_section_from_set(all_sections, "user");
+    append_section_from_set(all_sections, "bucket.instance");
+    append_section_from_set(all_sections, "bucket");
+    append_section_from_set(all_sections, "roles");
+
+    std::move(all_sections.begin(), all_sections.end(),
+              std::back_inserter(sections));
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    RGWRESTConn *conn = sync_env->conn;
+
+    reenter(this) {
+      // hold the sync lease for the whole enumeration
+      yield {
+        set_status(string("acquiring lock (") + sync_env->status_oid() + ")");
+       uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+        string lock_name = "sync_lock";
+        lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados,
+                                                sync_env->store,
+                                                rgw_raw_obj(sync_env->store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+                                                lock_name, lock_duration, this));
+        lease_stack.reset(spawn(lease_cr.get(), false));
+      }
+      while (!lease_cr->is_locked()) {
+        if (lease_cr->is_done()) {
+          ldpp_dout(dpp, 5) << "failed to take lease" << dendl;
+          set_status("lease lock failed, early abort");
+          return set_cr_error(lease_cr->get_ret_status());
+        }
+        set_sleeping(true);
+        yield;
+      }
+      entries_index.reset(new RGWShardedOmapCRManager(sync_env->async_rados, sync_env->store, this, num_shards,
+                                                      sync_env->store->svc()->zone->get_zone_params().log_pool,
+                                                      mdlog_sync_full_sync_index_prefix));
+      // fetch the list of metadata section names from the master
+      yield {
+       call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
+                                      "/admin/metadata", NULL, &sections));
+      }
+      if (get_ret_status() < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to fetch metadata sections" << dendl;
+        yield entries_index->finish();
+        yield lease_cr->go_down();
+        drain_all();
+       return set_cr_error(get_ret_status());
+      }
+      rearrange_sections();
+      sections_iter = sections.begin();
+      // per section: page through all keys and append them to the index
+      for (; sections_iter != sections.end(); ++sections_iter) {
+        do {
+          yield {
+#define META_FULL_SYNC_CHUNK_SIZE "1000"
+            string entrypoint = string("/admin/metadata/") + *sections_iter;
+            rgw_http_param_pair pairs[] = { { "max-entries", META_FULL_SYNC_CHUNK_SIZE },
+              { "marker", result.marker.c_str() },
+              { NULL, NULL } };
+            result.keys.clear();
+            call(new RGWReadRESTResourceCR<meta_list_result >(cct, conn, sync_env->http_manager,
+                                                              entrypoint, pairs, &result));
+          }
+          ret_status = get_ret_status();
+          if (ret_status == -ENOENT) {  // empty/unknown section: skip, not fatal
+            set_retcode(0); /* reset coroutine status so that we don't return it */
+            ret_status = 0;
+          }
+          if (ret_status < 0) {
+            tn->log(0, SSTR("ERROR: failed to fetch metadata section: " << *sections_iter));
+            yield entries_index->finish();
+            yield lease_cr->go_down();
+            drain_all();
+            return set_cr_error(ret_status);
+          }
+          iter = result.keys.begin();
+          for (; iter != result.keys.end(); ++iter) {
+            if (!lease_cr->is_locked()) {
+              lost_lock = true;
+              tn->log(1, "lease is lost, abort");
+              break;
+            }
+            yield; // allow entries_index consumer to make progress
+
+            tn->log(20, SSTR("list metadata: section=" << *sections_iter << " key=" << *iter));
+            string s = *sections_iter + ":" + *iter;
+            int shard_id;
+           rgw::sal::RadosStore* store = sync_env->store;
+            int ret = store->ctl()->meta.mgr->get_shard_id(*sections_iter, *iter, &shard_id);
+            if (ret < 0) {
+              tn->log(0, SSTR("ERROR: could not determine shard id for " << *sections_iter << ":" << *iter));
+              ret_status = ret;
+              break;
+            }
+            if (!entries_index->append(s, shard_id)) {
+              break;
+            }
+          }
+        } while (result.truncated);
+      }
+      yield {
+        if (!entries_index->finish()) {
+          failed = true;
+        }
+      }
+      // record per-shard totals so progress can be reported during sync
+      if (!failed) {
+        for (map<uint32_t, rgw_meta_sync_marker>::iterator iter = markers.begin(); iter != markers.end(); ++iter) {
+          int shard_id = (int)iter->first;
+          rgw_meta_sync_marker& marker = iter->second;
+          marker.total_entries = entries_index->get_total_entries(shard_id);
+          spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(dpp, sync_env->async_rados, sync_env->store->svc()->sysobj,
+                                                                rgw_raw_obj(sync_env->store->svc()->zone->get_zone_params().log_pool, sync_env->shard_obj_name(shard_id)),
+                                                                marker), true);
+        }
+      }
+
+      drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
+
+      yield lease_cr->go_down();
+
+      int ret;
+      while (collect(&ret, NULL)) {
+       if (ret < 0) {
+         return set_cr_error(ret);
+       }
+        yield;
+      }
+      drain_all();
+      if (failed) {
+        yield return set_cr_error(-EIO);
+      }
+      if (lost_lock) {
+        yield return set_cr_error(-EBUSY);
+      }
+
+      if (ret_status < 0) {
+        yield return set_cr_error(ret_status);
+      }
+
+      yield return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Name of the sharded omap object holding full-sync keys for 'shard_id':
+// "<prefix>.<shard_id>".
+static string full_sync_index_shard_oid(int shard_id)
+{
+  // String concatenation replaces the previous snprintf() into a
+  // runtime-sized char array (a non-standard VLA in C++); output is
+  // identical.
+  return mdlog_sync_full_sync_index_prefix + "." + std::to_string(shard_id);
+}
+
+// Coroutine that fetches one metadata entry (raw bufferlist) from the
+// master zone via GET /admin/metadata/<section>/<url-encoded key>?key=...
+class RGWReadRemoteMetadataCR : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+
+  RGWRESTReadResource *http_op;
+
+  string section;
+  string key;
+
+  bufferlist *pbl;
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWReadRemoteMetadataCR(RGWMetaSyncEnv *_sync_env,
+                                                      const string& _section, const string& _key, bufferlist *_pbl,
+                                                      const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+                                                      http_op(NULL),
+                                                      section(_section),
+                                                      key(_key),
+                                                     pbl(_pbl) {
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "read_remote_meta",
+                                         section + ":" + key);
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    RGWRESTConn *conn = sync_env->conn;
+    reenter(this) {
+      // stage 1: issue the async REST read
+      yield {
+        string key_encode;
+        url_encode(key, key_encode);
+        // NOTE(review): the path uses the url-encoded key while the "key"
+        // query param carries the raw key — presumably the REST layer
+        // encodes params itself; confirm against RGWRESTConn.
+        rgw_http_param_pair pairs[] = { { "key" , key.c_str()},
+                                       { NULL, NULL } };
+
+        string p = string("/admin/metadata/") + section + "/" + key_encode;
+
+        http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+
+        init_new_io(http_op);
+
+        int ret = http_op->aio_read(dpp);
+        if (ret < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl;
+          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+          http_op->put();
+          return set_cr_error(ret);
+        }
+
+        return io_block(0);
+      }
+      // stage 2: collect the raw response body into *pbl
+      yield {
+        int ret = http_op->wait(pbl, null_yield);
+        http_op->put();
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        return set_cr_done();
+      }
+    }
+    return 0;
+  }
+};
+
+// Async rados request that applies (stores) one metadata entry locally via
+// the metadata manager, unconditionally (APPLY_ALWAYS).
+class RGWAsyncMetaStoreEntry : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  string raw_key;
+  bufferlist bl;
+  const DoutPrefixProvider *dpp;
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override {
+    int ret = store->ctl()->meta.mgr->put(raw_key, bl, null_yield, dpp, RGWMDLogSyncType::APPLY_ALWAYS, true);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: can't store key: " << raw_key << " ret=" << ret << dendl;
+      return ret;
+    }
+    return 0;
+  }
+public:
+  RGWAsyncMetaStoreEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                       const string& _raw_key,
+                       bufferlist& _bl,
+                       const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn), store(_store),
+                                          raw_key(_raw_key), bl(_bl), dpp(dpp) {}
+};
+
+
+// Coroutine wrapper around RGWAsyncMetaStoreEntry: stores one metadata
+// entry ("section:key" -> bl) through the async rados processor.
+class RGWMetaStoreEntryCR : public RGWSimpleCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  string raw_key;
+  bufferlist bl;
+
+  RGWAsyncMetaStoreEntry *req;
+
+public:
+  RGWMetaStoreEntryCR(RGWMetaSyncEnv *_sync_env,
+                       const string& _raw_key,
+                       bufferlist& _bl) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
+                                          raw_key(_raw_key), bl(_bl), req(NULL) {
+  }
+
+  ~RGWMetaStoreEntryCR() override {
+    if (req) {
+      req->finish();  // release the async request if torn down early
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncMetaStoreEntry(this, stack->create_completion_notifier(),
+                                  sync_env->store, raw_key, bl, dpp);
+    sync_env->async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+// Async rados request that removes one metadata entry locally via the
+// metadata manager.
+class RGWAsyncMetaRemoveEntry : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  string raw_key;
+  const DoutPrefixProvider *dpp;
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override {
+    int ret = store->ctl()->meta.mgr->remove(raw_key, null_yield, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: can't remove key: " << raw_key << " ret=" << ret << dendl;
+      return ret;
+    }
+    return 0;
+  }
+public:
+  RGWAsyncMetaRemoveEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                       const string& _raw_key, const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn), store(_store),
+                                          raw_key(_raw_key), dpp(dpp) {}
+};
+
+
+// Coroutine wrapper around RGWAsyncMetaRemoveEntry. A remove of an
+// already-missing key (-ENOENT) is treated as success.
+class RGWMetaRemoveEntryCR : public RGWSimpleCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  string raw_key;
+
+  RGWAsyncMetaRemoveEntry *req;
+
+public:
+  RGWMetaRemoveEntryCR(RGWMetaSyncEnv *_sync_env,
+                       const string& _raw_key) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
+                                          raw_key(_raw_key), req(NULL) {
+  }
+
+  ~RGWMetaRemoveEntryCR() override {
+    if (req) {
+      req->finish();  // release the async request if torn down early
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncMetaRemoveEntry(this, stack->create_completion_notifier(),
+                                  sync_env->store, raw_key, dpp);
+    sync_env->async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    int r = req->get_ret_status();
+    if (r == -ENOENT) {  // removing a non-existent entry is idempotent
+      r = 0;
+    }
+    return r;
+  }
+};
+
+#define META_SYNC_UPDATE_MARKER_WINDOW 10
+
+
+// Order-control coroutine: repeatedly runs the most recently queued child
+// coroutine ('cr', set externally) until no newer one has been queued —
+// intermediate updates are skipped, only the latest one "wins".
+int RGWLastCallerWinsCR::operate(const DoutPrefixProvider *dpp) {
+  RGWCoroutine *call_cr;
+  reenter(this) {
+    while (cr) {
+      call_cr = cr;
+      cr = nullptr;  // consume; a newer cr may be installed while we yield
+      yield call(call_cr);
+      /* cr might have been modified at this point */
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// Marker tracker for one meta-sync shard: batches marker advances (window
+// of META_SYNC_UPDATE_MARKER_WINDOW completions) and persists the updated
+// rgw_meta_sync_marker to the shard's status object.
+class RGWMetaSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
+  RGWMetaSyncEnv *sync_env;
+
+  string marker_oid;
+  rgw_meta_sync_marker sync_marker;
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWMetaSyncShardMarkerTrack(RGWMetaSyncEnv *_sync_env,
+                         const string& _marker_oid,
+                         const rgw_meta_sync_marker& _marker,
+                         RGWSyncTraceNodeRef& _tn) : RGWSyncShardMarkerTrack(META_SYNC_UPDATE_MARKER_WINDOW),
+                                                                sync_env(_sync_env),
+                                                                marker_oid(_marker_oid),
+                                                                sync_marker(_marker),
+                                                                tn(_tn){}
+
+  // Build the coroutine that writes the advanced marker; pos/timestamp are
+  // only recorded when meaningful (non-zero).
+  RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+    sync_marker.marker = new_marker;
+    if (index_pos > 0) {
+      sync_marker.pos = index_pos;
+    }
+
+    if (!real_clock::is_zero(timestamp)) {
+      sync_marker.timestamp = timestamp;
+    }
+
+    ldpp_dout(sync_env->dpp, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << " realm_epoch=" << sync_marker.realm_epoch << dendl;
+    tn->log(20, SSTR("new marker=" << new_marker));
+    rgw::sal::RadosStore* store = sync_env->store;
+    return new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->dpp, sync_env->async_rados,
+                                                           store->svc()->sysobj,
+                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, marker_oid),
+                                                           sync_marker);
+  }
+
+  // Concurrent marker writes collapse to the latest one (last caller wins).
+  RGWOrderCallCR *allocate_order_control_cr() override {
+    return new RGWLastCallerWinsCR(sync_env->cct);
+  }
+};
+
+/* Syncs a single metadata entry: fetch it from the master zone and store
+ * (or remove) it locally.  The constructor only records parameters and
+ * registers a trace node; all work happens in operate(). */
+RGWMetaSyncSingleEntryCR::RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env,
+                          const string& _raw_key, const string& _entry_marker,
+                           const RGWMDLogStatus& _op_status,
+                           RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct),
+                                                      sync_env(_sync_env),
+                                                     raw_key(_raw_key), entry_marker(_entry_marker),
+                                                      op_status(_op_status),
+                                                      pos(0), sync_status(0),
+                                                      marker_tracker(_marker_tracker), tries(0) {
+  // debug/testing knob: inject random -EIO failures with the configured probability
+  error_injection = (sync_env->cct->_conf->rgw_sync_meta_inject_err_probability > 0);
+  tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", raw_key);
+}
+
+/* Coroutine body: fetch the remote metadata entry (with retries), apply it
+ * locally, then mark the entry finished in the marker tracker.  raw_key is
+ * "section:key"; entry_marker identifies the log position being synced. */
+int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) {
+  reenter(this) {
+#define NUM_TRANSIENT_ERROR_RETRIES 10
+
+    // optional fault injection for testing (see constructor)
+    if (error_injection &&
+        rand() % 10000 < cct->_conf->rgw_sync_meta_inject_err_probability * 10000.0) {
+      return set_cr_error(-EIO);
+    }
+
+    // entries whose mdlog operation never completed carry no data to sync;
+    // just advance the marker past them
+    if (op_status != MDLOG_STATUS_COMPLETE) {
+      tn->log(20, "skipping pending operation");
+      yield call(marker_tracker->finish(entry_marker));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    tn->set_flag(RGW_SNS_FLAG_ACTIVE);
+    // fetch phase: read the entry from the master, retrying transient errors
+    for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
+      yield {
+        pos = raw_key.find(':');
+        section = raw_key.substr(0, pos);
+        key = raw_key.substr(pos + 1);
+        tn->log(10, SSTR("fetching remote metadata entry" << (tries == 0 ? "" : " (retry)")));
+        call(new RGWReadRemoteMetadataCR(sync_env, section, key, &md_bl, tn));
+      }
+
+      sync_status = retcode;
+
+      // -ENOENT is not an error: the entry was deleted on the master,
+      // so the store phase below will remove it locally
+      if (sync_status == -ENOENT) {
+        break;
+      }
+
+      if (sync_status < 0) {
+        if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) {
+          ldpp_dout(dpp, 20) << *this << ": failed to fetch remote metadata entry: " << section << ":" << key << ", will retry" << dendl;
+          continue;
+        }
+
+        // retries exhausted: record the failure in the error log and bail
+        tn->log(10, SSTR("failed to read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status));
+        log_error() << "failed to read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << std::endl;
+        yield call(sync_env->error_logger->log_error_cr(dpp, sync_env->conn->get_remote_id(), section, key, -sync_status,
+                                                        string("failed to read remote metadata entry: ") + cpp_strerror(-sync_status)));
+        return set_cr_error(sync_status);
+      }
+
+      break;
+    }
+
+    retcode = 0;
+    // apply phase: store the fetched entry, or remove it if the fetch
+    // returned -ENOENT; transient errors are retried
+    for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
+      if (sync_status != -ENOENT) {
+        tn->log(10, SSTR("storing local metadata entry: " << section << ":" << key));
+        yield call(new RGWMetaStoreEntryCR(sync_env, raw_key, md_bl));
+      } else {
+        tn->log(10, SSTR("removing local metadata entry:" << section << ":" << key));
+        yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key));
+        // already gone locally: treat as success
+        if (retcode == -ENOENT) {
+          retcode = 0;
+          break;
+        }
+      }
+      if ((retcode < 0) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
+        ldpp_dout(dpp, 20) << *this << ": failed to store metadata entry: " << section << ":" << key << ", got retcode=" << retcode << ", will retry" << dendl;
+        continue;
+      }
+      break;
+    }
+
+    sync_status = retcode;
+
+    if (sync_status == 0 && marker_tracker) {
+      /* update marker */
+      yield call(marker_tracker->finish(entry_marker));
+      sync_status = retcode;
+    }
+    if (sync_status < 0) {
+      tn->log(10, SSTR("failed, status=" << sync_status));
+      return set_cr_error(sync_status);
+    }
+    tn->log(10, "success");
+    return set_cr_done();
+  }
+  return 0;
+}
+
+/* Clones a batch of remote mdlog entries for one shard into the local
+ * metadata log, starting from 'marker'.  operate() drives the state_*
+ * methods as a state machine: read local shard status, issue the REST
+ * request to the master, then store the returned entries locally. */
+class RGWCloneMetaLogCoroutine : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  RGWMetadataLog *mdlog;
+
+  const std::string& period;  // period whose mdlog is being cloned
+  int shard_id;
+  string marker;              // position to resume cloning from
+  bool truncated = false;     // more entries remain on the master
+  string *new_marker;         // out: updated position after this batch
+
+  int max_entries = CLONE_MAX_ENTRIES;
+
+  RGWRESTReadResource *http_op = nullptr;  // released in the destructor
+  boost::intrusive_ptr<RGWMetadataLogInfoCompletion> completion;
+
+  RGWMetadataLogInfo shard_info;
+  rgw_mdlog_shard_data data;   // entries fetched from the master
+
+public:
+  RGWCloneMetaLogCoroutine(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
+                           const std::string& period, int _id,
+                           const string& _marker, string *_new_marker)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
+      period(period), shard_id(_id), marker(_marker), new_marker(_new_marker) {
+    if (new_marker) {
+      *new_marker = marker;
+    }
+  }
+  ~RGWCloneMetaLogCoroutine() override {
+    // drop the REST op reference and cancel any in-flight log-info request
+    if (http_op) {
+      http_op->put();
+    }
+    if (completion) {
+      completion->cancel();
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+
+  // state machine steps, invoked in order from operate()
+  int state_init();
+  int state_read_shard_status();
+  int state_read_shard_status_complete();
+  int state_send_rest_request(const DoutPrefixProvider *dpp);
+  int state_receive_rest_response();
+  int state_store_mdlog_entries();
+  int state_store_mdlog_entries_complete();
+};
+
+/* Syncs a single metadata log shard for one period.  Depending on
+ * sync_marker.state, operate() dispatches to full_sync() (drain the
+ * full-sync omap index built by RGWInitSyncStatusCoroutine) or
+ * incremental_sync() (tail the mdlog).  Both paths hold a continuous
+ * lease on the shard status object and spawn RGWMetaSyncSingleEntryCR
+ * per entry, using collect_children() to advance the marker only past
+ * contiguously-completed entries. */
+class RGWMetaSyncShardCR : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+
+  const rgw_pool& pool;
+  const std::string& period; //< currently syncing period id
+  const epoch_t realm_epoch; //< realm_epoch of period
+  RGWMetadataLog* mdlog; //< log of syncing period
+  uint32_t shard_id;
+  rgw_meta_sync_marker& sync_marker;
+  boost::optional<rgw_meta_sync_marker> temp_marker; //< for pending updates
+  string marker;
+  string max_marker;
+  const std::string& period_marker; //< max marker stored in next period
+
+  RGWRadosGetOmapKeysCR::ResultPtr omapkeys;
+  std::set<std::string> entries;
+  std::set<std::string>::iterator iter;
+
+  string oid; //< full-sync index object for this shard
+
+  RGWMetaSyncShardMarkerTrack *marker_tracker = nullptr;
+
+  list<cls_log_entry> log_entries;
+  list<cls_log_entry>::iterator log_iter;
+  bool truncated = false;
+
+  string mdlog_marker;
+  string raw_key;
+  rgw_mdlog_entry mdlog_entry;
+
+  // NOTE(review): inc_lock/inc_cond are not referenced in this chunk —
+  // confirm their use elsewhere before removing
+  ceph::mutex inc_lock = ceph::make_mutex("RGWMetaSyncShardCR::inc_lock");
+  ceph::condition_variable inc_cond;
+
+  // separate reentry state for each sync mode
+  boost::asio::coroutine incremental_cr;
+  boost::asio::coroutine full_cr;
+
+  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+
+  bool lost_lock = false;
+
+  bool *reset_backoff;
+
+  // hold a reference to the cr stack while it's in the map
+  using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
+  map<StackRef, string> stack_to_pos;
+  map<string, string> pos_to_prev;
+
+  bool can_adjust_marker = false;
+  bool done_with_period = false;
+
+  int total_entries = 0;
+
+  RGWSyncTraceNodeRef tn;
+public:
+  RGWMetaSyncShardCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
+                     const std::string& period, epoch_t realm_epoch,
+                     RGWMetadataLog* mdlog, uint32_t _shard_id,
+                     rgw_meta_sync_marker& _marker,
+                     const std::string& period_marker, bool *_reset_backoff,
+                     RGWSyncTraceNodeRef& _tn)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool),
+      period(period), realm_epoch(realm_epoch), mdlog(mdlog),
+      shard_id(_shard_id), sync_marker(_marker),
+      period_marker(period_marker),
+      reset_backoff(_reset_backoff), tn(_tn) {
+    *reset_backoff = false;
+  }
+
+  ~RGWMetaSyncShardCR() override {
+    delete marker_tracker;
+    if (lease_cr) {
+      lease_cr->abort();
+    }
+  }
+
+  // replaces any previous tracker (e.g. when transitioning full -> incremental)
+  void set_marker_tracker(RGWMetaSyncShardMarkerTrack *mt) {
+    delete marker_tracker;
+    marker_tracker = mt;
+  }
+
+  /* Dispatch to the sync mode recorded in the shard's marker.  full_sync()
+   * flips sync_marker.state to IncrementalSync when it completes, so the
+   * loop falls through to incremental_sync() on the next pass. */
+  int operate(const DoutPrefixProvider *dpp) override {
+    int r;
+    while (true) {
+      switch (sync_marker.state) {
+      case rgw_meta_sync_marker::FullSync:
+        r  = full_sync();
+        if (r < 0) {
+          ldpp_dout(dpp, 10) << "sync: full_sync: shard_id=" << shard_id << " r=" << r << dendl;
+          return set_cr_error(r);
+        }
+        return 0;
+      case rgw_meta_sync_marker::IncrementalSync:
+        r  = incremental_sync();
+        if (r < 0) {
+          ldpp_dout(dpp, 10) << "sync: incremental_sync: shard_id=" << shard_id << " r=" << r << dendl;
+          return set_cr_error(r);
+        }
+        return 0;
+      }
+    }
+    /* unreachable */
+    return 0;
+  }
+
+  /* Reap completed child entry stacks and advance sync_marker.marker to the
+   * oldest position not yet completed (tracked via pos_to_prev). */
+  void collect_children()
+  {
+    int child_ret;
+    RGWCoroutinesStack *child;
+    while (collect_next(&child_ret, &child)) {
+      auto iter = stack_to_pos.find(child);
+      if (iter == stack_to_pos.end()) {
+        /* some other stack that we don't care about */
+        continue;
+      }
+
+      string& pos = iter->second;
+
+      if (child_ret < 0) {
+        ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl;
+        // on any error code from RGWMetaSyncSingleEntryCR, we do not advance
+        // the sync status marker past this entry, and set
+        // can_adjust_marker=false to exit out of RGWMetaSyncShardCR.
+        // RGWMetaSyncShardControlCR will rerun RGWMetaSyncShardCR from the
+        // previous marker and retry
+        can_adjust_marker = false;
+      }
+
+      map<string, string>::iterator prev_iter = pos_to_prev.find(pos);
+      ceph_assert(prev_iter != pos_to_prev.end());
+
+      if (pos_to_prev.size() == 1) {
+        // last outstanding entry: the marker can move all the way to pos
+        if (can_adjust_marker) {
+          sync_marker.marker = pos;
+        }
+        pos_to_prev.erase(prev_iter);
+      } else {
+        ceph_assert(pos_to_prev.size() > 1);
+        // others still pending: marker may only move to the oldest one
+        pos_to_prev.erase(prev_iter);
+        prev_iter = pos_to_prev.begin();
+        if (can_adjust_marker) {
+          sync_marker.marker = prev_iter->second;
+        }
+      }
+
+      ldpp_dout(sync_env->dpp, 4) << *this << ": adjusting marker pos=" << sync_marker.marker << dendl;
+      stack_to_pos.erase(iter);
+    }
+  }
+
+  /* Drain the shard's full-sync omap index, syncing every listed key, then
+   * switch the persisted marker state to IncrementalSync. */
+  int full_sync() {
+#define OMAP_GET_MAX_ENTRIES 100
+    int max_entries = OMAP_GET_MAX_ENTRIES;
+    reenter(&full_cr) {
+      set_status("full_sync");
+      tn->log(10, "start full sync");
+      oid = full_sync_index_shard_oid(shard_id);
+      can_adjust_marker = true;
+      /* grab lock */
+      yield {
+       uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+        string lock_name = "sync_lock";
+       rgw::sal::RadosStore* store = sync_env->store;
+        lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
+                                                rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+                                                lock_name, lock_duration, this));
+        lease_stack.reset(spawn(lease_cr.get(), false));
+        lost_lock = false;
+      }
+      while (!lease_cr->is_locked()) {
+        if (lease_cr->is_done()) {
+          drain_all();
+          tn->log(5, "failed to take lease");
+          return lease_cr->get_ret_status();
+        }
+        set_sleeping(true);
+        yield;
+      }
+      tn->log(10, "took lease");
+
+      /* lock succeeded, a retry now should avoid previous backoff status */
+      *reset_backoff = true;
+
+      /* prepare marker tracker */
+      set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
+                                                         sync_env->shard_obj_name(shard_id),
+                                                         sync_marker, tn));
+
+      marker = sync_marker.marker;
+
+      total_entries = sync_marker.pos;
+
+      /* sync! */
+      do {
+        if (!lease_cr->is_locked()) {
+          tn->log(1, "lease is lost, abort");
+          lost_lock = true;
+          break;
+        }
+        omapkeys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
+        yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, oid),
+                                             marker, max_entries, omapkeys));
+        if (retcode < 0) {
+          ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): RGWRadosGetOmapKeysCR() returned ret=" << retcode << dendl;
+          tn->log(0, SSTR("ERROR: failed to list omap keys, status=" << retcode));
+          yield lease_cr->go_down();
+          drain_all();
+          return retcode;
+        }
+        entries = std::move(omapkeys->entries);
+        tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync"));
+        if (entries.size() > 0) {
+          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+        }
+        iter = entries.begin();
+        for (; iter != entries.end(); ++iter) {
+          marker = *iter;
+          tn->log(20, SSTR("full sync: " << marker));
+          total_entries++;
+          if (!marker_tracker->start(marker, total_entries, real_time())) {
+            tn->log(0, SSTR("ERROR: cannot start syncing " << marker << ". Duplicate entry?"));
+          } else {
+            // fetch remote and write locally
+            yield {
+              RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, marker, marker, MDLOG_STATUS_COMPLETE, marker_tracker, tn), false);
+              // stack_to_pos holds a reference to the stack
+              stack_to_pos[stack] = marker;
+              pos_to_prev[marker] = marker;
+            }
+            // limit spawn window
+            while (num_spawned() > static_cast<size_t>(cct->_conf->rgw_meta_sync_spawn_window)) {
+              yield wait_for_child();
+              collect_children();
+            }
+          }
+        }
+        collect_children();
+      } while (omapkeys->more && can_adjust_marker);
+
+      tn->unset_flag(RGW_SNS_FLAG_ACTIVE); /* done listing this shard's entries */
+
+      // wait for all entry children (the remaining stack is the lease)
+      while (num_spawned() > 1) {
+        yield wait_for_child();
+        collect_children();
+      }
+
+      if (!lost_lock) {
+        /* update marker to reflect we're done with full sync */
+        if (can_adjust_marker) {
+          // apply updates to a temporary marker, or operate() will send us
+          // to incremental_sync() after we yield
+          temp_marker = sync_marker;
+         temp_marker->state = rgw_meta_sync_marker::IncrementalSync;
+         temp_marker->marker = std::move(temp_marker->next_step_marker);
+         temp_marker->next_step_marker.clear();
+         temp_marker->realm_epoch = realm_epoch;
+         ldpp_dout(sync_env->dpp, 4) << *this << ": saving marker pos=" << temp_marker->marker << " realm_epoch=" << realm_epoch << dendl;
+
+         using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_meta_sync_marker>;
+         yield call(new WriteMarkerCR(sync_env->dpp, sync_env->async_rados, sync_env->store->svc()->sysobj,
+                                      rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+                                      *temp_marker));
+        }
+
+        if (retcode < 0) {
+          ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to set sync marker: retcode=" << retcode << dendl;
+          yield lease_cr->go_down();
+          drain_all();
+          return retcode;
+        }
+        // clean up full sync index
+        yield {
+          auto oid = full_sync_index_shard_oid(shard_id);
+          call(new RGWRadosRemoveCR(sync_env->store, {pool, oid}));
+        }
+      }
+
+      /*
+       * we reach this point whether or not the lock was lost; lost_lock and
+       * can_adjust_marker are checked below, after the lease is released
+       */
+
+      yield lease_cr->go_down();
+
+      lease_cr.reset();
+
+      drain_all();
+
+      if (!can_adjust_marker) {
+        return -EAGAIN;
+      }
+
+      if (lost_lock) {
+        return -EBUSY;
+      }
+
+      tn->log(10, "full sync complete");
+
+      // apply the sync marker update
+      ceph_assert(temp_marker);
+      sync_marker = std::move(*temp_marker);
+      temp_marker = boost::none;
+      // must not yield after this point!
+    }
+    return 0;
+  }
+    
+
+  /* Tail the mdlog for this shard: clone remote entries locally, read them
+   * back, and spawn a sync coroutine per entry until the period ends or the
+   * lease/marker state forces a restart. */
+  int incremental_sync() {
+    reenter(&incremental_cr) {
+      set_status("incremental_sync");
+      tn->log(10, "start incremental sync");
+      can_adjust_marker = true;
+      /* grab lock */
+      if (!lease_cr) { /* could have had  a lease_cr lock from previous state */
+        yield {
+          uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+          string lock_name = "sync_lock";
+         rgw::sal::RadosStore* store = sync_env->store;
+          lease_cr.reset( new RGWContinuousLeaseCR(sync_env->async_rados, store,
+                                                   rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+                                                   lock_name, lock_duration, this));
+          lease_stack.reset(spawn(lease_cr.get(), false));
+          lost_lock = false;
+        }
+        while (!lease_cr->is_locked()) {
+          if (lease_cr->is_done()) {
+            drain_all();
+            tn->log(5, "failed to take lease");
+            return lease_cr->get_ret_status();
+          }
+          set_sleeping(true);
+          yield;
+        }
+      }
+      tn->log(10, "took lease");
+      // if the period has advanced, we can't use the existing marker
+      if (sync_marker.realm_epoch < realm_epoch) {
+        ldpp_dout(sync_env->dpp, 4) << "clearing marker=" << sync_marker.marker
+            << " from old realm_epoch=" << sync_marker.realm_epoch
+            << " (now " << realm_epoch << ')' << dendl;
+        sync_marker.realm_epoch = realm_epoch;
+        sync_marker.marker.clear();
+      }
+      mdlog_marker = sync_marker.marker;
+      set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
+                                                         sync_env->shard_obj_name(shard_id),
+                                                         sync_marker, tn));
+
+      /*
+       * mdlog_marker: the remote sync marker position
+       * sync_marker: the local sync marker position
+       * max_marker: the max mdlog position that we fetched
+       * marker: the current position we try to sync
+       * period_marker: the last marker before the next period begins (optional)
+       */
+      marker = max_marker = sync_marker.marker;
+      /* inc sync */
+      do {
+        if (!lease_cr->is_locked()) {
+          lost_lock = true;
+          tn->log(1, "lease is lost, abort");
+          break;
+        }
+#define INCREMENTAL_MAX_ENTRIES 100
+        ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << " truncated=" << truncated << dendl;
+        if (!period_marker.empty() && period_marker <= mdlog_marker) {
+          tn->log(10, SSTR("finished syncing current period: mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker << " period_marker=" << period_marker));
+          done_with_period = true;
+          break;
+        }
+       if (mdlog_marker <= max_marker || !truncated) {
+         /* we're at the tip, try to bring more entries */
+          ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " syncing mdlog for shard_id=" << shard_id << dendl;
+          yield call(new RGWCloneMetaLogCoroutine(sync_env, mdlog,
+                                                  period, shard_id,
+                                                  mdlog_marker, &mdlog_marker));
+       }
+        if (retcode < 0) {
+          tn->log(10, SSTR(*this << ": failed to fetch more log entries, retcode=" << retcode));
+          yield lease_cr->go_down();
+          drain_all();
+          *reset_backoff = false; // back off and try again later
+          return retcode;
+        }
+        truncated = true;
+        *reset_backoff = true; /* if we got to this point, all systems function */
+       if (mdlog_marker > max_marker) {
+          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+          tn->log(20, SSTR("mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker));
+          marker = max_marker;
+          yield call(new RGWReadMDLogEntriesCR(sync_env, mdlog, shard_id,
+                                               &max_marker, INCREMENTAL_MAX_ENTRIES,
+                                               &log_entries, &truncated));
+          if (retcode < 0) {
+            tn->log(10, SSTR("failed to list mdlog entries, retcode=" << retcode));
+            yield lease_cr->go_down();
+            drain_all();
+            *reset_backoff = false; // back off and try again later
+            return retcode;
+          }
+          for (log_iter = log_entries.begin(); log_iter != log_entries.end() && !done_with_period; ++log_iter) {
+            if (!period_marker.empty() && period_marker <= log_iter->id) {
+              done_with_period = true;
+              if (period_marker < log_iter->id) {
+                tn->log(10, SSTR("found key=" << log_iter->id
+                    << " past period_marker=" << period_marker));
+                break;
+              }
+              ldpp_dout(sync_env->dpp, 10) << "found key at period_marker=" << period_marker << dendl;
+              // sync this entry, then return control to RGWMetaSyncCR
+            }
+            if (!mdlog_entry.convert_from(*log_iter)) {
+              tn->log(0, SSTR("ERROR: failed to convert mdlog entry, shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << " ... skipping entry"));
+              continue;
+            }
+            tn->log(20, SSTR("log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp));
+            if (!marker_tracker->start(log_iter->id, 0, log_iter->timestamp.to_real_time())) {
+              ldpp_dout(sync_env->dpp, 0) << "ERROR: cannot start syncing " << log_iter->id << ". Duplicate entry?" << dendl;
+            } else {
+              raw_key = log_iter->section + ":" + log_iter->name;
+              yield {
+                RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, raw_key, log_iter->id, mdlog_entry.log_data.status, marker_tracker, tn), false);
+                ceph_assert(stack);
+                // stack_to_pos holds a reference to the stack
+                stack_to_pos[stack] = log_iter->id;
+                pos_to_prev[log_iter->id] = marker;
+              }
+              // limit spawn window
+              while (num_spawned() > static_cast<size_t>(cct->_conf->rgw_meta_sync_spawn_window)) {
+                yield wait_for_child();
+                collect_children();
+              }
+            }
+            marker = log_iter->id;
+          }
+        }
+        collect_children();
+       ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " max_marker=" << max_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
+        if (done_with_period) {
+          // return control to RGWMetaSyncCR and advance to the next period
+          tn->log(10, SSTR(*this << ": done with period"));
+          break;
+        }
+       if (mdlog_marker == max_marker && can_adjust_marker) {
+          tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+         yield wait(utime_t(cct->_conf->rgw_meta_sync_poll_interval, 0));
+       }
+      } while (can_adjust_marker);
+
+      tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+
+      // wait for all entry children (the remaining stack is the lease)
+      while (num_spawned() > 1) {
+        yield wait_for_child();
+        collect_children();
+      }
+
+      yield lease_cr->go_down();
+
+      drain_all();
+
+      if (lost_lock) {
+        return -EBUSY;
+      }
+
+      if (!can_adjust_marker) {
+        return -EAGAIN;
+      }
+
+      return set_cr_done();
+    }
+    /* TODO */
+    return 0;
+  }
+};
+
+/* Backoff wrapper around RGWMetaSyncShardCR for one shard: reruns the shard
+ * coroutine on failure (with backoff), re-reading the persisted sync marker
+ * between attempts via alloc_finisher_cr(). */
+class RGWMetaSyncShardControlCR : public RGWBackoffControlCR
+{
+  RGWMetaSyncEnv *sync_env;
+
+  const rgw_pool& pool;
+  const std::string& period;
+  epoch_t realm_epoch;
+  RGWMetadataLog* mdlog;
+  uint32_t shard_id;
+  rgw_meta_sync_marker sync_marker;
+  const std::string period_marker;
+
+  RGWSyncTraceNodeRef tn;
+
+  static constexpr bool exit_on_error = false; // retry on all errors
+public:
+  RGWMetaSyncShardControlCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
+                            const std::string& period, epoch_t realm_epoch,
+                            RGWMetadataLog* mdlog, uint32_t _shard_id,
+                            const rgw_meta_sync_marker& _marker,
+                            std::string&& period_marker,
+                            RGWSyncTraceNodeRef& _tn_parent)
+    : RGWBackoffControlCR(_sync_env->cct, exit_on_error), sync_env(_sync_env),
+      pool(_pool), period(period), realm_epoch(realm_epoch), mdlog(mdlog),
+      shard_id(_shard_id), sync_marker(_marker),
+      period_marker(std::move(period_marker)) {
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "shard",
+                                         std::to_string(shard_id));
+  }
+
+  // a fresh shard coroutine for each (re)try
+  RGWCoroutine *alloc_cr() override {
+    return new RGWMetaSyncShardCR(sync_env, pool, period, realm_epoch, mdlog,
+                                  shard_id, sync_marker, period_marker, backoff_ptr(), tn);
+  }
+
+  // re-read the shard's persisted marker before retrying
+  RGWCoroutine *alloc_finisher_cr() override {
+    rgw::sal::RadosStore* store = sync_env->store;
+    return new RGWSimpleRadosReadCR<rgw_meta_sync_marker>(sync_env->dpp, sync_env->async_rados, store->svc()->sysobj,
+                                                          rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+                                                          &sync_marker);
+  }
+};
+
+/* Top-level metadata sync coroutine.  Walks the period history one period
+ * at a time, spawning one RGWMetaSyncShardControlCR per shard; once all
+ * shards finish a period it advances the cursor and persists the updated
+ * sync info.  wakeup() lets external callers nudge a specific shard. */
+class RGWMetaSyncCR : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  const rgw_pool& pool;
+  RGWPeriodHistory::Cursor cursor; //< sync position in period history
+  RGWPeriodHistory::Cursor next; //< next period in history
+  rgw_meta_sync_status sync_status;
+  RGWSyncTraceNodeRef tn;
+
+  std::mutex mutex; //< protect access to shard_crs
+
+  // TODO: it should be enough to hold a reference on the stack only, as calling
+  // RGWCoroutinesStack::wakeup() doesn't refer to the RGWCoroutine if it has
+  // already completed
+  using ControlCRRef = boost::intrusive_ptr<RGWMetaSyncShardControlCR>;
+  using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
+  using RefPair = std::pair<ControlCRRef, StackRef>;
+  map<int, RefPair> shard_crs;
+  int ret{0};
+
+public:
+  RGWMetaSyncCR(RGWMetaSyncEnv *_sync_env, const RGWPeriodHistory::Cursor &cursor,
+                const rgw_meta_sync_status& _sync_status, RGWSyncTraceNodeRef& _tn)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+      pool(sync_env->store->svc()->zone->get_zone_params().log_pool),
+      cursor(cursor), sync_status(_sync_status), tn(_tn) {}
+
+  ~RGWMetaSyncCR() {
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      // loop through one period at a time
+      tn->log(1, "start");
+      for (;;) {
+        // determine whether we're on the current period (no 'next'), or on
+        // a historical period that has a successor to advance to
+        if (cursor == sync_env->store->svc()->mdlog->get_period_history()->get_current()) {
+          next = RGWPeriodHistory::Cursor{};
+          if (cursor) {
+            ldpp_dout(dpp, 10) << "RGWMetaSyncCR on current period="
+                << cursor.get_period().get_id() << dendl;
+          } else {
+            ldpp_dout(dpp, 10) << "RGWMetaSyncCR with no period" << dendl;
+          }
+        } else {
+          next = cursor;
+          next.next();
+          ldpp_dout(dpp, 10) << "RGWMetaSyncCR on period="
+              << cursor.get_period().get_id() << ", next="
+              << next.get_period().get_id() << dendl;
+        }
+
+        yield {
+          // get the mdlog for the current period (may be empty)
+          auto& period_id = sync_status.sync_info.period;
+          auto realm_epoch = sync_status.sync_info.realm_epoch;
+          auto mdlog = sync_env->store->svc()->mdlog->get_log(period_id);
+
+          tn->log(1, SSTR("realm epoch=" << realm_epoch << " period id=" << period_id));
+
+          // prevent wakeup() from accessing shard_crs while we're spawning them
+          std::lock_guard<std::mutex> lock(mutex);
+
+          // sync this period on each shard
+          for (const auto& m : sync_status.sync_markers) {
+            uint32_t shard_id = m.first;
+            auto& marker = m.second;
+
+            std::string period_marker;
+            if (next) {
+              // read the maximum marker from the next period's sync status
+              period_marker = next.get_period().get_sync_status()[shard_id];
+              if (period_marker.empty()) {
+                // no metadata changes have occurred on this shard, skip it
+                ldpp_dout(dpp, 10) << "RGWMetaSyncCR: skipping shard " << shard_id
+                    << " with empty period marker" << dendl;
+                continue;
+              }
+            }
+
+            using ShardCR = RGWMetaSyncShardControlCR;
+            auto cr = new ShardCR(sync_env, pool, period_id, realm_epoch,
+                                  mdlog, shard_id, marker,
+                                  std::move(period_marker), tn);
+            auto stack = spawn(cr, false);
+            shard_crs[shard_id] = RefPair{cr, stack};
+          }
+        }
+        // wait for each shard to complete
+        while (ret == 0 && num_spawned() > 0) {
+          yield wait_for_child();
+          collect(&ret, nullptr);
+        }
+        drain_all();
+        {
+          // drop shard cr refs under lock
+          std::lock_guard<std::mutex> lock(mutex);
+          shard_crs.clear();
+        }
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        // advance to the next period
+        ceph_assert(next);
+        cursor = next;
+
+        // write the updated sync info
+        sync_status.sync_info.period = cursor.get_period().get_id();
+        sync_status.sync_info.realm_epoch = cursor.get_epoch();
+        yield call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, sync_env->async_rados,
+                                                                 sync_env->store->svc()->sysobj,
+                                                                 rgw_raw_obj(pool, sync_env->status_oid()),
+                                                                 sync_status.sync_info));
+      }
+    }
+    return 0;
+  }
+
+  // nudge the control cr for one shard (called from outside the manager)
+  void wakeup(int shard_id) {
+    std::lock_guard<std::mutex> lock(mutex);
+    auto iter = shard_crs.find(shard_id);
+    if (iter == shard_crs.end()) {
+      return;
+    }
+    iter->second.first->wakeup();
+  }
+};
+
+/* Populate a sync environment from this remote log's owning store,
+ * connection and helper objects.  All assignments are independent. */
+void RGWRemoteMetaLog::init_sync_env(RGWMetaSyncEnv *env) {
+  env->store = store;
+  env->cct = store->ctx();
+  env->dpp = dpp;
+  env->conn = conn;
+  env->async_rados = async_rados;
+  env->error_logger = error_logger;
+  env->sync_tracer = store->getRados()->get_sync_tracer();
+  env->http_manager = &http_manager;
+}
+
+// Read the persisted metadata sync status into *sync_status.
+// No-op (returns 0) on the metadata master, which does not sync.
+// Returns 0 on success or a negative error code.
+int RGWRemoteMetaLog::read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status)
+{
+  if (store->svc()->zone->is_meta_master()) {
+    return 0;
+  }
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(store->ctx(), store->getRados()->get_cr_registry());
+  RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+  // copy the shared env but point it at this call's private http manager
+  RGWMetaSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+  tn->log(20, "read sync status");
+  ret = crs.run(dpp, new RGWReadSyncStatusCoroutine(&sync_env_local, sync_status));
+  http_manager.stop();
+  return ret;
+}
+
+// Initialize a fresh metadata sync status object: fetch the shard count
+// from the master's mdlog info, seed period/realm_epoch from our current
+// period history position (if any), and persist it via the init coroutine.
+// No-op on the metadata master. Returns 0 on success or negative error.
+int RGWRemoteMetaLog::init_sync_status(const DoutPrefixProvider *dpp)
+{
+  if (store->svc()->zone->is_meta_master()) {
+    return 0;
+  }
+
+  rgw_mdlog_info mdlog_info;
+  int r = read_log_info(dpp, &mdlog_info);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
+    return r;
+  }
+
+  rgw_meta_sync_info sync_info;
+  sync_info.num_shards = mdlog_info.num_shards;
+  auto cursor = store->svc()->mdlog->get_period_history()->get_current();
+  if (cursor) {
+    sync_info.period = cursor.get_period().get_id();
+    sync_info.realm_epoch = cursor.get_epoch();
+  }
+
+  return run(dpp, new RGWInitSyncStatusCoroutine(&sync_env, sync_info));
+}
+
+// Persist sync_info to the status rados object in the zone's log pool.
+// Runs synchronously through this manager's coroutine runner.
+int RGWRemoteMetaLog::store_sync_info(const DoutPrefixProvider *dpp, const rgw_meta_sync_info& sync_info)
+{
+  tn->log(20, "store sync info");
+  return run(dpp, new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, async_rados, store->svc()->sysobj,
+                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env.status_oid()),
+                                                           sync_info));
+}
+
+// return a cursor to the period at our sync position
+// Resolution order: empty period -> empty cursor; known realm epoch ->
+// existing history entry (verifying the period id matches); otherwise
+// pull the period from rados/master and attach it to the history.
+// Errors are reported through the returned cursor's error code.
+static RGWPeriodHistory::Cursor get_period_at(const DoutPrefixProvider *dpp,
+                                              rgw::sal::RadosStore* store,
+                                              const rgw_meta_sync_info& info,
+                                             optional_yield y)
+{
+  if (info.period.empty()) {
+    // return an empty cursor with error=0
+    return RGWPeriodHistory::Cursor{};
+  }
+
+  // look for an existing period in our history
+  auto cursor = store->svc()->mdlog->get_period_history()->lookup(info.realm_epoch);
+  if (cursor) {
+    // verify that the period ids match
+    auto& existing = cursor.get_period().get_id();
+    if (existing != info.period) {
+      ldpp_dout(dpp, -1) << "ERROR: sync status period=" << info.period
+          << " does not match period=" << existing
+          << " in history at realm epoch=" << info.realm_epoch << dendl;
+      return RGWPeriodHistory::Cursor{-EEXIST};
+    }
+    return cursor;
+  }
+
+  // read the period from rados or pull it from the master
+  RGWPeriod period;
+  int r = store->svc()->mdlog->pull_period(dpp, info.period, period, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to read period id "
+        << info.period << ": " << cpp_strerror(r) << dendl;
+    return RGWPeriodHistory::Cursor{r};
+  }
+  // attach the period to our history
+  cursor = store->svc()->mdlog->get_period_history()->attach(dpp, std::move(period), y);
+  if (!cursor) {
+    r = cursor.get_error();
+    ldpp_dout(dpp, -1) << "ERROR: failed to read period history back to "
+        << info.period << ": " << cpp_strerror(r) << dendl;
+  }
+  return cursor;
+}
+
+// Main metadata sync loop. Phases:
+//   1) fetch mdlog info from the master (retrying with backoff while the
+//      master is unreachable or uninitialized),
+//   2) read/initialize the local sync status, restarting from StateInit
+//      when the master's oldest log period is ahead of our position,
+//   3) loop over the state machine (full-sync map building, then
+//      incremental sync) until shutdown is requested via 'going_down'.
+// No-op on the metadata master. Returns 0 on clean shutdown/success,
+// negative error code on unrecoverable failure.
+int RGWRemoteMetaLog::run_sync(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  if (store->svc()->zone->is_meta_master()) {
+    return 0;
+  }
+
+  int r = 0;
+
+  // get shard count and oldest log period from master
+  rgw_mdlog_info mdlog_info;
+  for (;;) {
+    if (going_down) {
+      ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl;
+      return 0;
+    }
+    r = read_log_info(dpp, &mdlog_info);
+    if (r == -EIO || r == -ENOENT) {
+      // keep retrying if master isn't alive or hasn't initialized the log
+      ldpp_dout(dpp, 10) << __func__ << "(): waiting for master.." << dendl;
+      backoff.backoff_sleep();
+      continue;
+    }
+    backoff.reset();
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
+      return r;
+    }
+    break;
+  }
+
+  rgw_meta_sync_status sync_status;
+  do {
+    if (going_down) {
+      ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl;
+      return 0;
+    }
+    // -ENOENT just means no status stored yet; we fall through to init
+    r = run(dpp, new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
+    if (r < 0 && r != -ENOENT) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to fetch sync status r=" << r << dendl;
+      return r;
+    }
+
+    if (!mdlog_info.period.empty()) {
+      // restart sync if the remote has a period, but:
+      // a) our status does not, or
+      // b) our sync period comes before the remote's oldest log period
+      if (sync_status.sync_info.period.empty() ||
+          sync_status.sync_info.realm_epoch < mdlog_info.realm_epoch) {
+        sync_status.sync_info.state = rgw_meta_sync_info::StateInit;
+        string reason;
+        if (sync_status.sync_info.period.empty()) {
+          reason = "period is empty";
+        } else {
+          reason = SSTR("sync_info realm epoch is behind: " << sync_status.sync_info.realm_epoch << " < " << mdlog_info.realm_epoch);
+        }
+        tn->log(1, "initialize sync (reason: " + reason + ")");
+        ldpp_dout(dpp, 1) << "epoch=" << sync_status.sync_info.realm_epoch
+           << " in sync status comes before remote's oldest mdlog epoch="
+           << mdlog_info.realm_epoch << ", restarting sync" << dendl;
+      }
+    }
+
+    if (sync_status.sync_info.state == rgw_meta_sync_info::StateInit) {
+      ldpp_dout(dpp, 20) << __func__ << "(): init" << dendl;
+      sync_status.sync_info.num_shards = mdlog_info.num_shards;
+      auto cursor = store->svc()->mdlog->get_period_history()->get_current();
+      if (cursor) {
+        // run full sync, then start incremental from the current period/epoch
+        sync_status.sync_info.period = cursor.get_period().get_id();
+        sync_status.sync_info.realm_epoch = cursor.get_epoch();
+      }
+      // -EBUSY: another gateway holds the init lease; back off and retry
+      r = run(dpp, new RGWInitSyncStatusCoroutine(&sync_env, sync_status.sync_info));
+      if (r == -EBUSY) {
+        backoff.backoff_sleep();
+        continue;
+      }
+      backoff.reset();
+      if (r < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to init sync status r=" << r << dendl;
+        return r;
+      }
+    }
+  } while (sync_status.sync_info.state == rgw_meta_sync_info::StateInit);
+
+  // shard counts must agree between master and local status, since markers
+  // are tracked per shard
+  auto num_shards = sync_status.sync_info.num_shards;
+  if (num_shards != mdlog_info.num_shards) {
+    ldpp_dout(dpp, -1) << "ERROR: can't sync, mismatch between num shards, master num_shards=" << mdlog_info.num_shards << " local num_shards=" << num_shards << dendl;
+    return -EINVAL;
+  }
+
+  RGWPeriodHistory::Cursor cursor;
+  do {
+    r = run(dpp, new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
+    if (r < 0 && r != -ENOENT) {
+      tn->log(0, SSTR("ERROR: failed to fetch sync status r=" << r));
+      return r;
+    }
+
+    switch ((rgw_meta_sync_info::SyncState)sync_status.sync_info.state) {
+      case rgw_meta_sync_info::StateBuildingFullSyncMaps:
+        tn->log(20, "building full sync maps");
+        r = run(dpp, new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn));
+        if (r == -EBUSY || r == -EIO) {
+          backoff.backoff_sleep();
+          continue;
+        }
+        backoff.reset();
+        if (r < 0) {
+          tn->log(0, SSTR("ERROR: failed to fetch all metadata keys (r=" << r << ")"));
+          return r;
+        }
+
+        sync_status.sync_info.state = rgw_meta_sync_info::StateSync;
+        r = store_sync_info(dpp, sync_status.sync_info);
+        if (r < 0) {
+          tn->log(0, SSTR("ERROR: failed to update sync status (r=" << r << ")"));
+          return r;
+        }
+        /* fall through */
+      case rgw_meta_sync_info::StateSync:
+        tn->log(20, "sync");
+        // find our position in the period history (if any)
+        cursor = get_period_at(dpp, store, sync_status.sync_info, y);
+        r = cursor.get_error();
+        if (r < 0) {
+          return r;
+        }
+        meta_sync_cr = new RGWMetaSyncCR(&sync_env, cursor, sync_status, tn);
+        r = run(dpp, meta_sync_cr);
+        if (r < 0) {
+          tn->log(0, "ERROR: failed to fetch all metadata keys");
+          return r;
+        }
+        break;
+      default:
+        tn->log(0, "ERROR: bad sync state!");
+        return -EIO;
+    }
+  } while (!going_down);
+
+  return 0;
+}
+
+// Forward a per-shard wakeup to the running meta sync coroutine, if any.
+// Silently ignored when run_sync() has not (yet) created meta_sync_cr.
+void RGWRemoteMetaLog::wakeup(int shard_id)
+{
+  if (!meta_sync_cr) {
+    return;
+  }
+  meta_sync_cr->wakeup(shard_id);
+}
+
+// State machine for cloning one mdlog shard from the master: read local
+// shard status, fetch remote entries over REST, store them locally, and
+// repeat while the remote response reports more entries (truncated).
+// Each yield returns the next state's result to the coroutine framework.
+int RGWCloneMetaLogCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    do {
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": init request" << dendl;
+        return state_init();
+      }
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status" << dendl;
+        return state_read_shard_status();
+      }
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status complete" << dendl;
+        return state_read_shard_status_complete();
+      }
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": sending rest request" << dendl;
+        return state_send_rest_request(dpp);
+      }
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": receiving rest response" << dendl;
+        return state_receive_rest_response();
+      }
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries" << dendl;
+        return state_store_mdlog_entries();
+      }
+    } while (truncated);
+    yield {
+      ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries complete" << dendl;
+      return state_store_mdlog_entries_complete();
+    }
+  }
+
+  return 0;
+}
+
+// Reset the response buffer before each fetch iteration.
+int RGWCloneMetaLogCoroutine::state_init()
+{
+  data = rgw_mdlog_shard_data();
+
+  return 0;
+}
+
+// Kick off an async read of the local mdlog shard header. The completion
+// callback records the shard's max marker/time into shard_info and wakes
+// this coroutine; we block (io_block) until it fires.
+int RGWCloneMetaLogCoroutine::state_read_shard_status()
+{
+  const bool add_ref = false; // default constructs with refs=1
+
+  completion.reset(new RGWMetadataLogInfoCompletion(
+    [this](int ret, const cls_log_header& header) {
+      if (ret < 0) {
+        if (ret != -ENOENT) {
+          ldpp_dout(sync_env->dpp, 1) << "ERROR: failed to read mdlog info with "
+                                      << cpp_strerror(ret) << dendl;
+        }
+      } else {
+        shard_info.marker = header.max_marker;
+        shard_info.last_update = header.max_time.to_real_time();
+      }
+      // wake up parent stack
+      io_complete();
+    }), add_ref);
+
+  int ret = mdlog->get_info_async(sync_env->dpp, shard_id, completion.get());
+  if (ret < 0) {
+    ldpp_dout(sync_env->dpp, 0) << "ERROR: mdlog->get_info_async() returned ret=" << ret << dendl;
+    return set_cr_error(ret);
+  }
+
+  return io_block(0);
+}
+
+// Async header read finished: release the completion and adopt the local
+// shard's max marker as the starting point for the remote fetch.
+int RGWCloneMetaLogCoroutine::state_read_shard_status_complete()
+{
+  completion.reset();
+
+  ldpp_dout(sync_env->dpp, 20) << "shard_id=" << shard_id << " marker=" << shard_info.marker << " last_update=" << shard_info.last_update << dendl;
+
+  marker = shard_info.marker;
+
+  return 0;
+}
+
+// Issue the async REST GET /admin/log request to the master for this
+// shard/period, starting after 'marker'. Blocks until the HTTP layer
+// signals completion; on dispatch failure drops the op and errors out.
+int RGWCloneMetaLogCoroutine::state_send_rest_request(const DoutPrefixProvider *dpp)
+{
+  RGWRESTConn *conn = sync_env->conn;
+
+  char buf[32];
+  snprintf(buf, sizeof(buf), "%d", shard_id);
+
+  char max_entries_buf[32];
+  snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", max_entries);
+
+  // an empty key makes the pair a no-op, so "marker" is only sent when set
+  const char *marker_key = (marker.empty() ? "" : "marker");
+
+  rgw_http_param_pair pairs[] = { { "type", "metadata" },
+                                  { "id", buf },
+                                  { "period", period.c_str() },
+                                  { "max-entries", max_entries_buf },
+                                  { marker_key, marker.c_str() },
+                                  { NULL, NULL } };
+
+  http_op = new RGWRESTReadResource(conn, "/admin/log", pairs, NULL, sync_env->http_manager);
+
+  init_new_io(http_op);
+
+  int ret = http_op->aio_read(dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl;
+    log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+    http_op->put();
+    http_op = NULL;
+    return set_cr_error(ret);
+  }
+
+  return io_block(0);
+}
+
+// Collect the REST response into 'data', decide whether more entries
+// remain (a full page implies truncation), and advance new_marker.
+// An empty page ends the clone for this shard (set_cr_done).
+int RGWCloneMetaLogCoroutine::state_receive_rest_response()
+{
+  int ret = http_op->wait(&data, null_yield);
+  if (ret < 0) {
+    error_stream << "http operation failed: " << http_op->to_str() << " status=" << http_op->get_http_status() << std::endl;
+    ldpp_dout(sync_env->dpp, 5) << "failed to wait for op, ret=" << ret << dendl;
+    http_op->put();
+    http_op = NULL;
+    return set_cr_error(ret);
+  }
+  http_op->put();
+  http_op = NULL;
+
+  ldpp_dout(sync_env->dpp, 20) << "remote mdlog, shard_id=" << shard_id << " num of shard entries: " << data.entries.size() << dendl;
+
+  // a full page means there may be more entries to fetch
+  truncated = ((int)data.entries.size() == max_entries);
+
+  if (data.entries.empty()) {
+    if (new_marker) {
+      *new_marker = marker;
+    }
+    return set_cr_done();
+  }
+
+  if (new_marker) {
+    *new_marker = data.entries.back().id;
+  }
+
+  return 0;
+}
+
+
+// Convert the fetched remote entries to cls_log_entry records, advance
+// 'marker' to the last converted id, and write them to the local mdlog
+// shard asynchronously; blocks until the rados completion fires.
+int RGWCloneMetaLogCoroutine::state_store_mdlog_entries()
+{
+  list<cls_log_entry> dest_entries;
+
+  vector<rgw_mdlog_entry>::iterator iter;
+  for (iter = data.entries.begin(); iter != data.entries.end(); ++iter) {
+    rgw_mdlog_entry& entry = *iter;
+    ldpp_dout(sync_env->dpp, 20) << "entry: name=" << entry.name << dendl;
+
+    cls_log_entry dest_entry;
+    dest_entry.id = entry.id;
+    dest_entry.section = entry.section;
+    dest_entry.name = entry.name;
+    dest_entry.timestamp = utime_t(entry.timestamp);
+  
+    encode(entry.log_data, dest_entry.data);
+
+    dest_entries.push_back(dest_entry);
+
+    marker = entry.id;
+  }
+
+  RGWAioCompletionNotifier *cn = stack->create_completion_notifier();
+
+  int ret = mdlog->store_entries_in_shard(sync_env->dpp, dest_entries, shard_id, cn->completion());
+  if (ret < 0) {
+    cn->put();
+    ldpp_dout(sync_env->dpp, 10) << "failed to store md log entries shard_id=" << shard_id << " ret=" << ret << dendl;
+    return set_cr_error(ret);
+  }
+  return io_block(0);
+}
+
+// Terminal state: the local store completed, mark the coroutine done.
+int RGWCloneMetaLogCoroutine::state_store_mdlog_entries_complete()
+{
+  return set_cr_done();
+}
+
+// JSON decoder for rgw_meta_sync_info; maps the "status" string back to
+// the SyncState enum (unrecognized strings leave 'state' unchanged).
+void rgw_meta_sync_info::decode_json(JSONObj *obj)
+{
+  string s;
+  JSONDecoder::decode_json("status", s, obj);
+  if (s == "init") {
+    state = StateInit;
+  } else if (s == "building-full-sync-maps") {
+    state = StateBuildingFullSyncMaps;
+  } else if (s == "sync") {
+    state = StateSync;
+  }
+  JSONDecoder::decode_json("num_shards", num_shards, obj);
+  JSONDecoder::decode_json("period", period, obj);
+  JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
+}
+
+// JSON encoder for rgw_meta_sync_info; the "status" strings here must
+// stay in sync with the ones accepted by decode_json() above.
+void rgw_meta_sync_info::dump(Formatter *f) const
+{
+  string s;
+  switch ((SyncState)state) {
+  case StateInit:
+    s = "init";
+    break;
+  case StateBuildingFullSyncMaps:
+    s = "building-full-sync-maps";
+    break;
+  case StateSync:
+    s = "sync";
+    break;
+  default:
+    s = "unknown";
+    break;
+  }
+  encode_json("status", s, f);
+  encode_json("num_shards", num_shards, f);
+  encode_json("period", period, f);
+  encode_json("realm_epoch", realm_epoch, f);
+}
+
+
+// JSON decoder for a per-shard sync marker. "state" round-trips through
+// an int and "timestamp" through utime_t to match dump()'s encoding.
+void rgw_meta_sync_marker::decode_json(JSONObj *obj)
+{
+  int s;
+  JSONDecoder::decode_json("state", s, obj);
+  state = s;
+  JSONDecoder::decode_json("marker", marker, obj);
+  JSONDecoder::decode_json("next_step_marker", next_step_marker, obj);
+  JSONDecoder::decode_json("total_entries", total_entries, obj);
+  JSONDecoder::decode_json("pos", pos, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("timestamp", ut, obj);
+  timestamp = ut.to_real_time();
+  JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
+}
+
+// JSON encoder for a per-shard sync marker; mirrors decode_json() above.
+void rgw_meta_sync_marker::dump(Formatter *f) const
+{
+  encode_json("state", (int)state, f);
+  encode_json("marker", marker, f);
+  encode_json("next_step_marker", next_step_marker, f);
+  encode_json("total_entries", total_entries, f);
+  encode_json("pos", pos, f);
+  encode_json("timestamp", utime_t(timestamp), f);
+  encode_json("realm_epoch", realm_epoch, f);
+}
+
+// JSON round-trip for the aggregate sync status (global info + per-shard
+// markers) and JSON encoding for sync error-log entries.
+void rgw_meta_sync_status::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("info", sync_info, obj);
+  JSONDecoder::decode_json("markers", sync_markers, obj);
+}
+
+void rgw_meta_sync_status::dump(Formatter *f) const {
+  encode_json("info", sync_info, f);
+  encode_json("markers", sync_markers, f);
+}
+
+void rgw_sync_error_info::dump(Formatter *f) const {
+  encode_json("source_zone", source_zone, f);
+  encode_json("error_code", error_code, f);
+  encode_json("message", message, f);
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync.h b/src/rgw/driver/rados/rgw_sync.h
new file mode 100644 (file)
index 0000000..8c4e511
--- /dev/null
@@ -0,0 +1,549 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_SYNC_H
+#define CEPH_RGW_SYNC_H
+
+#include <atomic>
+
+#include "include/stringify.h"
+
+#include "rgw_coroutine.h"
+#include "rgw_http_client.h"
+#include "rgw_metadata.h"
+#include "rgw_meta_sync_status.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_sync_trace.h"
+#include "rgw_mdlog.h"
+
+#define ERROR_LOGGER_SHARDS 32
+#define RGW_SYNC_ERROR_LOG_SHARD_PREFIX "sync.error-log"
+
+// Summary of the master's metadata log, as returned by the admin REST API.
+struct rgw_mdlog_info {
+  uint32_t num_shards;
+  std::string period; //< period id of the master's oldest metadata log
+  epoch_t realm_epoch; //< realm epoch of oldest metadata log
+
+  rgw_mdlog_info() : num_shards(0), realm_epoch(0) {}
+
+  void decode_json(JSONObj *obj);
+};
+
+
+// One metadata log entry as exchanged over REST.
+struct rgw_mdlog_entry {
+  std::string id;
+  std::string section;
+  std::string name;
+  ceph::real_time timestamp;
+  RGWMetadataLogData log_data;
+
+  void decode_json(JSONObj *obj);
+
+  // Convert from the on-disk cls log representation; returns false if the
+  // embedded log_data blob fails to decode.
+  bool convert_from(cls_log_entry& le) {
+    id = le.id;
+    section = le.section;
+    name = le.name;
+    timestamp = le.timestamp.to_real_time();
+    try {
+      auto iter = le.data.cbegin();
+      decode(log_data, iter);
+    } catch (buffer::error& err) {
+      return false;
+    }
+    return true;
+  }
+};
+
+// One page of mdlog entries for a shard: the continuation marker, whether
+// more entries remain, and the entries themselves.
+struct rgw_mdlog_shard_data {
+  std::string marker;
+  bool truncated;
+  std::vector<rgw_mdlog_entry> entries;
+
+  void decode_json(JSONObj *obj);
+};
+
+class RGWAsyncRadosProcessor;
+class RGWMetaSyncStatusManager;
+class RGWMetaSyncCR;
+class RGWRESTConn;
+class RGWSyncTraceManager;
+
+// Writes sync error records to a set of sharded rados objects; the shard
+// for each record is chosen round-robin via the atomic counter.
+class RGWSyncErrorLogger {
+  rgw::sal::RadosStore* store;
+
+  std::vector<std::string> oids;
+  int num_shards;
+
+  std::atomic<int64_t> counter = { 0 };
+public:
+  RGWSyncErrorLogger(rgw::sal::RadosStore* _store, const std::string &oid_prefix, int _num_shards);
+  RGWCoroutine *log_error_cr(const DoutPrefixProvider *dpp, const std::string& source_zone, const std::string& section, const std::string& name, uint32_t error_code, const std::string& message);
+
+  static std::string get_shard_oid(const std::string& oid_prefix, int shard_id);
+};
+
+// A single sync error record (origin zone, numeric code, free-form
+// message) with standard Ceph encode/decode for rados storage.
+struct rgw_sync_error_info {
+  std::string source_zone;
+  uint32_t error_code;
+  std::string message;
+
+  rgw_sync_error_info() : error_code(0) {}
+  rgw_sync_error_info(const std::string& _source_zone, uint32_t _error_code, const std::string& _message) : source_zone(_source_zone), error_code(_error_code), message(_message) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(source_zone, bl);
+    encode(error_code, bl);
+    encode(message, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(source_zone, bl);
+    decode(error_code, bl);
+    decode(message, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_sync_error_info)
+
+#define DEFAULT_BACKOFF_MAX 30
+
+// Simple retry backoff helper: backoff_sleep()/backoff() wait for the
+// current interval (capped at max_secs); reset() returns to no delay.
+class RGWSyncBackoff {
+  int cur_wait;
+  int max_secs;
+
+  void update_wait_time();
+public:
+  explicit RGWSyncBackoff(int _max_secs = DEFAULT_BACKOFF_MAX) : cur_wait(0), max_secs(_max_secs) {}
+
+  void backoff_sleep();
+  void reset() {
+    cur_wait = 0;
+  }
+
+  // coroutine-friendly variant: blocks the given coroutine instead of the thread
+  void backoff(RGWCoroutine *op);
+};
+
+// Wrapper coroutine that repeatedly runs the coroutine produced by
+// alloc_cr() with backoff between attempts; subclasses supply the inner
+// coroutine. 'reset_backoff' lets the inner cr ask for the backoff to be
+// cleared, and 'exit_on_error' controls whether failures are terminal.
+class RGWBackoffControlCR : public RGWCoroutine
+{
+  RGWCoroutine *cr;
+  ceph::mutex lock;  // guards access to 'cr' (see cr_lock())
+
+  RGWSyncBackoff backoff;
+  bool reset_backoff;
+
+  bool exit_on_error;
+
+protected:
+  bool *backoff_ptr() {
+    return &reset_backoff;
+  }
+
+  ceph::mutex& cr_lock() {
+    return lock;
+  }
+
+  RGWCoroutine *get_cr() {
+    return cr;
+  }
+
+public:
+  RGWBackoffControlCR(CephContext *_cct, bool _exit_on_error)
+    : RGWCoroutine(_cct),
+      cr(nullptr),
+      lock(ceph::make_mutex("RGWBackoffControlCR::lock:" + stringify(this))),
+      reset_backoff(false), exit_on_error(_exit_on_error) {
+  }
+
+  ~RGWBackoffControlCR() override {
+    if (cr) {
+      cr->put();
+    }
+  }
+
+  virtual RGWCoroutine *alloc_cr() = 0;
+  virtual RGWCoroutine *alloc_finisher_cr() { return NULL; }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Shared context handed to every metadata sync coroutine: store handles,
+// REST connection to the master, async processors, and loggers/tracers.
+struct RGWMetaSyncEnv {
+  const DoutPrefixProvider *dpp;
+  CephContext *cct{nullptr};
+  rgw::sal::RadosStore* store{nullptr};
+  RGWRESTConn *conn{nullptr};
+  RGWAsyncRadosProcessor *async_rados{nullptr};
+  RGWHTTPManager *http_manager{nullptr};
+  RGWSyncErrorLogger *error_logger{nullptr};
+  RGWSyncTraceManager *sync_tracer{nullptr};
+
+  RGWMetaSyncEnv() {}
+
+  void init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _store, RGWRESTConn *_conn,
+            RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
+            RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer);
+
+  // rados object names for per-shard markers and the global status object
+  std::string shard_obj_name(int shard_id);
+  std::string status_oid();
+};
+
+// Drives metadata sync against the master zone's metadata log: reads log
+// info and shard state over REST, and runs the full/incremental sync
+// state machine (run_sync). Owns the coroutine manager it runs on.
+class RGWRemoteMetaLog : public RGWCoroutinesManager {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  RGWRESTConn *conn;
+  RGWAsyncRadosProcessor *async_rados;
+
+  RGWHTTPManager http_manager;
+  RGWMetaSyncStatusManager *status_manager;
+  RGWSyncErrorLogger *error_logger{nullptr};
+  RGWSyncTraceManager *sync_tracer{nullptr};
+
+  RGWMetaSyncCR *meta_sync_cr{nullptr};
+
+  RGWSyncBackoff backoff;
+
+  RGWMetaSyncEnv sync_env;
+
+  void init_sync_env(RGWMetaSyncEnv *env);
+  int store_sync_info(const DoutPrefixProvider *dpp, const rgw_meta_sync_info& sync_info);
+
+  // set by finish(); checked by run_sync()'s loops for clean shutdown
+  std::atomic<bool> going_down = { false };
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWRemoteMetaLog(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* _store,
+                   RGWAsyncRadosProcessor *async_rados,
+                   RGWMetaSyncStatusManager *_sm)
+    : RGWCoroutinesManager(_store->ctx(), _store->getRados()->get_cr_registry()),
+      dpp(dpp), store(_store), conn(NULL), async_rados(async_rados),
+      http_manager(store->ctx(), completion_mgr),
+      status_manager(_sm) {}
+
+  virtual ~RGWRemoteMetaLog() override;
+
+  int init();
+  void finish();
+
+  int read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info);
+  int read_master_log_shards_info(const DoutPrefixProvider *dpp, const std::string& master_period, std::map<int, RGWMetadataLogInfo> *shards_info);
+  int read_master_log_shards_next(const DoutPrefixProvider *dpp, const std::string& period, std::map<int, std::string> shard_markers, std::map<int, rgw_mdlog_shard_data> *result);
+  int read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status);
+  int init_sync_status(const DoutPrefixProvider *dpp);
+  int run_sync(const DoutPrefixProvider *dpp, optional_yield y);
+
+  void wakeup(int shard_id);
+
+  RGWMetaSyncEnv& get_sync_env() {
+    return sync_env;
+  }
+};
+
+// Public facade for metadata sync: owns the RGWRemoteMetaLog and forwards
+// status/init/run/wakeup requests to it. Also serves as the
+// DoutPrefixProvider for the sync log output.
+class RGWMetaSyncStatusManager : public DoutPrefixProvider {
+  rgw::sal::RadosStore* store;
+  librados::IoCtx ioctx;
+
+  RGWRemoteMetaLog master_log;
+
+  std::map<int, rgw_raw_obj> shard_objs;
+
+  // orders shards by timestamp, breaking ties by shard id
+  struct utime_shard {
+    real_time ts;
+    int shard_id;
+
+    utime_shard() : shard_id(-1) {}
+
+    bool operator<(const utime_shard& rhs) const {
+      if (ts == rhs.ts) {
+       return shard_id < rhs.shard_id;
+      }
+      return ts < rhs.ts;
+    }
+  };
+
+  ceph::shared_mutex ts_to_shard_lock = ceph::make_shared_mutex("ts_to_shard_lock");
+  std::map<utime_shard, int> ts_to_shard;
+  std::vector<std::string> clone_markers;
+
+public:
+  RGWMetaSyncStatusManager(rgw::sal::RadosStore* _store, RGWAsyncRadosProcessor *async_rados)
+    : store(_store), master_log(this, store, async_rados, this)
+  {}
+
+  virtual ~RGWMetaSyncStatusManager() override;
+
+  int init(const DoutPrefixProvider *dpp);
+
+  // thin forwarding wrappers around master_log
+  int read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status) {
+    return master_log.read_sync_status(dpp, sync_status);
+  }
+  int init_sync_status(const DoutPrefixProvider *dpp) { return master_log.init_sync_status(dpp); }
+  int read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info) {
+    return master_log.read_log_info(dpp, log_info);
+  }
+  int read_master_log_shards_info(const DoutPrefixProvider *dpp, const std::string& master_period, std::map<int, RGWMetadataLogInfo> *shards_info) {
+    return master_log.read_master_log_shards_info(dpp, master_period, shards_info);
+  }
+  int read_master_log_shards_next(const DoutPrefixProvider *dpp, const std::string& period, std::map<int, std::string> shard_markers, std::map<int, rgw_mdlog_shard_data> *result) {
+    return master_log.read_master_log_shards_next(dpp, period, shard_markers, result);
+  }
+
+  int run(const DoutPrefixProvider *dpp, optional_yield y) { return master_log.run_sync(dpp, y); }
+
+
+  // implements DoutPrefixProvider
+  CephContext *get_cct() const override { return store->ctx(); }
+  unsigned get_subsys() const override;
+  std::ostream& gen_prefix(std::ostream& out) const override;
+
+  void wakeup(int shard_id) { return master_log.wakeup(shard_id); }
+  void stop() {
+    master_log.finish();
+  }
+};
+
+// Interface for a coroutine that serializes calls to other coroutines.
+class RGWOrderCallCR : public RGWCoroutine
+{
+public:
+  RGWOrderCallCR(CephContext *cct) : RGWCoroutine(cct) {}
+
+  virtual void call_cr(RGWCoroutine *_cr) = 0;
+};
+
+// Ordering policy where a newly submitted coroutine replaces any pending
+// one that has not run yet (only the most recent submission runs).
+class RGWLastCallerWinsCR : public RGWOrderCallCR
+{
+  RGWCoroutine *cr{nullptr};
+
+public:
+  explicit RGWLastCallerWinsCR(CephContext *cct) : RGWOrderCallCR(cct) {}
+  ~RGWLastCallerWinsCR() {
+    if (cr) {
+      cr->put();
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+
+  // drop the previously pending cr (if any) and queue the new one
+  void call_cr(RGWCoroutine *_cr) override {
+    if (cr) {
+      cr->put();
+    }
+    cr = _cr;
+  }
+};
+
+// Tracks sync progress markers for one shard while entries are processed
+// out of order. Entries are start()ed into 'pending' and moved to
+// 'finish_markers' on finish(); flush() persists the highest marker that
+// is safe (i.e. below the lowest still-pending entry), batched by
+// 'window_size'. T is the marker type, K the retry-set key type.
+template <class T, class K>
+class RGWSyncShardMarkerTrack {
+  struct marker_entry {
+    uint64_t pos;
+    real_time timestamp;
+
+    marker_entry() : pos(0) {}
+    marker_entry(uint64_t _p, const real_time& _ts) : pos(_p), timestamp(_ts) {}
+  };
+  typename std::map<T, marker_entry> pending;
+
+  std::map<T, marker_entry> finish_markers;
+
+  int window_size;
+  int updates_since_flush;
+
+  RGWOrderCallCR *order_cr{nullptr};
+
+protected:
+  typename std::set<K> need_retry_set;
+
+  virtual RGWCoroutine *store_marker(const T& new_marker, uint64_t index_pos, const real_time& timestamp) = 0;
+  virtual RGWOrderCallCR *allocate_order_control_cr() = 0;
+  virtual void handle_finish(const T& marker) { }
+
+public:
+  RGWSyncShardMarkerTrack(int _window_size) : window_size(_window_size), updates_since_flush(0) {}
+  virtual ~RGWSyncShardMarkerTrack() {
+    if (order_cr) {
+      order_cr->put();
+    }
+  }
+
+  // Register an in-flight entry; returns false if it is already pending.
+  bool start(const T& pos, int index_pos, const real_time& timestamp) {
+    if (pending.find(pos) != pending.end()) {
+      return false;
+    }
+    pending[pos] = marker_entry(index_pos, timestamp);
+    return true;
+  }
+
+  // Record a completed position without it having gone through start().
+  void try_update_high_marker(const T& pos, int index_pos, const real_time& timestamp) {
+    finish_markers[pos] = marker_entry(index_pos, timestamp);
+  }
+
+  // Mark 'pos' complete. Returns a flush coroutine when completing the
+  // lowest pending entry and the flush window is due, otherwise NULL.
+  RGWCoroutine *finish(const T& pos) {
+    if (pending.empty()) {
+      /* can happen, due to a bug that ended up with multiple objects with the same name and version
+       * -- which can happen when versioning is enabled an the version is 'null'.
+       */
+      return NULL;
+    }
+
+    typename std::map<T, marker_entry>::iterator iter = pending.begin();
+
+    bool is_first = (pos == iter->first);
+
+    typename std::map<T, marker_entry>::iterator pos_iter = pending.find(pos);
+    if (pos_iter == pending.end()) {
+      /* see pending.empty() comment */
+      return NULL;
+    }
+
+    finish_markers[pos] = pos_iter->second;
+
+    pending.erase(pos);
+
+    handle_finish(pos);
+
+    updates_since_flush++;
+
+    if (is_first && (updates_since_flush >= window_size || pending.empty())) {
+      return flush();
+    }
+    return NULL;
+  }
+
+  // Persist the highest completed marker that is below every pending
+  // entry; completed markers at or above a pending one must wait.
+  RGWCoroutine *flush() {
+    if (finish_markers.empty()) {
+      return NULL;
+    }
+
+    typename std::map<T, marker_entry>::iterator i;
+
+    if (pending.empty()) {
+      i = finish_markers.end();
+    } else {
+      i = finish_markers.lower_bound(pending.begin()->first);
+    }
+    if (i == finish_markers.begin()) {
+      return NULL;
+    }
+    updates_since_flush = 0;
+
+    auto last = i;
+    --i;
+    const T& high_marker = i->first;
+    marker_entry& high_entry = i->second;
+    RGWCoroutine *cr = order(store_marker(high_marker, high_entry.pos, high_entry.timestamp));
+    finish_markers.erase(finish_markers.begin(), last);
+    return cr;
+  }
+
+  /*
+   * a key needs retry if it was processing when another marker that points
+   * to the same bucket shards arrives. Instead of processing it, we mark
+   * it as need_retry so that when we finish processing the original, we
+   * retry the processing on the same bucket shard, in case there are more
+   * entries to process. This closes a race that can happen.
+   */
+  bool need_retry(const K& key) {
+    return (need_retry_set.find(key) != need_retry_set.end());
+  }
+
+  void set_need_retry(const K& key) {
+    need_retry_set.insert(key);
+  }
+
+  void reset_need_retry(const K& key) {
+    need_retry_set.erase(key);
+  }
+
+  RGWCoroutine *order(RGWCoroutine *cr) {
+    /* either returns a new RGWLastWriteWinsCR, or update existing one, in which case it returns
+     * nothing and the existing one will call the cr
+     */
+    if (order_cr && order_cr->is_done()) {
+      order_cr->put();
+      order_cr = nullptr;
+    }
+    if (!order_cr) {
+      order_cr = allocate_order_control_cr();
+      order_cr->get();
+      order_cr->call_cr(cr);
+      return order_cr;
+    }
+    order_cr->call_cr(cr);
+    return nullptr; /* don't call it a second time */
+  }
+};
+
+class RGWMetaSyncShardMarkerTrack;
+
+// Coroutine that syncs a single metadata entry (identified by raw_key)
+// and reports progress to the shard's marker tracker.
+class RGWMetaSyncSingleEntryCR : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+
+  std::string raw_key;
+  std::string entry_marker;
+  RGWMDLogStatus op_status;
+
+  ssize_t pos;
+  std::string section;
+  std::string key;
+
+  int sync_status;
+
+  bufferlist md_bl;
+
+  RGWMetaSyncShardMarkerTrack *marker_tracker;
+
+  int tries;
+
+  bool error_injection;
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env,
+                           const std::string& _raw_key, const std::string& _entry_marker,
+                           const RGWMDLogStatus& _op_status,
+                           RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent);
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Base coroutine that runs children produced by spawn_next() with at most
+// max_concurrent in flight, collecting each child's result.
+class RGWShardCollectCR : public RGWCoroutine {
+  int current_running = 0;
+ protected:
+  int max_concurrent;
+  int status = 0;
+
+  // called with the result of each child. error codes can be ignored by
+  // returning 0. if handle_result() returns a negative value, it's
+  // treated as an error and stored in 'status'. the last such error is
+  // reported to the caller with set_cr_error()
+  virtual int handle_result(int r) = 0;
+ public:
+  RGWShardCollectCR(CephContext *_cct, int _max_concurrent)
+    : RGWCoroutine(_cct), max_concurrent(_max_concurrent)
+  {}
+
+  // returns false when there are no more children to spawn
+  virtual bool spawn_next() = 0;
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// factory functions for meta sync coroutines needed in mdlog trimming
+
+// read a remote mdlog shard's info (marker/last update) into *info
+RGWCoroutine* create_read_remote_mdlog_shard_info_cr(RGWMetaSyncEnv *env,
+                                                     const std::string& period,
+                                                     int shard_id,
+                                                     RGWMetadataLogInfo* info);
+
+// list up to max_entries of a remote mdlog shard starting at 'marker'
+RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env,
+                                                const std::string& period,
+                                                int shard_id,
+                                                const std::string& marker,
+                                                uint32_t max_entries,
+                                                rgw_mdlog_shard_data *result);
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_sync_counters.cc b/src/rgw/driver/rados/rgw_sync_counters.cc
new file mode 100644 (file)
index 0000000..1d23d58
--- /dev/null
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/ceph_context.h"
+#include "rgw_sync_counters.h"
+
+namespace sync_counters {
+
// Build the replication perf counters (ids l_first..l_last) under 'name'
// and register them with the context's perf-counters collection so they
// are visible to ceph-mgr. The returned ref keeps the counters alive.
PerfCountersRef build(CephContext *cct, const std::string& name)
{
  PerfCountersBuilder b(cct, name, l_first, l_last);

  // share these counters with ceph-mgr
  b.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);

  b.add_u64_avg(l_fetch, "fetch_bytes", "Number of object bytes replicated");
  b.add_u64_counter(l_fetch_not_modified, "fetch_not_modified", "Number of objects already replicated");
  b.add_u64_counter(l_fetch_err, "fetch_errors", "Number of object replication errors");

  b.add_time_avg(l_poll, "poll_latency", "Average latency of replication log requests");
  b.add_u64_counter(l_poll_err, "poll_errors", "Number of replication log request errors");

  auto logger = PerfCountersRef{ b.create_perf_counters(), cct };
  cct->get_perfcounters_collection()->add(logger.get());
  return logger;
}
+
+} // namespace sync_counters
diff --git a/src/rgw/driver/rados/rgw_sync_counters.h b/src/rgw/driver/rados/rgw_sync_counters.h
new file mode 100644 (file)
index 0000000..df3acc6
--- /dev/null
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "common/perf_counters_collection.h"
+
+namespace sync_counters {
+
enum {
  l_first = 805000,  // perf-counter id range reserved for sync counters

  // object replication (fetch) counters
  l_fetch,
  l_fetch_not_modified,
  l_fetch_err,

  // replication-log polling counters
  l_poll,
  l_poll_err,

  l_last,
};
+
+PerfCountersRef build(CephContext *cct, const std::string& name);
+
+} // namespace sync_counters
diff --git a/src/rgw/driver/rados/rgw_sync_error_repo.cc b/src/rgw/driver/rados/rgw_sync_error_repo.cc
new file mode 100644 (file)
index 0000000..44305b6
--- /dev/null
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include "rgw_sync_error_repo.h"
+#include "rgw_coroutine.h"
+#include "rgw_sal.h"
+#include "services/svc_rados.h"
+#include "cls/cmpomap/client.h"
+
+namespace rgw::error_repo {
+
+// prefix for the binary encoding of keys. this particular value is not
+// valid as the first byte of a utf8 code point, so we use this to
+// differentiate the binary encoding from existing string keys for
+// backward-compatibility
+constexpr uint8_t binary_key_prefix = 0x80;
+
// payload of a binary error-repo omap key: the bucket shard plus an
// optional log generation number
struct key_type {
  rgw_bucket_shard bs;
  std::optional<uint64_t> gen;
};
+
// versioned ceph encoding of key_type ('f' is the conventional feature
// flags parameter, unused here)
void encode(const key_type& k, bufferlist& bl, uint64_t f=0)
{
  ENCODE_START(1, 1, bl);
  encode(k.bs, bl);
  encode(k.gen, bl);
  ENCODE_FINISH(bl);
}
+
// versioned ceph decoding of key_type; throws buffer::error on a short
// or malformed buffer (callers translate this to an error code)
void decode(key_type& k, bufferlist::const_iterator& bl)
{
  DECODE_START(1, bl);
  decode(k.bs, bl);
  decode(k.gen, bl);
  DECODE_FINISH(bl);
}
+
// serialize bucket shard + gen into an omap key string. the leading
// binary_key_prefix byte distinguishes this encoding from legacy plain
// string keys (see the comment on binary_key_prefix above)
std::string encode_key(const rgw_bucket_shard& bs,
                       std::optional<uint64_t> gen)
{
  using ceph::encode;
  const auto key = key_type{bs, gen};
  bufferlist bl;
  encode(binary_key_prefix, bl);
  encode(key, bl);
  return bl.to_str();
}
+
// inverse of encode_key(). returns -EINVAL when the key lacks the binary
// prefix (i.e. a legacy string key), -EIO when the buffer is corrupt or
// carries trailing bytes, 0 on success.
// 'encoded' is taken by value: static_from_string() aliases the string's
// storage, so a caller-owned copy is kept alive for the decode — confirm
// against bufferlist::static_from_string()'s signature
int decode_key(std::string encoded,
               rgw_bucket_shard& bs,
               std::optional<uint64_t>& gen)
{
  using ceph::decode;
  key_type key;
  const auto bl = bufferlist::static_from_string(encoded);
  auto p = bl.cbegin();
  try {
    uint8_t prefix;
    decode(prefix, p);
    if (prefix != binary_key_prefix) {
      return -EINVAL;
    }
    decode(key, p);
  } catch (const buffer::error&) {
    return -EIO;
  }
  if (!p.end()) {
    return -EIO; // buffer contained unexpected bytes
  }
  bs = std::move(key.bs);
  gen = key.gen;
  return 0;
}
+
// decode an omap value (a raw uint64 tick count, the CMPXATTR_MODE_U64
// representation written by write()) back into a timestamp; an empty or
// corrupt buffer decodes as time zero
ceph::real_time decode_value(const bufferlist& bl)
{
  uint64_t value;
  try {
    using ceph::decode;
    decode(value, bl);
  } catch (const buffer::error&) {
    value = 0; // empty buffer = 0
  }
  return ceph::real_clock::zero() + ceph::timespan(value);
}
+
// stage a conditional omap write on 'op': set key -> timestamp only when
// the new timestamp is greater than the stored one; a missing key
// compares as 0, so the first write for a key always succeeds
int write(librados::ObjectWriteOperation& op,
          const std::string& key,
          ceph::real_time timestamp)
{
  // overwrite the existing timestamp if value is greater
  const uint64_t value = timestamp.time_since_epoch().count();
  using namespace ::cls::cmpomap;
  const bufferlist zero = u64_buffer(0); // compare against 0 for missing keys
  return cmp_set_vals(op, Mode::U64, Op::GT, {{key, u64_buffer(value)}}, zero);
}
+
// stage a conditional omap removal on 'op': drop the key only when the
// given timestamp is >= the stored one, so a concurrent newer write is
// not lost
int remove(librados::ObjectWriteOperation& op,
           const std::string& key,
           ceph::real_time timestamp)
{
  // remove the omap key if value >= existing
  const uint64_t value = timestamp.time_since_epoch().count();
  using namespace ::cls::cmpomap;
  return cmp_rm_keys(op, Mode::U64, Op::GTE, {{key, u64_buffer(value)}});
}
+
// coroutine wrapper around write(): stages the conditional omap update
// and submits it to rados asynchronously
class RGWErrorRepoWriteCR : public RGWSimpleCoroutine {
  RGWSI_RADOS::Obj obj;
  std::string key;
  ceph::real_time timestamp;

  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
 public:
  RGWErrorRepoWriteCR(RGWSI_RADOS* rados, const rgw_raw_obj& raw_obj,
                      const std::string& key, ceph::real_time timestamp)
    : RGWSimpleCoroutine(rados->ctx()),
      obj(rados->obj(raw_obj)),
      key(key), timestamp(timestamp)
  {}

  // build the op, open the object handle, then issue the aio write with
  // a completion notifier that wakes this coroutine
  int send_request(const DoutPrefixProvider *dpp) override {
    librados::ObjectWriteOperation op;
    int r = write(op, key, timestamp);
    if (r < 0) {
      return r;
    }
    r = obj.open(dpp);
    if (r < 0) {
      return r;
    }

    cn = stack->create_completion_notifier();
    return obj.aio_operate(cn->completion(), &op);
  }

  // surface the aio result as this coroutine's return value
  int request_complete() override {
    return cn->completion()->get_return_value();
  }
};
+
// factory for the asynchronous error-repo write coroutine (see
// RGWErrorRepoWriteCR); caller owns the returned coroutine
RGWCoroutine* write_cr(RGWSI_RADOS* rados,
                       const rgw_raw_obj& obj,
                       const std::string& key,
                       ceph::real_time timestamp)
{
  return new RGWErrorRepoWriteCR(rados, obj, key, timestamp);
}
+
+
// coroutine wrapper around remove(): stages the conditional omap key
// removal and submits it to rados asynchronously
class RGWErrorRepoRemoveCR : public RGWSimpleCoroutine {
  RGWSI_RADOS::Obj obj;
  std::string key;
  ceph::real_time timestamp;

  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
 public:
  RGWErrorRepoRemoveCR(RGWSI_RADOS* rados, const rgw_raw_obj& raw_obj,
                       const std::string& key, ceph::real_time timestamp)
    : RGWSimpleCoroutine(rados->ctx()),
      obj(rados->obj(raw_obj)),
      key(key), timestamp(timestamp)
  {}

  // build the op, open the object handle, then issue the aio removal with
  // a completion notifier that wakes this coroutine
  int send_request(const DoutPrefixProvider *dpp) override {
    librados::ObjectWriteOperation op;
    int r = remove(op, key, timestamp);
    if (r < 0) {
      return r;
    }
    r = obj.open(dpp);
    if (r < 0) {
      return r;
    }

    cn = stack->create_completion_notifier();
    return obj.aio_operate(cn->completion(), &op);
  }

  // surface the aio result as this coroutine's return value
  int request_complete() override {
    return cn->completion()->get_return_value();
  }
};
+
// factory for the asynchronous error-repo removal coroutine (see
// RGWErrorRepoRemoveCR); caller owns the returned coroutine
RGWCoroutine* remove_cr(RGWSI_RADOS* rados,
                        const rgw_raw_obj& obj,
                        const std::string& key,
                        ceph::real_time timestamp)
{
  return new RGWErrorRepoRemoveCR(rados, obj, key, timestamp);
}
+
+} // namespace rgw::error_repo
diff --git a/src/rgw/driver/rados/rgw_sync_error_repo.h b/src/rgw/driver/rados/rgw_sync_error_repo.h
new file mode 100644 (file)
index 0000000..60525d2
--- /dev/null
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#pragma once
+
+#include <optional>
+#include "include/rados/librados_fwd.hpp"
+#include "include/buffer_fwd.h"
+#include "common/ceph_time.h"
+
+class RGWSI_RADOS;
+class RGWCoroutine;
+struct rgw_raw_obj;
+struct rgw_bucket_shard;
+
+namespace rgw::error_repo {
+
+// binary-encode a bucket/shard/gen and return it as a string
+std::string encode_key(const rgw_bucket_shard& bs,
+                       std::optional<uint64_t> gen);
+
+// try to decode a key. returns -EINVAL if not in binary format
+int decode_key(std::string encoded,
+               rgw_bucket_shard& bs,
+               std::optional<uint64_t>& gen);
+
+// decode a timestamp as a uint64_t for CMPXATTR_MODE_U64
+ceph::real_time decode_value(const ceph::bufferlist& bl);
+
+// write an omap key iff the given timestamp is newer
+int write(librados::ObjectWriteOperation& op,
+          const std::string& key,
+          ceph::real_time timestamp);
+RGWCoroutine* write_cr(RGWSI_RADOS* rados,
+                       const rgw_raw_obj& obj,
+                       const std::string& key,
+                       ceph::real_time timestamp);
+
+// remove an omap key iff there isn't a newer timestamp
+int remove(librados::ObjectWriteOperation& op,
+           const std::string& key,
+           ceph::real_time timestamp);
+RGWCoroutine* remove_cr(RGWSI_RADOS* rados,
+                        const rgw_raw_obj& obj,
+                        const std::string& key,
+                        ceph::real_time timestamp);
+
+} // namespace rgw::error_repo
diff --git a/src/rgw/driver/rados/rgw_sync_module.cc b/src/rgw/driver/rados/rgw_sync_module.cc
new file mode 100644 (file)
index 0000000..5a1e70b
--- /dev/null
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_bucket.h"
+
+#include "rgw_sync_module_log.h"
+#include "rgw_sync_module_es.h"
+#include "rgw_sync_module_aws.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
// default bucket metadata handler; virtual so sync modules can substitute
// their own handler
RGWMetadataHandler *RGWSyncModuleInstance::alloc_bucket_meta_handler()
{
  return RGWBucketMetaHandlerAllocator::alloc();
}
+
// default bucket-instance metadata handler; virtual so sync modules can
// substitute their own handler
RGWBucketInstanceMetadataHandlerBase* RGWSyncModuleInstance::alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver)
{
  return RGWBucketInstanceMetaHandlerAllocator::alloc(driver);
}
+
// callback-base ctor: captures the sync context plus the source
// bucket/key whose stat result will be delivered via set_result()
RGWStatRemoteObjCBCR::RGWStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
                       rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCoroutine(_sc->cct),
                                                          sc(_sc), sync_env(_sc->env),
                                                          src_bucket(_src_bucket), key(_key) {
}
+
// driver-coroutine ctor: captures the sync context plus the source
// bucket/key to stat on the remote zone
RGWCallStatRemoteObjCR::RGWCallStatRemoteObjCR(RGWDataSyncCtx *_sc,
                                               rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCoroutine(_sc->cct),
                                                                                                 sc(_sc), sync_env(_sc->env),
                                                                                                 src_bucket(_src_bucket), key(_key) {
}
+
// stat the object on the remote (source) zone, then hand the result to
// the subclass-provided callback coroutine, propagating any error
int RGWCallStatRemoteObjCR::operate(const DoutPrefixProvider *dpp) {
  reenter(this) {
    yield {
      // fetch mtime/size/etag/attrs/headers from the source zone
      call(new RGWStatRemoteObjCR(sync_env->async_rados, sync_env->driver,
                                  sc->source_zone,
                                  src_bucket, key, &mtime, &size, &etag, &attrs, &headers));
    }
    if (retcode < 0) {
      ldpp_dout(dpp, 10) << "RGWStatRemoteObjCR() returned " << retcode << dendl;
      return set_cr_error(retcode);
    }
    ldpp_dout(dpp, 20) << "stat of remote obj: z=" << sc->source_zone
                             << " b=" << src_bucket << " k=" << key
                             << " size=" << size << " mtime=" << mtime << dendl;
    yield {
      // a null callback means the subclass has no post-stat work to do
      RGWStatRemoteObjCBCR *cb = allocate_callback();
      if (cb) {
        cb->set_result(mtime, size, etag, std::move(attrs), std::move(headers));
        call(cb);
      }
    }
    if (retcode < 0) {
      ldpp_dout(dpp, 10) << "RGWStatRemoteObjCR() callback returned " << retcode << dendl;
      return set_cr_error(retcode);
    }
    return set_cr_done();
  }
  return 0;
}
+
// register every built-in sync module with the manager; "rgw" is the
// default module used when a zone's tier type is unset
void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager)
{
  // plain zone-to-zone replication (default)
  RGWSyncModuleRef default_module(std::make_shared<RGWDefaultSyncModule>());
  modules_manager->register_module("rgw", default_module, true);

  RGWSyncModuleRef archive_module(std::make_shared<RGWArchiveSyncModule>());
  modules_manager->register_module("archive", archive_module);

  RGWSyncModuleRef log_module(std::make_shared<RGWLogSyncModule>());
  modules_manager->register_module("log", log_module);

  RGWSyncModuleRef es_module(std::make_shared<RGWElasticSyncModule>());
  modules_manager->register_module("elasticsearch", es_module);

  // sync to an external S3-compatible endpoint
  RGWSyncModuleRef aws_module(std::make_shared<RGWAWSSyncModule>());
  modules_manager->register_module("cloud", aws_module);
}
diff --git a/src/rgw/driver/rados/rgw_sync_module.h b/src/rgw/driver/rados/rgw_sync_module.h
new file mode 100644 (file)
index 0000000..6d974c3
--- /dev/null
@@ -0,0 +1,202 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_SYNC_MODULE_H
+#define CEPH_RGW_SYNC_MODULE_H
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+
+class RGWBucketInfo;
+class RGWRemoteDataLog;
+struct RGWDataSyncCtx;
+struct RGWDataSyncEnv;
+struct rgw_bucket_entry_owner;
+struct rgw_obj_key;
+struct rgw_bucket_sync_pipe;
+
+
// Hook interface a sync module implements to control how data-sync
// applies remote changes locally: object creation, removal, and delete
// markers. Returning nullptr from the optional hooks means "nothing to
// do" for that phase.
class RGWDataSyncModule {
public:
  RGWDataSyncModule() {}
  virtual ~RGWDataSyncModule() {}

  // one-time setup hook for this sync instance
  virtual void init(RGWDataSyncCtx *sync_env, uint64_t instance_id) {}

  // optional coroutine run when (full) sync is initialized
  virtual RGWCoroutine *init_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) {
    return nullptr;
  }

  // optional coroutine run when sync starts
  virtual RGWCoroutine *start_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) {
    return nullptr;
  }
  // apply a remote object create/update locally
  virtual RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace) = 0;
  // apply a remote object removal locally
  virtual RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& bucket_info, rgw_obj_key& key, real_time& mtime,
                                      bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0;
  // apply a remote delete marker locally (versioned buckets)
  virtual RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& bucket_info, rgw_obj_key& key, real_time& mtime,
                                             rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0;
};
+
+class RGWRESTMgr;
+class RGWMetadataHandler;
+class RGWBucketInstanceMetadataHandlerBase;
+
// A configured instance of a sync module. Provides the data handler that
// drives sync, and optional hooks to filter REST dialects and metadata
// handlers.
class RGWSyncModuleInstance {
public:
  RGWSyncModuleInstance() {}
  virtual ~RGWSyncModuleInstance() {}
  virtual RGWDataSyncModule *get_data_handler() = 0;
  // let the module wrap/replace the REST manager for a dialect; the
  // default passes 'orig' through unchanged
  virtual RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) {
    return orig;
  }
  // whether user write operations are allowed on a zone running this
  // module (default: read-only)
  virtual bool supports_user_writes() {
    return false;
  }
  virtual RGWMetadataHandler *alloc_bucket_meta_handler();
  virtual RGWBucketInstanceMetadataHandlerBase *alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver);

  // indication whether the sync module start with full sync (default behavior)
  // incremental sync would follow anyway
  virtual bool should_full_sync() const {
      return true;
  }
};
+
+typedef std::shared_ptr<RGWSyncModuleInstance> RGWSyncModuleInstanceRef;
+
+class JSONFormattable;
+
// Factory for sync module instances. One RGWSyncModule exists per module
// type; create_instance() builds a configured RGWSyncModuleInstance.
class RGWSyncModule {

public:
  RGWSyncModule() {}
  virtual ~RGWSyncModule() {}

  // whether zones running this module accept writes (default: no)
  virtual bool supports_writes() {
    return false;
  }
  // whether this module can act as a data source for other zones
  virtual bool supports_data_export() = 0;
  // build an instance from its JSON tier configuration
  virtual int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) = 0;
};
+
+typedef std::shared_ptr<RGWSyncModule> RGWSyncModuleRef;
+
+
+class RGWSyncModulesManager {
+  ceph::mutex lock = ceph::make_mutex("RGWSyncModulesManager");
+
+  std::map<std::string, RGWSyncModuleRef> modules;
+public:
+  RGWSyncModulesManager() = default;
+
+  void register_module(const std::string& name, RGWSyncModuleRef& module, bool is_default = false) {
+    std::lock_guard l{lock};
+    modules[name] = module;
+    if (is_default) {
+      modules[std::string()] = module;
+    }
+  }
+
+  bool get_module(const std::string& name, RGWSyncModuleRef *module) {
+    std::lock_guard l{lock};
+    auto iter = modules.find(name);
+    if (iter == modules.end()) {
+      return false;
+    }
+    if (module != nullptr) {
+      *module = iter->second;
+    }
+    return true;
+  }
+
+
+  bool supports_data_export(const std::string& name) {
+    RGWSyncModuleRef module;
+    if (!get_module(name, &module)) {
+      return false;
+    }
+
+    return module->supports_data_export();
+  }
+
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const std::string& name, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
+    RGWSyncModuleRef module;
+    if (!get_module(name, &module)) {
+      return -ENOENT;
+    }
+
+    return module.get()->create_instance(dpp, cct, config, instance);
+  }
+
+  std::vector<std::string> get_registered_module_names() const {
+    std::vector<std::string> names;
+    for (auto& i: modules) {
+      if (!i.first.empty()) {
+        names.push_back(i.first);
+      }
+    }
+    return names;
+  }
+};
+
// Base class for the callback coroutine that RGWCallStatRemoteObjCR runs
// after a successful remote stat; set_result() delivers the stat data
// before the callback's operate() runs.
class RGWStatRemoteObjCBCR : public RGWCoroutine {
protected:
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;

  rgw_bucket src_bucket;
  rgw_obj_key key;

  // stat results, populated by set_result()
  ceph::real_time mtime;
  uint64_t size = 0;
  std::string etag;
  std::map<std::string, bufferlist> attrs;
  std::map<std::string, std::string> headers;
public:
  RGWStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
                       rgw_bucket& _src_bucket, rgw_obj_key& _key);
  ~RGWStatRemoteObjCBCR() override {}

  // record the remote object's stat data (attrs/headers are moved in)
  void set_result(ceph::real_time& _mtime,
                  uint64_t _size,
                  const std::string& _etag,
                  std::map<std::string, bufferlist>&& _attrs,
                  std::map<std::string, std::string>&& _headers) {
    mtime = _mtime;
    size = _size;
    etag = _etag;
    attrs = std::move(_attrs);
    headers = std::move(_headers);
  }
};
+
// Coroutine that stats an object on the remote zone and then invokes the
// callback returned by allocate_callback(); subclasses override
// allocate_callback() to consume the stat result (nullptr = no-op).
class RGWCallStatRemoteObjCR : public RGWCoroutine {
  // stat results passed through to the callback
  ceph::real_time mtime;
  uint64_t size{0};
  std::string etag;
  std::map<std::string, bufferlist> attrs;
  std::map<std::string, std::string> headers;

protected:
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;

  rgw_bucket src_bucket;
  rgw_obj_key key;

public:
  RGWCallStatRemoteObjCR(RGWDataSyncCtx *_sc,
                     rgw_bucket& _src_bucket, rgw_obj_key& _key);

  ~RGWCallStatRemoteObjCR() override {}

  int operate(const DoutPrefixProvider *dpp) override;

  // factory for the post-stat callback; default is no callback
  virtual RGWStatRemoteObjCBCR *allocate_callback() {
    return nullptr;
  }
};
+
+void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager);
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_sync_module_aws.cc b/src/rgw/driver/rados/rgw_sync_module_aws.cc
new file mode 100644 (file)
index 0000000..6827f7f
--- /dev/null
@@ -0,0 +1,1836 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_sync_module_aws.h"
+#include "rgw_cr_rados.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rest.h"
+#include "rgw_acl.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+
+#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
+
+using namespace std;
+
+static string default_target_path = "rgw-${zonegroup}-${sid}/${bucket}";
+
// map an object key to its oid string: "<name>", or "<name>:<instance>"
// for versioned keys (null instances are omitted)
static string get_key_oid(const rgw_obj_key& key)
{
  string oid = key.name;
  if (!key.instance.empty() &&
      !key.have_null_instance()) {
    oid += string(":") + key.instance;
  }
  return oid;
}
+
// destination path on the AWS side for an object: "<bucket>/<oid>"
static string obj_to_aws_path(rgw::sal::Object* obj)
{
  string path = obj->get_bucket()->get_name() + "/" + get_key_oid(obj->get_key());


  return path;
}
+
+/*
+
+   json configuration definition:
+
+    {
+      "connection": {
+        "access_key": <access>,
+        "secret": <secret>,
+        "endpoint": <endpoint>,
+        "host_style": <path | virtual>,
+      },
+      "acls": [ { "type": <id | email | uri>,
+                  "source_id": <source_id>,
+                  "dest_id": <dest_id> } ... ],  # optional, acl mappings, no mappings if does not exist
+      "target_path": <target_path>, # override default
+           
+
+      # anything below here is for non trivial configuration 
+      # can be used in conjuction with the above
+
+      "default": {
+        "connection": {
+            "access_key": <access>,
+            "secret": <secret>,
+            "endpoint": <endpoint>,
+            "host_style" <path | virtual>,
+        },
+        "acls": [    # list of source uids and how they map into destination uids in the dest objects acls
+        {
+          "type" : <id | email | uri>,   #  optional, default is id
+          "source_id": <id>,
+          "dest_id": <id>
+        } ... ]
+        "target_path": "rgwx-${sid}/${bucket}" # how a bucket name is mapped to destination path,
+                                               # final object name will be target_path + "/" + obj
+      },
+      "connections": [
+          {
+            "id": <id>,
+            "access_key": <access>,
+            "secret": <secret>,
+            "endpoint": <endpoint>,
+          } ... ],
+      "acl_profiles": [
+          {
+            "id": <id>, # acl mappings
+            "acls": [ {
+                "type": <id | email | uri>,
+                "source_id": <id>,
+                "dest_id": <id>
+              } ... ]
+          }
+      ],
+      "profiles": [
+          {
+           "source_bucket": <source>, # can specify either specific bucket name (foo), or prefix (foo*)
+           "target_path": <dest>,   # (override default)
+           "connection_id": <connection_id>, # optional, if empty references default connection
+           "acls_id": <mappings_id>, # optional, if empty references default mappings
+          } ... ],
+    }
+
+target path optional variables:
+
+(evaluated at init)
+sid: sync instance id, randomly generated by sync process on first sync initialization
+zonegroup: zonegroup name
+zonegroup_id: zonegroup id
+zone: zone name
+zone_id: zone id
+
+(evaluated when syncing)
+bucket: bucket name
+owner: bucket owner
+
+*/
+
// one acl grantee translation: grants held by 'source_id' on the source
// zone are rewritten to 'dest_id' on the destination
struct ACLMapping {
  ACLGranteeTypeEnum type{ACL_TYPE_CANON_USER};
  string source_id;
  string dest_id;

  ACLMapping() = default;

  ACLMapping(ACLGranteeTypeEnum t,
             const string& s,
             const string& d) : type(t),
  source_id(s),
  dest_id(d) {}

  // parse a mapping from json config; unrecognized "type" strings fall
  // back to canonical-user
  void init(const JSONFormattable& config) {
    const string& t = config["type"];

    if (t == "email") {
      type = ACL_TYPE_EMAIL_USER;
    } else if (t == "uri") {
      type = ACL_TYPE_GROUP;
    } else {
      type = ACL_TYPE_CANON_USER;
    }

    source_id = config["source_id"];
    dest_id = config["dest_id"];
  }

  // emit the mapping back as json (inverse of init())
  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
    Formatter::ObjectSection os(jf, "acl_mapping");
    string s;
    switch (type) {
      case ACL_TYPE_EMAIL_USER:
        s = "email";
        break;
      case ACL_TYPE_GROUP:
        s = "uri";
        break;
      default:
        s = "id";
        break;
    }
    encode_json("type", s, &jf);
    encode_json("source_id", source_id, &jf);
    encode_json("dest_id", dest_id, &jf);
  }
};
+
// set of acl grantee translations, indexed by source id
struct ACLMappings {
  map<string, ACLMapping> acl_mappings;

  // parse an array of mapping entries from json config
  void init(const JSONFormattable& config) {
    for (auto& c : config.array()) {
      ACLMapping m;
      m.init(c);

      acl_mappings.emplace(std::make_pair(m.source_id, m));
    }
  }
  // emit the mappings back as a json array (inverse of init())
  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
    Formatter::ArraySection os(jf, "acls");

    for (auto& i : acl_mappings) {
      i.second.dump_conf(cct, jf);
    }
  }
};
+
// named acl-mapping profiles ("acl_profiles" config section), shared by
// id between sync profiles
struct AWSSyncConfig_ACLProfiles {
  map<string, std::shared_ptr<ACLMappings> > acl_profiles;

  // parse an array of { "id", "acls": [...] } entries
  void init(const JSONFormattable& config) {
    for (auto& c : config.array()) {
      const string& profile_id = c["id"];

      std::shared_ptr<ACLMappings> ap{new ACLMappings};
      ap->init(c["acls"]);

      acl_profiles[profile_id] = ap;
    }
  }
  // emit the profiles back as json (inverse of init())
  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
    Formatter::ArraySection section(jf, "acl_profiles");

    for (auto& p : acl_profiles) {
      Formatter::ObjectSection section(jf, "profile");
      encode_json("id", p.first, &jf);
      p.second->dump_conf(cct, jf);
    }
  }

  // copy the mappings for 'profile_id' into *result; false if unknown
  bool find(const string& profile_id, ACLMappings *result) const {
    auto iter = acl_profiles.find(profile_id);
    if (iter == acl_profiles.end()) {
      return false;
    }
    *result = *iter->second;
    return true;
  }
};
+
// connection settings for a remote S3 endpoint. the has_* flags record
// which fields were explicitly configured, so unset fields can later be
// inherited from the default profile (see AWSSyncConfig::init_profile)
struct AWSSyncConfig_Connection {
  string connection_id;
  string endpoint;
  RGWAccessKey key;
  std::optional<string> region;
  HostStyle host_style{PathStyle};

  bool has_endpoint{false};
  bool has_key{false};
  bool has_host_style{false};

  // parse a "connection" json section; anything but "virtual" selects
  // path-style addressing
  void init(const JSONFormattable& config) {
    has_endpoint = config.exists("endpoint");
    has_key = config.exists("access_key") || config.exists("secret");
    has_host_style = config.exists("host_style");

    connection_id = config["id"];
    endpoint = config["endpoint"];

    key = RGWAccessKey(config["access_key"], config["secret"]);

    if (config.exists("region")) {
      region = config["region"];
    } else {
      region.reset();
    }

    string host_style_str = config["host_style"];
    if (host_style_str != "virtual") {
      host_style = PathStyle;
    } else {
      host_style = VirtualStyle;
    }
  }
  // emit the connection back as json; the secret is masked, so this
  // output is safe to log but not round-trippable
  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
    Formatter::ObjectSection section(jf, "connection");
    encode_json("id", connection_id, &jf);
    encode_json("endpoint", endpoint, &jf);
    string s = (host_style == PathStyle ? "path" : "virtual");
    encode_json("region", region, &jf);
    encode_json("host_style", s, &jf);

    {
      Formatter::ObjectSection os(jf, "key");
      encode_json("access_key", key.id, &jf);
      string secret = (key.key.empty() ? "" : "******");
      encode_json("secret", secret, &jf);
    }
  }
};
+
// parse config[key] as a base-10 uint64 into *pval. *pval is left
// untouched when the key is absent; returns -EINVAL on a malformed value
static int conf_to_uint64(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, const string& key, uint64_t *pval)
{
  string sval;
  if (config.find(key, &sval)) {
    string err;
    uint64_t val = strict_strtoll(sval.c_str(), 10, &err);
    if (!err.empty()) {
      ldpp_dout(dpp, 0) << "ERROR: could not parse configurable value for cloud sync module: " << key << ": " << sval << dendl;
      return -EINVAL;
    }
    *pval = val;
  }
  return 0;
}
+
// S3 transfer tuning: objects larger than multipart_sync_threshold are
// uploaded via multipart, with parts of at least multipart_min_part_size
struct AWSSyncConfig_S3 {
  uint64_t multipart_sync_threshold{DEFAULT_MULTIPART_SYNC_PART_SIZE};
  uint64_t multipart_min_part_size{DEFAULT_MULTIPART_SYNC_PART_SIZE};

  // parse optional overrides from json config; returns -EINVAL on a
  // malformed number
  int init(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) {
    int r = conf_to_uint64(dpp, cct, config, "multipart_sync_threshold", &multipart_sync_threshold);
    if (r < 0) {
      return r;
    }

    r = conf_to_uint64(dpp, cct, config, "multipart_min_part_size", &multipart_min_part_size);
    if (r < 0) {
      return r;
    }
// clamp to 5 MiB — matches S3's documented minimum multipart part size
#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024)
    if (multipart_min_part_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
      multipart_min_part_size = MULTIPART_MIN_POSSIBLE_PART_SIZE;
    }
    return 0;
  }

  // emit the effective settings back as json
  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
    Formatter::ObjectSection section(jf, "s3");
    encode_json("multipart_sync_threshold", multipart_sync_threshold, &jf);
    encode_json("multipart_min_part_size", multipart_min_part_size, &jf);
  }
};
+
// one sync profile: which source bucket(s) it matches and how objects
// are written to the remote endpoint. connection/acls can be given
// inline or referenced by id (resolved in AWSSyncConfig::init_profile)
struct AWSSyncConfig_Profile {
  string source_bucket;
  bool prefix{false};      // true when source_bucket ended with '*'
  string target_path;
  string connection_id;
  string acls_id;

  std::shared_ptr<AWSSyncConfig_Connection> conn_conf;
  std::shared_ptr<ACLMappings> acls;

  std::shared_ptr<RGWRESTConn> conn;  // populated later from conn_conf

  // parse a profile json section; a trailing '*' in source_bucket turns
  // it into a prefix match (the '*' itself is stripped)
  void init(const JSONFormattable& config) {
    source_bucket = config["source_bucket"];

    prefix = (!source_bucket.empty() && source_bucket[source_bucket.size() - 1] == '*');

    if (prefix) {
      source_bucket = source_bucket.substr(0, source_bucket.size() - 1);
    }

    target_path = config["target_path"];
    connection_id = config["connection_id"];
    acls_id = config["acls_id"];

    if (config.exists("connection")) {
      conn_conf = make_shared<AWSSyncConfig_Connection>();
      conn_conf->init(config["connection"]);
    }

    if (config.exists("acls")) {
      acls = make_shared<ACLMappings>();
      acls->init(config["acls"]);
    }
  }

  // emit the profile back as json (inverse of init(); re-appends the
  // '*' for prefix matches)
  void dump_conf(CephContext *cct, JSONFormatter& jf, const char *section = "config") const {
    Formatter::ObjectSection config(jf, section);
    string sb{source_bucket};
    if (prefix) {
      sb.append("*");
    }
    encode_json("source_bucket", sb, &jf);
    encode_json("target_path", target_path, &jf);
    encode_json("connection_id", connection_id, &jf);
    encode_json("acls_id", acls_id, &jf);
    if (conn_conf.get()) {
      conn_conf->dump_conf(cct, jf);
    }
    if (acls.get()) {
      acls->dump_conf(cct, jf);
    }
  }
};
+
// Replace every occurrence of 'find' in 'src' with 'replace', storing the
// result in *dest. After each substitution, scanning resumes at the
// original match position plus find.size() within the rewritten string
// (preserving the historical resume-offset behavior of this helper).
static void find_and_replace(const std::string& src, const std::string& find, const std::string& replace, std::string *dest)
{
  std::string result = src;

  for (std::size_t at = result.find(find); at != std::string::npos; ) {
    const std::size_t resume = at + find.size();
    result.replace(at, find.size(), replace);
    at = result.find(find, resume);
  }

  *dest = result;
}
+
+static void apply_meta_param(const string& src, const string& param, const string& val, string *dest)
+{
+  string s = string("${") + param + "}";
+  find_and_replace(src, s, val, dest);
+}
+
+
+struct AWSSyncConfig {
+  AWSSyncConfig_Profile default_profile;
+  std::shared_ptr<AWSSyncConfig_Profile> root_profile;
+
+  map<string, std::shared_ptr<AWSSyncConfig_Connection> > connections;
+  AWSSyncConfig_ACLProfiles acl_profiles;
+
+  map<string, std::shared_ptr<AWSSyncConfig_Profile> > explicit_profiles;
+
+  AWSSyncConfig_S3 s3;
+
  // Resolve a parsed profile against the shared config: connection and
  // acls references are looked up by id, unset fields inherit from the
  // default profile, and target_path falls back to the built-in default.
  // Returns -EINVAL on ambiguous or dangling references, or when
  // connection_must_exist and no connection could be resolved.
  int init_profile(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& profile_conf, AWSSyncConfig_Profile& profile,
                   bool connection_must_exist) {
    if (!profile.connection_id.empty()) {
      // an explicit connection_id excludes an inline "connection" section
      if (profile.conn_conf) {
        ldpp_dout(dpp, 0) << "ERROR: ambiguous profile connection configuration, connection_id=" << profile.connection_id << dendl;
        return -EINVAL;
      }
      if (connections.find(profile.connection_id) == connections.end()) {
        ldpp_dout(dpp, 0) << "ERROR: profile configuration reference non-existent connection_id=" << profile.connection_id << dendl;
        return -EINVAL;
      }
      profile.conn_conf = connections[profile.connection_id];
    } else if (!profile.conn_conf) {
      // no connection given at all: fall back to the default profile's
      profile.connection_id = default_profile.connection_id;
      auto i = connections.find(profile.connection_id);
      if (i != connections.end()) {
        profile.conn_conf = i->second;
      }
    }

    if (connection_must_exist && !profile.conn_conf) {
      ldpp_dout(dpp, 0) << "ERROR: remote connection undefined for sync profile" << dendl;
      return -EINVAL;
    }

    // inherit any connection fields the profile didn't set explicitly
    if (profile.conn_conf && default_profile.conn_conf) {
      if (!profile.conn_conf->has_endpoint) {
        profile.conn_conf->endpoint = default_profile.conn_conf->endpoint;
      }
      if (!profile.conn_conf->has_host_style) {
        profile.conn_conf->host_style = default_profile.conn_conf->host_style;
      }
      if (!profile.conn_conf->has_key) {
        profile.conn_conf->key = default_profile.conn_conf->key;
      }
    }

    ACLMappings acl_mappings;

    // resolve acls: explicit acls_id first, then the default profile's
    if (!profile.acls_id.empty()) {
      if (!acl_profiles.find(profile.acls_id, &acl_mappings)) {
        ldpp_dout(dpp, 0) << "ERROR: profile configuration reference non-existent acls id=" << profile.acls_id << dendl;
        return -EINVAL;
      }
      profile.acls = acl_profiles.acl_profiles[profile.acls_id];
    } else if (!profile.acls) {
      if (default_profile.acls) {
        profile.acls = default_profile.acls;
        profile.acls_id = default_profile.acls_id;
      }
    }

    // target_path fallback chain: profile -> default profile -> built-in
    if (profile.target_path.empty()) {
      profile.target_path = default_profile.target_path;
    }
    if (profile.target_path.empty()) {
      profile.target_path = default_target_path;
    }

    return 0;
  }
+
+  int init_target(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& profile_conf, std::shared_ptr<AWSSyncConfig_Profile> *ptarget) {
+    std::shared_ptr<AWSSyncConfig_Profile> profile;
+    profile.reset(new AWSSyncConfig_Profile);
+    profile->init(profile_conf);
+
+    int ret = init_profile(dpp, cct, profile_conf, *profile, true);
+    if (ret < 0) {
+      return ret;
+    }
+
+    auto& sb = profile->source_bucket;
+
+    if (explicit_profiles.find(sb) != explicit_profiles.end()) {
+      ldpp_dout(dpp, 0) << "WARNING: duplicate target configuration in sync module" << dendl;
+    }
+
+    explicit_profiles[sb] = profile;
+    if (ptarget) {
+      *ptarget = profile;
+    }
+    return 0;
+  }
+
+  bool do_find_profile(const rgw_bucket bucket, std::shared_ptr<AWSSyncConfig_Profile> *result) {
+    const string& name = bucket.name;
+    auto iter = explicit_profiles.upper_bound(name);
+    if (iter == explicit_profiles.begin()) {
+      return false;
+    }
+
+    --iter;
+    if (iter->first.size() > name.size()) {
+      return false;
+    }
+    if (name.compare(0, iter->first.size(), iter->first) != 0) {
+      return false;
+    }
+
+    std::shared_ptr<AWSSyncConfig_Profile>& target = iter->second;
+
+    if (!target->prefix &&
+        name.size() != iter->first.size()) {
+      return false;
+    }
+
+    *result = target;
+    return true;
+  }
+
+  void find_profile(const rgw_bucket bucket, std::shared_ptr<AWSSyncConfig_Profile> *result) {
+    if (!do_find_profile(bucket, result)) {
+      *result = root_profile;
+    }
+  }
+
+  AWSSyncConfig() {}
+
+  int init(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) {
+    auto& default_conf = config["default"];
+
+    if (config.exists("default")) {
+      default_profile.init(default_conf);
+      init_profile(dpp, cct, default_conf, default_profile, false);
+    }
+
+    for (auto& conn : config["connections"].array()) {
+      auto new_conn = conn;
+
+      std::shared_ptr<AWSSyncConfig_Connection> c{new AWSSyncConfig_Connection};
+      c->init(new_conn);
+
+      connections[new_conn["id"]] = c;
+    }
+
+    acl_profiles.init(config["acl_profiles"]);
+
+    int r = s3.init(dpp, cct, config["s3"]);
+    if (r < 0) {
+      return r;
+    }
+
+    auto new_root_conf = config;
+
+    r = init_target(dpp, cct, new_root_conf, &root_profile); /* the root profile config */
+    if (r < 0) {
+      return r;
+    }
+
+    for (auto target_conf : config["profiles"].array()) {
+      int r = init_target(dpp, cct, target_conf, nullptr);
+      if (r < 0) {
+        return r;
+      }
+    }
+
+    JSONFormatter jf(true);
+    dump_conf(cct, jf);
+    stringstream ss;
+    jf.flush(ss);
+
+    ldpp_dout(dpp, 5) << "sync module config (parsed representation):\n" << ss.str() << dendl;
+
+    return 0;
+  }
+
+  void expand_target(RGWDataSyncCtx *sc, const string& sid, const string& path, string *dest) {
+      apply_meta_param(path, "sid", sid, dest);
+
+      const RGWZoneGroup& zg = sc->env->svc->zone->get_zonegroup();
+      apply_meta_param(path, "zonegroup", zg.get_name(), dest);
+      apply_meta_param(path, "zonegroup_id", zg.get_id(), dest);
+
+      const RGWZone& zone = sc->env->svc->zone->get_zone();
+      apply_meta_param(path, "zone", zone.name, dest);
+      apply_meta_param(path, "zone_id", zone.id, dest);
+  }
+
+  void update_config(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, const string& sid) {
+    expand_target(sc, sid, root_profile->target_path, &root_profile->target_path);
+    ldpp_dout(dpp, 20) << "updated target: (root) -> " << root_profile->target_path << dendl;
+    for (auto& t : explicit_profiles) {
+      expand_target(sc, sid, t.second->target_path, &t.second->target_path);
+      ldpp_dout(dpp, 20) << "updated target: " << t.first << " -> " << t.second->target_path << dendl;
+    }
+  }
+
+  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+    Formatter::ObjectSection config(jf, "config");
+    root_profile->dump_conf(cct, jf);
+    jf.open_array_section("connections");
+    for (auto c : connections) {
+      c.second->dump_conf(cct, jf);
+    }
+    jf.close_section();
+
+    acl_profiles.dump_conf(cct, jf);
+
+    { // targets
+      Formatter::ArraySection as(jf, "profiles");
+      for (auto& t : explicit_profiles) {
+        Formatter::ObjectSection target_section(jf, "profile");
+        encode_json("name", t.first, &jf);
+        t.second->dump_conf(cct, jf);
+      }
+    }
+  }
+
+  string get_path(std::shared_ptr<AWSSyncConfig_Profile>& profile,
+                  const RGWBucketInfo& bucket_info,
+                  const rgw_obj_key& obj) {
+    string bucket_str;
+    string owner;
+    if (!bucket_info.owner.tenant.empty()) {
+      bucket_str = owner = bucket_info.owner.tenant + "-";
+      owner += bucket_info.owner.id;
+    }
+    bucket_str += bucket_info.bucket.name;
+
+    const string& path = profile->target_path;
+
+    string new_path;
+    apply_meta_param(path, "bucket", bucket_str, &new_path);
+    apply_meta_param(new_path, "owner", owner, &new_path);
+
+    new_path += string("/") + get_key_oid(obj);
+
+    return new_path;
+  }
+
+  void get_target(std::shared_ptr<AWSSyncConfig_Profile>& profile,
+                  const RGWBucketInfo& bucket_info,
+                  const rgw_obj_key& obj,
+                  string *bucket_name,
+                  string *obj_name) {
+    string path = get_path(profile, bucket_info, obj);
+    size_t pos = path.find('/');
+
+    *bucket_name = path.substr(0, pos);
+    *obj_name = path.substr(pos + 1);
+  }
+
+  void init_conns(RGWDataSyncCtx *sc, const string& id) {
+    auto sync_env = sc->env;
+
+    update_config(sync_env->dpp, sc, id);
+
+    auto& root_conf = root_profile->conn_conf;
+
+    root_profile->conn.reset(new S3RESTConn(sc->cct,
+                                           id,
+                                           { root_conf->endpoint },
+                                           root_conf->key,
+                                          sync_env->svc->zone->get_zonegroup().get_id(),
+                                           root_conf->region,
+                                           root_conf->host_style));
+
+    for (auto i : explicit_profiles) {
+      auto& c = i.second;
+
+      c->conn.reset(new S3RESTConn(sc->cct,
+                                   id,
+                                   { c->conn_conf->endpoint },
+                                   c->conn_conf->key,
+                                  sync_env->svc->zone->get_zonegroup().get_id(),
+                                   c->conn_conf->region,
+                                   c->conn_conf->host_style));
+    }
+  }
+};
+
+
+struct AWSSyncInstanceEnv {
+  AWSSyncConfig conf;
+  string id;
+
+  explicit AWSSyncInstanceEnv(AWSSyncConfig& _conf) : conf(_conf) {}
+
+  void init(RGWDataSyncCtx *sc, uint64_t instance_id) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%llx", (unsigned long long)instance_id);
+    id = buf;
+
+    conf.init_conns(sc, id);
+  }
+
+  void get_profile(const rgw_bucket& bucket, std::shared_ptr<AWSSyncConfig_Profile> *ptarget) {
+    conf.find_profile(bucket, ptarget);
+    ceph_assert(ptarget);
+  }
+};
+
+static int do_decode_rest_obj(const DoutPrefixProvider *dpp, CephContext *cct, map<string, bufferlist>& attrs, map<string, string>& headers, rgw_rest_obj *info)
+{
+  for (auto header : headers) {
+    const string& val = header.second;
+    if (header.first == "RGWX_OBJECT_SIZE") {
+      info->content_len = atoi(val.c_str());
+    } else {
+      info->attrs[header.first] = val;
+    }
+  }
+
+  info->acls.set_ctx(cct);
+  auto aiter = attrs.find(RGW_ATTR_ACL);
+  if (aiter != attrs.end()) {
+    bufferlist& bl = aiter->second;
+    auto bliter = bl.cbegin();
+    try {
+      info->acls.decode(bliter);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to decode policy off attrs" << dendl;
+      return -EIO;
+    }
+  } else {
+    ldpp_dout(dpp, 0) << "WARNING: acl attrs not provided" << dendl;
+  }
+
+  return 0;
+}
+
/* Read side of an object transfer: streams a GET of src_obj from the
 * source zone over `conn`, asking the peer to prepend rgw-specific
 * metadata (the attrs JSON "extra data") so the object's attributes and
 * ACL can be reconstructed on this end. */
class RGWRESTStreamGetCRF : public RGWStreamReadHTTPResourceCRF
{
  RGWDataSyncCtx *sc;
  RGWRESTConn *conn;          // connection to the source zone (non-owning)
  rgw::sal::Object* src_obj;  // object being fetched (non-owning)
  RGWRESTConn::get_obj_params req_params;

  rgw_sync_aws_src_obj_properties src_properties;  // recorded mtime/etag/version info of the source
public:
  RGWRESTStreamGetCRF(CephContext *_cct,
                               RGWCoroutinesEnv *_env,
                               RGWCoroutine *_caller,
                               RGWDataSyncCtx *_sc,
                               RGWRESTConn *_conn,
                               rgw::sal::Object* _src_obj,
                               const rgw_sync_aws_src_obj_properties& _src_properties) : RGWStreamReadHTTPResourceCRF(_cct, _env, _caller,
                                                                                                                      _sc->env->http_manager, _src_obj->get_key()),
                                                                                 sc(_sc), conn(_conn), src_obj(_src_obj),
                                                                                 src_properties(_src_properties) {
  }

  // Prepare the GET request -- conditional on the recorded source object
  // properties and honoring any byte range set on this CRF -- and hand it
  // to the base class.  Returns 0 or a negative error from conn->get_obj().
  int init(const DoutPrefixProvider *dpp) override {
    /* init input connection */


    req_params.get_op = true;
    req_params.prepend_metadata = true;

    // conditions taken from the source object's recorded properties
    req_params.unmod_ptr = &src_properties.mtime;
    req_params.etag = src_properties.etag;
    req_params.mod_zone_id = src_properties.zone_short_id;
    req_params.mod_pg_ver = src_properties.pg_ver;

    // propagate a part/range request (set via set_range() by multipart sync)
    if (range.is_set) {
      req_params.range_is_set = true;
      req_params.range_start = range.ofs;
      req_params.range_end = range.ofs + range.size - 1;
    }

    RGWRESTStreamRWRequest *in_req;
    int ret = conn->get_obj(dpp, src_obj, req_params, false /* send */, &in_req);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): conn->get_obj() returned ret=" << ret << dendl;
      return ret;
    }

    set_req(in_req);

    return RGWStreamReadHTTPResourceCRF::init(dpp);
  }

  // Build rest_obj from the response headers plus the JSON-encoded attrs
  // carried in extra_data (requested via need_extra_data()).
  int decode_rest_obj(const DoutPrefixProvider *dpp, map<string, string>& headers, bufferlist& extra_data) override {
    map<string, bufferlist> src_attrs;

    ldpp_dout(dpp, 20) << __func__ << ":" << " headers=" << headers << " extra_data.length()=" << extra_data.length() << dendl;

    if (extra_data.length() > 0) {
      JSONParser jp;
      if (!jp.parse(extra_data.c_str(), extra_data.length())) {
        ldpp_dout(dpp, 0) << "ERROR: failed to parse response extra data. len=" << extra_data.length() << " data=" << extra_data.c_str() << dendl;
        return -EIO;
      }

      JSONDecoder::decode_json("attrs", src_attrs, &jp);
    }
    return do_decode_rest_obj(dpp, sc->cct, src_attrs, headers, &rest_obj);
  }

  // Request the extra-data (attrs JSON) payload along with the object body.
  bool need_extra_data() override {
    return true;
  }
};
+
/* Attribute names (rgw internal header format) that are preserved when
 * copying an object to the cloud; everything else is dropped unless it
 * starts with "X_AMZ_" -- see RGWAWSStreamPutCRF::keep_attr(). */
static std::set<string> keep_headers = { "CONTENT_TYPE",
                                         "CONTENT_ENCODING",
                                         "CONTENT_DISPOSITION",
                                         "CONTENT_LANGUAGE" };
+
+class RGWAWSStreamPutCRF : public RGWStreamWriteHTTPResourceCRF
+{
+  RGWDataSyncCtx *sc;
+  rgw_sync_aws_src_obj_properties src_properties;
+  std::shared_ptr<AWSSyncConfig_Profile> target;
+  rgw::sal::Object* dest_obj;
+  string etag;
+public:
+  RGWAWSStreamPutCRF(CephContext *_cct,
+                               RGWCoroutinesEnv *_env,
+                               RGWCoroutine *_caller,
+                               RGWDataSyncCtx *_sc,
+                               const rgw_sync_aws_src_obj_properties&  _src_properties,
+                               std::shared_ptr<AWSSyncConfig_Profile>& _target,
+                               rgw::sal::Object* _dest_obj) : RGWStreamWriteHTTPResourceCRF(_cct, _env, _caller, _sc->env->http_manager),
+                                                     sc(_sc), src_properties(_src_properties), target(_target), dest_obj(_dest_obj) {
+  }
+
+  int init() override {
+    /* init output connection */
+    RGWRESTStreamS3PutObj *out_req{nullptr};
+
+    if (multipart.is_multipart) {
+      char buf[32];
+      snprintf(buf, sizeof(buf), "%d", multipart.part_num);
+      rgw_http_param_pair params[] = { { "uploadId", multipart.upload_id.c_str() },
+                                       { "partNumber", buf },
+                                       { nullptr, nullptr } };
+      target->conn->put_obj_send_init(dest_obj, params, &out_req);
+    } else {
+      target->conn->put_obj_send_init(dest_obj, nullptr, &out_req);
+    }
+
+    set_req(out_req);
+
+    return RGWStreamWriteHTTPResourceCRF::init();
+  }
+
+  static bool keep_attr(const string& h) {
+    return (keep_headers.find(h) != keep_headers.end() ||
+            boost::algorithm::starts_with(h, "X_AMZ_"));
+  }
+
+  static void init_send_attrs(const DoutPrefixProvider *dpp,
+                              CephContext *cct,
+                              const rgw_rest_obj& rest_obj,
+                              const rgw_sync_aws_src_obj_properties& src_properties,
+                              const AWSSyncConfig_Profile *target,
+                              map<string, string> *attrs) {
+    auto& new_attrs = *attrs;
+
+    new_attrs.clear();
+
+    for (auto& hi : rest_obj.attrs) {
+      if (keep_attr(hi.first)) {
+        new_attrs.insert(hi);
+      }
+    }
+
+    auto acl = rest_obj.acls.get_acl();
+
+    map<int, vector<string> > access_map;
+
+    if (target->acls) {
+      for (auto& grant : acl.get_grant_map()) {
+        auto& orig_grantee = grant.first;
+        auto& perm = grant.second;
+
+        string grantee;
+
+        const auto& am = target->acls->acl_mappings;
+
+        auto iter = am.find(orig_grantee);
+        if (iter == am.end()) {
+          ldpp_dout(dpp, 20) << "acl_mappings: Could not find " << orig_grantee << " .. ignoring" << dendl;
+          continue;
+        }
+
+        grantee = iter->second.dest_id;
+
+        string type;
+
+        switch (iter->second.type) {
+          case ACL_TYPE_CANON_USER:
+            type = "id";
+            break;
+          case ACL_TYPE_EMAIL_USER:
+            type = "emailAddress";
+            break;
+          case ACL_TYPE_GROUP:
+            type = "uri";
+            break;
+          default:
+            continue;
+        }
+
+        string tv = type + "=" + grantee;
+
+        int flags = perm.get_permission().get_permissions();
+        if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) {
+          access_map[flags].push_back(tv);
+          continue;
+        }
+
+        for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) {
+          if (flags & i) {
+            access_map[i].push_back(tv);
+          }
+        }
+      }
+    }
+
+    for (auto aiter : access_map) {
+      int grant_type = aiter.first;
+
+      string header_str("x-amz-grant-");
+
+      switch (grant_type) {
+        case RGW_PERM_READ:
+          header_str.append("read");
+          break;
+        case RGW_PERM_WRITE:
+          header_str.append("write");
+          break;
+        case RGW_PERM_READ_ACP:
+          header_str.append("read-acp");
+          break;
+        case RGW_PERM_WRITE_ACP:
+          header_str.append("write-acp");
+          break;
+        case RGW_PERM_FULL_CONTROL:
+          header_str.append("full-control");
+          break;
+      }
+
+      string s;
+
+      for (auto viter : aiter.second) {
+        if (!s.empty()) {
+          s.append(", ");
+        }
+        s.append(viter);
+      }
+
+      ldpp_dout(dpp, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl;
+
+      new_attrs[header_str] = s;
+    }
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%llu", (long long)src_properties.versioned_epoch);
+    new_attrs["x-amz-meta-rgwx-versioned-epoch"] = buf;
+
+    utime_t ut(src_properties.mtime);
+    snprintf(buf, sizeof(buf), "%lld.%09lld",
+             (long long)ut.sec(),
+             (long long)ut.nsec());
+
+    new_attrs["x-amz-meta-rgwx-source-mtime"] = buf;
+    new_attrs["x-amz-meta-rgwx-source-etag"] = src_properties.etag;
+    new_attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name;
+    if (!rest_obj.key.instance.empty()) {
+      new_attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance;
+    }
+  }
+
+  void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) override {
+    RGWRESTStreamS3PutObj *r = static_cast<RGWRESTStreamS3PutObj *>(req);
+
+    map<string, string> new_attrs;
+    if (!multipart.is_multipart) {
+      init_send_attrs(dpp, sc->cct, rest_obj, src_properties, target.get(), &new_attrs);
+    }
+
+    r->set_send_length(rest_obj.content_len);
+
+    RGWAccessControlPolicy policy;
+
+    r->send_ready(dpp, target->conn->get_key(), new_attrs, policy);
+  }
+
+  void handle_headers(const map<string, string>& headers) {
+    for (auto h : headers) {
+      if (h.first == "ETAG") {
+        etag = h.second;
+      }
+    }
+  }
+
+  bool get_etag(string *petag) {
+    if (etag.empty()) {
+      return false;
+    }
+    *petag = etag;
+    return true;
+  }
+};
+
+
/* Copy an entire object to the cloud with a single streaming PUT: splice
 * a streaming GET from the source zone (RGWRESTStreamGetCRF) into a
 * streaming PUT to the target profile's connection (RGWAWSStreamPutCRF). */
class RGWAWSStreamObjToCloudPlainCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWRESTConn *source_conn;                       // connection to the source zone
  std::shared_ptr<AWSSyncConfig_Profile> target;  // destination profile (connection + path)
  rgw::sal::Object* src_obj;
  rgw::sal::Object* dest_obj;

  rgw_sync_aws_src_obj_properties src_properties;

  std::shared_ptr<RGWStreamReadHTTPResourceCRF> in_crf;
  std::shared_ptr<RGWStreamWriteHTTPResourceCRF> out_crf;

public:
  RGWAWSStreamObjToCloudPlainCR(RGWDataSyncCtx *_sc,
                                RGWRESTConn *_source_conn,
                                rgw::sal::Object* _src_obj,
                                const rgw_sync_aws_src_obj_properties& _src_properties,
                                std::shared_ptr<AWSSyncConfig_Profile> _target,
                                rgw::sal::Object* _dest_obj) : RGWCoroutine(_sc->cct),
                                                   sc(_sc),
                                                   source_conn(_source_conn),
                                                   target(_target),
                                                   src_obj(_src_obj),
                                                   dest_obj(_dest_obj),
                                                   src_properties(_src_properties) {}

  // Coroutine body (boost reenter): set up the read/write CRFs, then run
  // the splice to completion.
  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      /* init input */
      in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sc,
                                           source_conn, src_obj,
                                           src_properties));

      /* init output */
      out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sc,
                                           src_properties, target, dest_obj));

      // stream the object body from input to output
      yield call(new RGWStreamSpliceCR(cct, sc->env->http_manager, in_crf, out_crf));
      if (retcode < 0) {
        return set_cr_error(retcode);
      }

      return set_cr_done();
    }

    return 0;
  }
};
+
/* Upload one part of a multipart upload: splice a *ranged* streaming GET
 * of the source object into a streaming PUT of part part_info.part_num
 * under upload_id, returning the uploaded part's ETag via *petag. */
class RGWAWSStreamObjToCloudMultipartPartCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWRESTConn *source_conn;
  std::shared_ptr<AWSSyncConfig_Profile> target;
  rgw::sal::Object* src_obj;
  rgw::sal::Object* dest_obj;

  rgw_sync_aws_src_obj_properties src_properties;

  string upload_id;   // multipart upload this part belongs to

  rgw_sync_aws_multipart_part_info part_info;   // part number, offset and size

  std::shared_ptr<RGWStreamReadHTTPResourceCRF> in_crf;
  std::shared_ptr<RGWStreamWriteHTTPResourceCRF> out_crf;

  string *petag;   // out: ETag of the uploaded part

public:
  RGWAWSStreamObjToCloudMultipartPartCR(RGWDataSyncCtx *_sc,
                                RGWRESTConn *_source_conn,
                                rgw::sal::Object* _src_obj,
                                std::shared_ptr<AWSSyncConfig_Profile>& _target,
                                rgw::sal::Object* _dest_obj,
                                const rgw_sync_aws_src_obj_properties& _src_properties,
                                const string& _upload_id,
                                const rgw_sync_aws_multipart_part_info& _part_info,
                                string *_petag) : RGWCoroutine(_sc->cct),
                                                   sc(_sc),
                                                   source_conn(_source_conn),
                                                   target(_target),
                                                   src_obj(_src_obj),
                                                   dest_obj(_dest_obj),
                                                   src_properties(_src_properties),
                                                   upload_id(_upload_id),
                                                   part_info(_part_info),
                                                   petag(_petag) {}

  // Coroutine body (boost reenter): ranged GET -> multipart-part PUT,
  // then extract the part's ETag from the PUT response.
  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      /* init input */
      in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sc,
                                           source_conn, src_obj,
                                           src_properties));

      // fetch only this part's byte range from the source
      in_crf->set_range(part_info.ofs, part_info.size);

      /* init output */
      out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sc,
                                           src_properties, target, dest_obj));

      out_crf->set_multipart(upload_id, part_info.part_num, part_info.size);

      yield call(new RGWStreamSpliceCR(cct, sc->env->http_manager, in_crf, out_crf));
      if (retcode < 0) {
        return set_cr_error(retcode);
      }

      // the ETag is required later by CompleteMultipartUpload
      if (!(static_cast<RGWAWSStreamPutCRF *>(out_crf.get()))->get_etag(petag)) {
        ldpp_dout(dpp, 0) << "ERROR: failed to get etag from PUT request" << dendl;
        return set_cr_error(-EIO);
      }

      return set_cr_done();
    }

    return 0;
  }
};
+
/* Abort a multipart upload on the destination: a DELETE on the object's
 * AWS path with the uploadId query parameter. */
class RGWAWSAbortMultipartCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWRESTConn *dest_conn;      // connection to the destination endpoint
  rgw::sal::Object* dest_obj;

  string upload_id;            // upload to abort

public:
  RGWAWSAbortMultipartCR(RGWDataSyncCtx *_sc,
                        RGWRESTConn *_dest_conn,
                        rgw::sal::Object* _dest_obj,
                        const string& _upload_id) : RGWCoroutine(_sc->cct),
                                                   sc(_sc),
                                                   dest_conn(_dest_conn),
                                                   dest_obj(_dest_obj),
                                                   upload_id(_upload_id) {}

  // Coroutine body (boost reenter): issue the DELETE and propagate errors.
  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {

      yield {
        rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
        bufferlist bl;
        call(new RGWDeleteRESTResourceCR(sc->cct, dest_conn, sc->env->http_manager,
                                         obj_to_aws_path(dest_obj), params));
      }

      if (retcode < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (retcode=" << retcode << ")" << dendl;
        return set_cr_error(retcode);
      }

      return set_cr_done();
    }

    return 0;
  }
};
+
/* Start a multipart upload on the destination: POST ?uploads with the
 * prepared attrs, parse the InitiateMultipartUploadResult XML response,
 * and return the new upload id via *upload_id. */
class RGWAWSInitMultipartCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWRESTConn *dest_conn;
  rgw::sal::Object* dest_obj;

  uint64_t obj_size;   // NOTE(review): stored but not referenced by this CR
  map<string, string> attrs;   // headers sent with the init request

  bufferlist out_bl;   // raw XML response body

  string *upload_id;   // out: upload id assigned by the destination

  // decoded form of the InitiateMultipartUploadResult response document
  struct InitMultipartResult {
    string bucket;
    string key;
    string upload_id;

    void decode_xml(XMLObj *obj) {
      RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
      RGWXMLDecoder::decode_xml("Key", key, obj);
      RGWXMLDecoder::decode_xml("UploadId", upload_id, obj);
    }
  } result;

public:
  RGWAWSInitMultipartCR(RGWDataSyncCtx *_sc,
                        RGWRESTConn *_dest_conn,
                        rgw::sal::Object* _dest_obj,
                        uint64_t _obj_size,
                        const map<string, string>& _attrs,
                        string *_upload_id) : RGWCoroutine(_sc->cct),
                                                   sc(_sc),
                                                   dest_conn(_dest_conn),
                                                   dest_obj(_dest_obj),
                                                   obj_size(_obj_size),
                                                   attrs(_attrs),
                                                   upload_id(_upload_id) {}

  // Coroutine body (boost reenter): POST the init request, then parse the
  // XML reply for the upload id.
  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {

      yield {
        rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} };
        bufferlist bl;
        call(new RGWPostRawRESTResourceCR <bufferlist> (sc->cct, dest_conn, sc->env->http_manager,
                                                 obj_to_aws_path(dest_obj), params, &attrs, bl, &out_bl));
      }

      if (retcode < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl;
        return set_cr_error(retcode);
      }
      {
        /*
         * If one of the following fails we cannot abort upload, as we cannot
         * extract the upload id. If one of these fail it's very likely that that's
         * the least of our problem.
         */
        RGWXMLDecoder::XMLParser parser;
        if (!parser.init()) {
          ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
          return set_cr_error(-EIO);
        }

        if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
          string str(out_bl.c_str(), out_bl.length());
          ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
          return set_cr_error(-EIO);
        }

        try {
          RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true);
        } catch (RGWXMLDecoder::err& err) {
          string str(out_bl.c_str(), out_bl.length());
          ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
          return set_cr_error(-EIO);
        }
      }

      ldpp_dout(dpp, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl;

      *upload_id = result.upload_id;

      return set_cr_done();
    }

    return 0;
  }
};
+
+class RGWAWSCompleteMultipartCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWRESTConn *dest_conn;
+  rgw::sal::Object* dest_obj;
+
+  bufferlist out_bl;
+
+  string upload_id;
+
+  struct CompleteMultipartReq {
+    map<int, rgw_sync_aws_multipart_part_info> parts;
+
+    explicit CompleteMultipartReq(const map<int, rgw_sync_aws_multipart_part_info>& _parts) : parts(_parts) {}
+
+    void dump_xml(Formatter *f) const {
+      for (auto p : parts) {
+        f->open_object_section("Part");
+        encode_xml("PartNumber", p.first, f);
+        encode_xml("ETag", p.second.etag, f);
+        f->close_section();
+      };
+    }
+  } req_enc;
+
+  struct CompleteMultipartResult {
+    string location;
+    string bucket;
+    string key;
+    string etag;
+
+    void decode_xml(XMLObj *obj) {
+      RGWXMLDecoder::decode_xml("Location", bucket, obj);
+      RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+      RGWXMLDecoder::decode_xml("Key", key, obj);
+      RGWXMLDecoder::decode_xml("ETag", etag, obj);
+    }
+  } result;
+
+public:
+  RGWAWSCompleteMultipartCR(RGWDataSyncCtx *_sc,
+                        RGWRESTConn *_dest_conn,
+                        rgw::sal::Object* _dest_obj,
+                        string _upload_id,
+                        const map<int, rgw_sync_aws_multipart_part_info>& _parts) : RGWCoroutine(_sc->cct),
+                                                   sc(_sc),
+                                                   dest_conn(_dest_conn),
+                                                   dest_obj(_dest_obj),
+                                                   upload_id(_upload_id),
+                                                   req_enc(_parts) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+
+      yield {
+        rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+        stringstream ss;
+        XMLFormatter formatter;
+
+        encode_xml("CompleteMultipartUpload", req_enc, &formatter);
+
+        formatter.flush(ss);
+
+        bufferlist bl;
+        bl.append(ss.str());
+
+        call(new RGWPostRawRESTResourceCR <bufferlist> (sc->cct, dest_conn, sc->env->http_manager,
+                                                 obj_to_aws_path(dest_obj), params, nullptr, bl, &out_bl));
+      }
+
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl;
+        return set_cr_error(retcode);
+      }
+      {
+        /*
+         * If one of the following fails we cannot abort upload, as we cannot
+         * extract the upload id. If one of these fail it's very likely that that's
+         * the least of our problem.
+         */
+        RGWXMLDecoder::XMLParser parser;
+        if (!parser.init()) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+          return set_cr_error(-EIO);
+        }
+
+        if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+          string str(out_bl.c_str(), out_bl.length());
+          ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
+          return set_cr_error(-EIO);
+        }
+
+        try {
+          RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true);
+        } catch (RGWXMLDecoder::err& err) {
+          string str(out_bl.c_str(), out_bl.length());
+          ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+          return set_cr_error(-EIO);
+        }
+      }
+
+      ldpp_dout(dpp, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl;
+
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+
+
+// Best-effort cleanup coroutine: aborts the multipart upload `upload_id` on
+// the remote endpoint, then removes the local RADOS status object that
+// tracked it. Failures in either step are logged and deliberately ignored.
+class RGWAWSStreamAbortMultipartUploadCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWRESTConn *dest_conn;
+  rgw::sal::Object* dest_obj;
+  const rgw_raw_obj status_obj;
+
+  string upload_id;
+
+public:
+
+  RGWAWSStreamAbortMultipartUploadCR(RGWDataSyncCtx *_sc,
+                                RGWRESTConn *_dest_conn,
+                                rgw::sal::Object* _dest_obj,
+                                const rgw_raw_obj& _status_obj,
+                                const string& _upload_id) : RGWCoroutine(_sc->cct), sc(_sc),
+                                                            dest_conn(_dest_conn),
+                                                            dest_obj(_dest_obj),
+                                                            status_obj(_status_obj),
+                                                            upload_id(_upload_id) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      /* tell the remote endpoint to drop the partial upload */
+      yield call(new RGWAWSAbortMultipartCR(sc, dest_conn, dest_obj, upload_id));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " retcode=" << retcode << dendl;
+        /* ignore error, best effort */
+      }
+      /* drop the local upload-state object regardless of the abort outcome */
+      yield call(new RGWRadosRemoveCR(sc->env->driver, status_obj));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to remove sync status obj obj=" << status_obj << " retcode=" << retcode << dendl;
+        /* ignore error, best effort */
+      }
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+
+// Syncs one large object to the cloud endpoint via S3 multipart upload.
+// Progress (upload id, part table, current part/offset) is persisted in a
+// RADOS status object so an interrupted sync resumes where it left off; a
+// status whose recorded mtime/size/etag no longer match the source triggers
+// abort-and-restart.
+class RGWAWSStreamObjToCloudMultipartCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  AWSSyncConfig& conf;
+  RGWRESTConn *source_conn;
+  std::shared_ptr<AWSSyncConfig_Profile> target;
+  rgw::sal::Object* src_obj;
+  rgw::sal::Object* dest_obj;
+
+  uint64_t obj_size;
+  string src_etag;
+  rgw_sync_aws_src_obj_properties src_properties;
+  rgw_rest_obj rest_obj;
+
+  // persisted upload state; read from / written to status_obj
+  rgw_sync_aws_multipart_upload_info status;
+
+  map<string, string> new_attrs;
+
+  rgw_sync_aws_multipart_part_info *pcur_part_info{nullptr};
+
+  int ret_err{0};
+
+  rgw_raw_obj status_obj;
+
+public:
+  RGWAWSStreamObjToCloudMultipartCR(RGWDataSyncCtx *_sc,
+                                   rgw_bucket_sync_pipe& _sync_pipe,
+                                AWSSyncConfig& _conf,
+                                RGWRESTConn *_source_conn,
+                                rgw::sal::Object* _src_obj,
+                                std::shared_ptr<AWSSyncConfig_Profile>& _target,
+                                rgw::sal::Object* _dest_obj,
+                                uint64_t _obj_size,
+                                const rgw_sync_aws_src_obj_properties& _src_properties,
+                                const rgw_rest_obj& _rest_obj) : RGWCoroutine(_sc->cct),
+                                                   sc(_sc),
+                                                   sync_env(_sc->env),
+                                                   conf(_conf),
+                                                   source_conn(_source_conn),
+                                                   target(_target),
+                                                   src_obj(_src_obj),
+                                                   dest_obj(_dest_obj),
+                                                   obj_size(_obj_size),
+                                                   src_properties(_src_properties),
+                                                   rest_obj(_rest_obj),
+                                                   status_obj(sync_env->svc->zone->get_zone_params().log_pool,
+                                                              RGWBucketPipeSyncStatusManager::obj_status_oid(_sync_pipe, sc->source_zone, src_obj)) {
+  }
+
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      /* load previously persisted upload state, if any (resume support);
+       * -ENOENT means this is a fresh upload */
+      yield call(new RGWSimpleRadosReadCR<rgw_sync_aws_multipart_upload_info>(dpp, sync_env->async_rados, sync_env->svc->sysobj,
+                                                                 status_obj, &status, false));
+
+      if (retcode < 0 && retcode != -ENOENT) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to read sync status of object " << src_obj << " retcode=" << retcode << dendl;
+        return retcode;
+      }
+
+      if (retcode >= 0) {
+        /* check here that mtime and size did not change */
+
+        /* source changed since the upload started: abort the stale upload
+         * and fall through to the fresh-upload path below */
+        if (status.src_properties.mtime != src_properties.mtime || status.obj_size != obj_size ||
+            status.src_properties.etag != src_properties.etag) {
+          yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
+          retcode = -ENOENT;
+        }
+      }
+
+      if (retcode == -ENOENT) {
+        RGWAWSStreamPutCRF::init_send_attrs(dpp, sc->cct, rest_obj, src_properties, target.get(), &new_attrs);
+
+        /* NOTE(review): status.obj_size is passed here before being assigned
+         * below; on a brand-new upload it still holds its default value --
+         * presumably the init CR ignores it, but confirm (obj_size would be
+         * the intended argument) */
+        yield call(new RGWAWSInitMultipartCR(sc, target->conn.get(), dest_obj, status.obj_size, std::move(new_attrs), &status.upload_id));
+        if (retcode < 0) {
+          return set_cr_error(retcode);
+        }
+
+        status.obj_size = obj_size;
+        status.src_properties = src_properties;
+/* at most this many parts per upload; part size is grown to fit */
+#define MULTIPART_MAX_PARTS 10000
+        uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS;
+        status.part_size = std::max(conf.s3.multipart_min_part_size, min_part_size);
+        status.num_parts = (obj_size + status.part_size - 1) / status.part_size;
+        status.cur_part = 1;
+      }
+
+      /* upload parts sequentially, checkpointing state after each one */
+      for (; (uint32_t)status.cur_part <= status.num_parts; ++status.cur_part) {
+        yield {
+          rgw_sync_aws_multipart_part_info& cur_part_info = status.parts[status.cur_part];
+          cur_part_info.part_num = status.cur_part;
+          cur_part_info.ofs = status.cur_ofs;
+          /* last part may be shorter than part_size */
+          cur_part_info.size = std::min((uint64_t)status.part_size, status.obj_size - status.cur_ofs);
+
+          pcur_part_info = &cur_part_info;
+
+          status.cur_ofs += status.part_size;
+
+          call(new RGWAWSStreamObjToCloudMultipartPartCR(sc,
+                                                             source_conn, src_obj,
+                                                             target,
+                                                             dest_obj,
+                                                             status.src_properties,
+                                                             status.upload_id,
+                                                             cur_part_info,
+                                                             &cur_part_info.etag));
+        }
+
+        if (retcode < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to sync obj=" << src_obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << status.cur_part << " (error: " << cpp_strerror(-retcode) << ")" << dendl;
+          ret_err = retcode;
+          yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
+          return set_cr_error(ret_err);
+        }
+
+        /* checkpoint: persist progress so a restart resumes from cur_part */
+        yield call(new RGWSimpleRadosWriteCR<rgw_sync_aws_multipart_upload_info>(dpp, sync_env->async_rados, sync_env->svc->sysobj, status_obj, status));
+        if (retcode < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to store multipart upload state, retcode=" << retcode << dendl;
+          /* continue with upload anyway */
+        }
+        ldpp_dout(dpp, 20) << "sync of object=" << src_obj << " via multipart upload, finished sending part #" << status.cur_part << " etag=" << pcur_part_info->etag << dendl;
+      }
+
+      yield call(new RGWAWSCompleteMultipartCR(sc, target->conn.get(), dest_obj, status.upload_id, status.parts));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload of obj=" << src_obj << " (error: " << cpp_strerror(-retcode) << ")" << dendl;
+        ret_err = retcode;
+        yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
+        return set_cr_error(ret_err);
+      }
+
+      /* remove status obj */
+      yield call(new RGWRadosRemoveCR(sync_env->driver, status_obj));
+      if (retcode < 0) {
+        /* NOTE(review): message says "failed to abort" but this path failed
+         * to remove the sync-status object */
+        ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload obj=" << src_obj << " upload_id=" << status.upload_id << " part number " << status.cur_part << " (" << cpp_strerror(-retcode) << ")" << dendl;
+        /* ignore error, best effort */
+      }
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+template <class T>
+int decode_attr(map<string, bufferlist>& attrs, const char *attr_name, T *result, T def_val)
+{
+  map<string, bufferlist>::iterator iter = attrs.find(attr_name);
+  if (iter == attrs.end()) {
+    *result = def_val;
+    return 0;
+  }
+  bufferlist& bl = iter->second;
+  if (bl.length() == 0) {
+    *result = def_val;
+    return 0;
+  }
+  auto bliter = bl.cbegin();
+  try {
+    decode(*result, bliter);
+  } catch (buffer::error& err) {
+    return -EIO;
+  }
+  return 0;
+}
+
+// maybe use Fetch Remote Obj instead?
+// Per-object sync callback: runs after RGWStatRemoteObjCBCR has fetched the
+// source object's stat info (attrs/headers/mtime/etag/size members of the
+// base class). Lazily creates the mapped target bucket, then streams the
+// object to the cloud endpoint -- plain PUT below the configured multipart
+// threshold, multipart upload above it.
+class RGWAWSHandleRemoteObjCBCR: public RGWStatRemoteObjCBCR {
+  rgw_bucket_sync_pipe sync_pipe;
+  AWSSyncInstanceEnv& instance;
+
+  uint64_t versioned_epoch{0};
+
+  RGWRESTConn *source_conn{nullptr};
+  std::shared_ptr<AWSSyncConfig_Profile> target;
+  bufferlist res;
+  // NOTE(review): per-coroutine-instance cache, and this CR handles a single
+  // object -- so the bucket PUT below is re-issued for every synced object
+  unordered_map <string, bool> bucket_created;
+  string target_bucket_name;
+  string target_obj_name;
+  rgw_rest_obj rest_obj;
+  int ret{0};
+
+  uint32_t src_zone_short_id{0};
+  uint64_t src_pg_ver{0};
+
+  bufferlist out_bl;
+
+  // minimal S3 error-response shape, used to tolerate
+  // "BucketAlreadyOwnedByYou" on bucket creation
+  struct CreateBucketResult {
+    string code;
+
+    void decode_xml(XMLObj *obj) {
+      RGWXMLDecoder::decode_xml("Code", code, obj);
+    }
+  } result;
+
+  rgw_bucket target_bucket;
+  std::unique_ptr<rgw::sal::RadosBucket> bucket;
+  std::unique_ptr<rgw::sal::RadosObject> src_obj;
+  std::unique_ptr<rgw::sal::RadosBucket> dest_bucket;
+  std::unique_ptr<rgw::sal::RadosObject> dest_obj;
+
+
+public:
+  RGWAWSHandleRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                            rgw_bucket_sync_pipe& _sync_pipe,
+                            rgw_obj_key& _key,
+                            AWSSyncInstanceEnv& _instance,
+                            uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+                                                         sync_pipe(_sync_pipe),
+                                                         instance(_instance), versioned_epoch(_versioned_epoch)
+  {}
+
+  ~RGWAWSHandleRemoteObjCBCR(){
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      /* best-effort: pg_ver/zone_short_id identify the exact source version;
+       * missing attrs are tolerated and logged */
+      ret = decode_attr(attrs, RGW_ATTR_PG_VER, &src_pg_ver, (uint64_t)0);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl;
+      } else {
+        ret = decode_attr(attrs, RGW_ATTR_SOURCE_ZONE, &src_zone_short_id, (uint32_t)0);
+        if (ret < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to decode source zone short_id attr, ignoring" << dendl;
+          src_pg_ver = 0; /* all or nothing */
+        }
+      }
+      ldpp_dout(dpp, 4) << "AWS: download begin: z=" << sc->source_zone
+                              << " b=" << src_bucket << " k=" << key << " size=" << size
+                              << " mtime=" << mtime << " etag=" << etag
+                              << " zone_short_id=" << src_zone_short_id << " pg_ver=" << src_pg_ver
+                              << dendl;
+
+      source_conn = sync_env->svc->zone->get_zone_conn(sc->source_zone);
+      if (!source_conn) {
+        ldpp_dout(dpp, 0) << "ERROR: cannot find http connection to zone " << sc->source_zone << dendl;
+        return set_cr_error(-EINVAL);
+      }
+
+      /* map source bucket/key to the target profile's bucket and object name */
+      instance.get_profile(sync_pipe.info.source_bs.bucket, &target);
+      instance.conf.get_target(target, sync_pipe.dest_bucket_info, key, &target_bucket_name, &target_obj_name);
+
+      if (bucket_created.find(target_bucket_name) == bucket_created.end()){
+        yield {
+          ldpp_dout(dpp, 0) << "AWS: creating bucket " << target_bucket_name << dendl;
+          bufferlist bl;
+          call(new RGWPutRawRESTResourceCR <bufferlist> (sc->cct, target->conn.get(),
+                                                  sync_env->http_manager,
+                                                  target_bucket_name, nullptr, bl, &out_bl));
+        }
+        if (retcode < 0 ) {
+          /* creation failed: parse the S3 error response; only
+           * "BucketAlreadyOwnedByYou" is acceptable */
+          RGWXMLDecoder::XMLParser parser;
+          if (!parser.init()) {
+            /* NOTE(review): message says "multipart init response" but this
+             * parses the bucket-create response -- copy/paste */
+            ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+            return set_cr_error(retcode);
+          }
+
+          if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+            string str(out_bl.c_str(), out_bl.length());
+            ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
+            return set_cr_error(retcode);
+          }
+
+          try {
+            RGWXMLDecoder::decode_xml("Error", result, &parser, true);
+          } catch (RGWXMLDecoder::err& err) {
+            string str(out_bl.c_str(), out_bl.length());
+            ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+            return set_cr_error(retcode);
+          }
+
+          if (result.code != "BucketAlreadyOwnedByYou") {
+            return set_cr_error(retcode);
+          }
+        }
+
+        bucket_created[target_bucket_name] = true;
+      }
+
+      yield {
+        bucket.reset(new rgw::sal::RadosBucket(sync_env->driver, src_bucket));
+        src_obj.reset(new rgw::sal::RadosObject(sync_env->driver, key, bucket.get()));
+
+        /* init output */
+        target_bucket.name = target_bucket_name; /* this is only possible because we only use bucket name for
+                                                    uri resolution */
+        dest_bucket.reset(new rgw::sal::RadosBucket(sync_env->driver, target_bucket));
+        dest_obj.reset(new rgw::sal::RadosObject(sync_env->driver, rgw_obj_key(target_obj_name), dest_bucket.get()));
+
+        rgw_sync_aws_src_obj_properties src_properties;
+        src_properties.mtime = mtime;
+        src_properties.etag = etag;
+        src_properties.zone_short_id = src_zone_short_id;
+        src_properties.pg_ver = src_pg_ver;
+        src_properties.versioned_epoch = versioned_epoch;
+
+        /* choose plain PUT vs multipart based on the configured threshold */
+        if (size < instance.conf.s3.multipart_sync_threshold) {
+          call(new RGWAWSStreamObjToCloudPlainCR(sc, source_conn, src_obj.get(),
+                                                 src_properties,
+                                                 target,
+                                                 dest_obj.get()));
+        } else {
+          rgw_rest_obj rest_obj;
+          rest_obj.init(key);
+          if (do_decode_rest_obj(dpp, sc->cct, attrs, headers, &rest_obj)) {
+            ldpp_dout(dpp, 0) << "ERROR: failed to decode rest obj out of headers=" << headers << ", attrs=" << attrs << dendl;
+            return set_cr_error(-EINVAL);
+          }
+          call(new RGWAWSStreamObjToCloudMultipartCR(sc, sync_pipe, instance.conf, source_conn, src_obj.get(),
+                                                     target, dest_obj.get(), size, src_properties, rest_obj));
+        }
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+
+class RGWAWSHandleRemoteObjCR : public RGWCallStatRemoteObjCR {
+  rgw_bucket_sync_pipe sync_pipe;
+  AWSSyncInstanceEnv& instance;
+  uint64_t versioned_epoch;
+public:
+  RGWAWSHandleRemoteObjCR(RGWDataSyncCtx *_sc,
+                              rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
+                              AWSSyncInstanceEnv& _instance, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+                                                          sync_pipe(_sync_pipe),
+                                                          instance(_instance), versioned_epoch(_versioned_epoch) {
+  }
+
+  ~RGWAWSHandleRemoteObjCR() {}
+
+  RGWStatRemoteObjCBCR *allocate_callback() override {
+    return new RGWAWSHandleRemoteObjCBCR(sc, sync_pipe, key, instance, versioned_epoch);
+  }
+};
+
+class RGWAWSRemoveRemoteObjCBCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  std::shared_ptr<AWSSyncConfig_Profile> target;
+  rgw_bucket_sync_pipe sync_pipe;
+  rgw_obj_key key;
+  ceph::real_time mtime;
+  AWSSyncInstanceEnv& instance;
+  int ret{0};
+public:
+  RGWAWSRemoveRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                          rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, const ceph::real_time& _mtime,
+                          AWSSyncInstanceEnv& _instance) : RGWCoroutine(_sc->cct), sc(_sc),
+                                                        sync_pipe(_sync_pipe), key(_key),
+                                                        mtime(_mtime), instance(_instance) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      ldpp_dout(dpp, 0) << ": remove remote obj: z=" << sc->source_zone
+                              << " b=" <<sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << dendl;
+      yield {
+        instance.get_profile(sync_pipe.info.source_bs.bucket, &target);
+        string path =  instance.conf.get_path(target, sync_pipe.dest_bucket_info, key);
+        ldpp_dout(dpp, 0) << "AWS: removing aws object at" << path << dendl;
+
+        call(new RGWDeleteRESTResourceCR(sc->cct, target->conn.get(),
+                                         sc->env->http_manager,
+                                         path, nullptr /* params */));
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+
+};
+
+
+class RGWAWSDataSyncModule: public RGWDataSyncModule {
+  CephContext *cct;
+  AWSSyncInstanceEnv instance;
+public:
+  RGWAWSDataSyncModule(CephContext *_cct, AWSSyncConfig& _conf) :
+                  cct(_cct),
+                  instance(_conf) {
+  }
+
+  void init(RGWDataSyncCtx *sc, uint64_t instance_id) override {
+    instance.init(sc, instance_id);
+  }
+
+  ~RGWAWSDataSyncModule() {}
+
+  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+                            std::optional<uint64_t> versioned_epoch,
+                            rgw_zone_set *zones_trace) override {
+    ldout(sc->cct, 0) << instance.id << ": sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+    return new RGWAWSHandleRemoteObjCR(sc, sync_pipe, key, instance, versioned_epoch.value_or(0));
+  }
+  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch,
+                              rgw_zone_set *zones_trace) override {
+    ldout(sc->cct, 0) <<"rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    return new RGWAWSRemoveRemoteObjCBCR(sc, sync_pipe, key, mtime, instance);
+  }
+  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch,
+                                     rgw_zone_set *zones_trace) override {
+    ldout(sc->cct, 0) <<"AWS Not implemented: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+                            << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    return NULL;
+  }
+};
+
+// Module instance: owns the AWS data handler built from one parsed
+// AWSSyncConfig and exposes it to the sync machinery.
+class RGWAWSSyncModuleInstance : public RGWSyncModuleInstance {
+  RGWAWSDataSyncModule data_handler;
+
+public:
+  RGWAWSSyncModuleInstance(CephContext *cct, AWSSyncConfig& _conf)
+    : data_handler(cct, _conf) {}
+
+  RGWDataSyncModule *get_data_handler() override {
+    return &data_handler;
+  }
+};
+
+// Parse the tier configuration into an AWSSyncConfig and wrap it in a new
+// RGWAWSSyncModuleInstance. Returns 0 on success or the (negative) error
+// from AWSSyncConfig::init().
+int RGWAWSSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config,  RGWSyncModuleInstanceRef *instance){
+  AWSSyncConfig conf;
+
+  if (int r = conf.init(dpp, cct, config); r < 0) {
+    return r;
+  }
+
+  instance->reset(new RGWAWSSyncModuleInstance(cct, conf));
+  return 0;
+}
diff --git a/src/rgw/driver/rados/rgw_sync_module_aws.h b/src/rgw/driver/rados/rgw_sync_module_aws.h
new file mode 100644 (file)
index 0000000..48f0145
--- /dev/null
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef RGW_SYNC_MODULE_AWS_H
+#define RGW_SYNC_MODULE_AWS_H
+
+#include "rgw_sync_module.h"
+
+// State of one uploaded part of a multipart cloud sync: its part number, the
+// byte range it covers in the source object, and the etag returned by the
+// destination (needed later by CompleteMultipartUpload). Persisted inside
+// rgw_sync_aws_multipart_upload_info so an interrupted sync can resume.
+// NOTE: the encode/decode field order below is the on-disk wire format --
+// do not reorder; append new fields and bump the version instead.
+struct rgw_sync_aws_multipart_part_info {
+  int part_num{0};
+  uint64_t ofs{0};
+  uint64_t size{0};
+  std::string etag;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(part_num, bl);
+    encode(ofs, bl);
+    encode(size, bl);
+    encode(etag, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(part_num, bl);
+    decode(ofs, bl);
+    decode(size, bl);
+    decode(etag, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_part_info)
+
+// Snapshot of the source object's identity (mtime/etag/zone/pg version/
+// versioned epoch) taken when a multipart sync starts; compared against the
+// current source on resume to detect that the object changed mid-upload.
+// NOTE: the encode/decode field order below is the on-disk wire format --
+// do not reorder; append new fields and bump the version instead.
+struct rgw_sync_aws_src_obj_properties {
+  ceph::real_time mtime;
+  std::string etag;
+  uint32_t zone_short_id{0};
+  uint64_t pg_ver{0};
+  uint64_t versioned_epoch{0};
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(mtime, bl);
+    encode(etag, bl);
+    encode(zone_short_id, bl);
+    encode(pg_ver, bl);
+    encode(versioned_epoch, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(mtime, bl);
+    decode(etag, bl);
+    decode(zone_short_id, bl);
+    decode(pg_ver, bl);
+    decode(versioned_epoch, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct rgw_sync_aws_multipart_upload_info {
+  std::string upload_id;
+  uint64_t obj_size;
+  rgw_sync_aws_src_obj_properties src_properties;
+  uint32_t part_size{0};
+  uint32_t num_parts{0};
+
+  int cur_part{0};
+  uint64_t cur_ofs{0};
+
+  std::map<int, rgw_sync_aws_multipart_part_info> parts;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(upload_id, bl);
+    encode(obj_size, bl);
+    encode(src_properties, bl);
+    encode(part_size, bl);
+    encode(num_parts, bl);
+    encode(cur_part, bl);
+    encode(cur_ofs, bl);
+    encode(parts, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(upload_id, bl);
+    decode(obj_size, bl);
+    decode(src_properties, bl);
+    decode(part_size, bl);
+    decode(num_parts, bl);
+    decode(cur_part, bl);
+    decode(cur_ofs, bl);
+    decode(parts, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_upload_info)
+
+// Sync-module entry point for the AWS/cloud target. supports_data_export()
+// returns false: a zone running this module cannot itself serve as a data
+// source for replication.
+class RGWAWSSyncModule : public RGWSyncModule {
+ public:
+  RGWAWSSyncModule() {}
+  bool supports_data_export() override { return false;}
+  // Parses 'config' into an AWSSyncConfig and stores a new module instance
+  // in *instance; returns a negative error on bad configuration.
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
+
+#endif /* RGW_SYNC_MODULE_AWS_H */
diff --git a/src/rgw/driver/rados/rgw_sync_module_es.cc b/src/rgw/driver/rados/rgw_sync_module_es.cc
new file mode 100644 (file)
index 0000000..3c294bb
--- /dev/null
@@ -0,0 +1,962 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_b64.h"
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_sync_module_es.h"
+#include "rgw_sync_module_es_rest.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rest.h"
+#include "rgw_op.h"
+#include "rgw_es_query.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+
+#include "include/str_list.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/*
+ * allowlist utility. Config string is a list of entries, where an entry is either an item,
+ * a prefix, or a suffix. An item would be the name of the entity that we'd look up,
+ * a prefix would be a string ending with an asterisk, a suffix would be a string starting
+ * with an asterisk. For example:
+ *
+ *      bucket1, bucket2, foo*, *bar
+ */
+class ItemList {
+  bool approve_all{false};
+
+  set<string> entries;
+  set<string> prefixes;
+  set<string> suffixes;
+
+  void parse(const string& str) {
+    list<string> l;
+
+    get_str_list(str, ",", l);
+
+    for (auto& entry : l) {
+      entry = rgw_trim_whitespace(entry);
+      if (entry.empty()) {
+        continue;
+      }
+
+      if (entry == "*") {
+        approve_all = true;
+        return;
+      }
+
+      if (entry[0] == '*') {
+        suffixes.insert(entry.substr(1));
+        continue;
+      }
+
+      if (entry.back() == '*') {
+        prefixes.insert(entry.substr(0, entry.size() - 1));
+        continue;
+      }
+
+      entries.insert(entry);
+    }
+  }
+
+public:
+  ItemList() {}
+  void init(const string& str, bool def_val) {
+    if (str.empty()) {
+      approve_all = def_val;
+    } else {
+      parse(str);
+    }
+  }
+
+  bool exists(const string& entry) {
+    if (approve_all) {
+      return true;
+    }
+
+    if (entries.find(entry) != entries.end()) {
+      return true;
+    }
+
+    auto i = prefixes.upper_bound(entry);
+    if (i != prefixes.begin()) {
+      --i;
+      if (boost::algorithm::starts_with(entry, *i)) {
+        return true;
+      }
+    }
+
+    for (i = suffixes.begin(); i != suffixes.end(); ++i) {
+      if (boost::algorithm::ends_with(entry, *i)) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
+
+#define ES_NUM_SHARDS_MIN 5
+
+#define ES_NUM_SHARDS_DEFAULT 16
+#define ES_NUM_REPLICAS_DEFAULT 1
+
+using ESVersion = std::pair<int,int>;
+static constexpr ESVersion ES_V5{5,0};
+static constexpr ESVersion ES_V7{7,0};
+
+// Cluster information returned by the Elasticsearch root ("/") endpoint;
+// 'version' drives which request/mapping format the module emits.
+struct ESInfo {
+  std::string name;
+  std::string cluster_name;
+  std::string cluster_uuid;
+  ESVersion version;
+
+  void decode_json(JSONObj *obj);
+
+  // "<major>.<minor>" rendering, e.g. "7.0"
+  std::string get_version_str(){
+    return std::to_string(version.first) + "." + std::to_string(version.second);
+  }
+};
+
+// simple wrapper structure to wrap the es version nested type
+struct es_version_decoder {
+  ESVersion version;
+
+  int parse_version(const std::string& s) {
+    int major, minor;
+    int ret = sscanf(s.c_str(), "%d.%d", &major, &minor);
+    if (ret < 0) {
+      return ret;
+    }
+    version = std::make_pair(major,minor);
+    return 0;
+  }
+
+  void decode_json(JSONObj *obj) {
+    std::string s;
+    JSONDecoder::decode_json("number",s,obj);
+    if (parse_version(s) < 0)
+      throw JSONDecoder::err("Failed to parse ElasticVersion");
+  }
+};
+
+
+// Populate ESInfo from the cluster-info JSON served by the Elasticsearch
+// root endpoint; the nested "version" object is unwrapped via
+// es_version_decoder (which throws on an unparsable version string).
+void ESInfo::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("name", name, obj);
+  JSONDecoder::decode_json("cluster_name", cluster_name, obj);
+  JSONDecoder::decode_json("cluster_uuid", cluster_uuid, obj);
+
+  es_version_decoder version_wrapper;
+  JSONDecoder::decode_json("version", version_wrapper, obj);
+  version = std::move(version_wrapper.version);
+}
+
+struct ElasticConfig {
+  uint64_t sync_instance{0};
+  string id;
+  string index_path;
+  std::unique_ptr<RGWRESTConn> conn;
+  bool explicit_custom_meta{true};
+  string override_index_path;
+  ItemList index_buckets;
+  ItemList allow_owners;
+  uint32_t num_shards{0};
+  uint32_t num_replicas{0};
+  std::map <string,string> default_headers = {{ "Content-Type", "application/json" }};
+  ESInfo es_info;
+
+  void init(CephContext *cct, const JSONFormattable& config) {
+    string elastic_endpoint = config["endpoint"];
+    id = string("elastic:") + elastic_endpoint;
+    conn.reset(new RGWRESTConn(cct, (rgw::sal::Driver*)nullptr, id, { elastic_endpoint }, nullopt /* region */ ));
+    explicit_custom_meta = config["explicit_custom_meta"](true);
+    index_buckets.init(config["index_buckets_list"], true); /* approve all buckets by default */
+    allow_owners.init(config["approved_owners_list"], true); /* approve all bucket owners by default */
+    override_index_path = config["override_index_path"];
+    num_shards = config["num_shards"](ES_NUM_SHARDS_DEFAULT);
+    if (num_shards < ES_NUM_SHARDS_MIN) {
+      num_shards = ES_NUM_SHARDS_MIN;
+    }
+    num_replicas = config["num_replicas"](ES_NUM_REPLICAS_DEFAULT);
+    if (string user = config["username"], pw = config["password"];
+        !user.empty() && !pw.empty()) {
+      auto auth_string = user + ":" + pw;
+      default_headers.emplace("AUTHORIZATION", "Basic " + rgw::to_base64(auth_string));
+    }
+
+  }
+
+  void init_instance(const RGWRealm& realm, uint64_t instance_id) {
+    sync_instance = instance_id;
+
+    if (!override_index_path.empty()) {
+      index_path = override_index_path;
+      return;
+    }
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "-%08x", (uint32_t)(sync_instance & 0xFFFFFFFF));
+
+    index_path = "/rgw-" + realm.get_name() + buf;
+  }
+
+  string get_index_path() {
+    return index_path;
+  }
+
+  map<string, string>& get_request_headers() {
+    return default_headers;
+  }
+
+  string get_obj_path(const RGWBucketInfo& bucket_info, const rgw_obj_key& key) {
+    if (es_info.version >= ES_V7) {
+      return index_path+ "/_doc/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? "null" : key.instance));
+;
+    } else {
+      return index_path +  "/object/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? "null" : key.instance));
+    }
+  }
+
+  bool should_handle_operation(RGWBucketInfo& bucket_info) {
+    return index_buckets.exists(bucket_info.bucket.name) &&
+           allow_owners.exists(bucket_info.owner.to_str());
+  }
+};
+
+using ElasticConfigRef = std::shared_ptr<ElasticConfig>;
+
+static const char *es_type_to_str(const ESType& t) {
+  switch (t) {
+  case ESType::String: return "string";
+  case ESType::Text: return "text";
+  case ESType::Keyword: return "keyword";
+  case ESType::Long: return "long";
+  case ESType::Integer: return "integer";
+  case ESType::Short: return "short";
+  case ESType::Byte: return "byte";
+  case ESType::Double: return "double";
+  case ESType::Float: return "float";
+  case ESType::Half_Float: return "half_float";
+  case ESType::Scaled_Float: return "scaled_float";
+  case ESType::Date: return "date";
+  case ESType::Boolean: return "boolean";
+  case ESType::Integer_Range: return "integer_range";
+  case ESType::Float_Range: return "float_range";
+  case ESType::Double_Range: return "date_range";
+  case ESType::Date_Range: return "date_range";
+  case ESType::Geo_Point: return "geo_point";
+  case ESType::Ip: return "ip";
+  default:
+    return "<unknown>";
+  }
+}
+
+// Field-type descriptor in pre-5.x Elasticsearch mapping syntax, where
+// string analysis is expressed as "index": "analyzed"/"not_analyzed".
+struct es_type_v2 {
+  ESType estype;
+  const char *format{nullptr};
+  std::optional<bool> analyzed;
+
+  es_type_v2(ESType et) : estype(et) {}
+
+  void dump(Formatter *f) const {
+    const char *type_str = es_type_to_str(estype);
+    encode_json("type", type_str, f);
+    if (format) {
+      encode_json("format", format, f);
+    }
+
+    auto is_analyzed = analyzed;
+
+    // strings default to not_analyzed: engage the optional with 'false' so
+    // the "index" attribute is emitted explicitly below
+    if (estype == ESType::String &&
+        !is_analyzed) {
+      is_analyzed = false;
+    }
+
+    // tests optional *engagement*, not the bool value: "index" is emitted
+    // whenever a choice exists, whether set by the caller or defaulted above
+    if (is_analyzed) {
+      encode_json("index", (is_analyzed.value() ? "analyzed" : "not_analyzed"), f);
+    }
+  }
+};
+
+/* Mapping-property serializer for elasticsearch >= 5, where "string" was
+ * split into "text" (analyzed) and "keyword" (exact-match). */
+struct es_type_v5 {
+  ESType estype;
+  const char *format{nullptr};   // optional "format" attribute (e.g. for dates)
+  std::optional<bool> analyzed;  // only consulted for legacy String fields
+  std::optional<bool> index;     // deliberately never set here; see comment below
+
+  es_type_v5(ESType et) : estype(et) {}
+
+  void dump(Formatter *f) const {
+    ESType new_estype;
+    // Translate the legacy String type into text/keyword for ES >= 5.
+    if (estype != ESType::String) {
+      new_estype = estype;
+    } else {
+      bool is_analyzed = analyzed.value_or(false);
+      new_estype = (is_analyzed ? ESType::Text : ESType::Keyword);
+      /* index = true; ... Not setting index=true, because that's the default,
+       * and dumping a boolean value *might* be a problem when backporting this
+       * because value might get quoted
+       */
+    }
+
+    const char *type_str = es_type_to_str(new_estype);
+    encode_json("type", type_str, f);
+    if (format) {
+      encode_json("format", format, f);
+    }
+    if (index) {
+      encode_json("index", index.value(), f);
+    }
+  }
+};
+
+/* Fluent wrapper over es_type_v2/es_type_v5 so mapping attributes can be
+ * chained at the call site, e.g. est(ESType::Date).set_format(...). */
+template <class T>
+struct es_type : public T {
+  es_type(T t) : T(t) {}
+  // Set the optional "format" attribute; returns *this for chaining.
+  es_type& set_format(const char *f) {
+    T::format = f;
+    return *this;
+  }
+
+  // Set the analyzed/not-analyzed hint; returns *this for chaining.
+  es_type& set_analyzed(bool a) {
+    T::analyzed = a;
+    return *this;
+  }
+};
+
+template <class T>
+struct es_index_mappings {
+  ESVersion es_version;
+  ESType string_type {ESType::String};
+
+  es_index_mappings(ESVersion esv):es_version(esv) {
+  }
+
+  es_type<T> est(ESType t) const {
+    return es_type<T>(t);
+  }
+
+  void dump_custom(const char *section, ESType type, const char *format, Formatter *f) const {
+    f->open_object_section(section);
+    ::encode_json("type", "nested", f);
+    f->open_object_section("properties");
+    encode_json("name", est(string_type), f);
+    encode_json("value", est(type).set_format(format), f);
+    f->close_section(); // entry
+    f->close_section(); // custom-string
+  }
+
+  void dump(Formatter *f) const {
+    if (es_version <= ES_V7)
+      f->open_object_section("object");
+    f->open_object_section("properties");
+    encode_json("bucket", est(string_type), f);
+    encode_json("name", est(string_type), f);
+    encode_json("instance", est(string_type), f);
+    encode_json("versioned_epoch", est(ESType::Long), f);
+    f->open_object_section("meta");
+    f->open_object_section("properties");
+    encode_json("cache_control", est(string_type), f);
+    encode_json("content_disposition", est(string_type), f);
+    encode_json("content_encoding", est(string_type), f);
+    encode_json("content_language", est(string_type), f);
+    encode_json("content_type", est(string_type), f);
+    encode_json("storage_class", est(string_type), f);
+    encode_json("etag", est(string_type), f);
+    encode_json("expires", est(string_type), f);
+    encode_json("mtime", est(ESType::Date)
+                         .set_format("strict_date_optional_time||epoch_millis"), f);
+    encode_json("size", est(ESType::Long), f);
+    dump_custom("custom-string", string_type, nullptr, f);
+    dump_custom("custom-int", ESType::Long, nullptr, f);
+    dump_custom("custom-date", ESType::Date, "strict_date_optional_time||epoch_millis", f);
+    f->close_section(); // properties
+    f->close_section(); // meta
+    f->close_section(); // properties
+
+    if (es_version <= ES_V7)
+    f->close_section(); // object
+  }
+};
+
+/* "settings" section of the index-creation request: replica and shard
+ * counts, taken from the sync-module configuration. */
+struct es_index_settings {
+  uint32_t num_replicas;
+  uint32_t num_shards;
+
+  es_index_settings(uint32_t _replicas, uint32_t _shards) : num_replicas(_replicas), num_shards(_shards) {}
+
+  void dump(Formatter *f) const {
+    encode_json("number_of_replicas", num_replicas, f);
+    encode_json("number_of_shards", num_shards, f);
+  }
+};
+
+/* Type-erased base so a caller can hold either versioned es_index_config<T>
+ * behind one pointer and serialize it via the virtual dump(). */
+struct es_index_config_base {
+  virtual ~es_index_config_base() {}
+  virtual void dump(Formatter *f) const = 0;
+};
+
+/* Complete index-creation request body {"settings":..., "mappings":...};
+ * T selects the ES-version-specific property serializer. */
+template <class T>
+struct es_index_config : public es_index_config_base {
+  es_index_settings settings;
+  es_index_mappings<T> mappings;
+
+  es_index_config(es_index_settings& _s, ESVersion esv) : settings(_s), mappings(esv) {
+  }
+
+  // 'override' added: implements es_index_config_base::dump().
+  void dump(Formatter *f) const override {
+    encode_json("settings", settings, f);
+    encode_json("mappings", mappings, f);
+  }
+};
+
+static bool is_sys_attr(const std::string& attr_name){
+  static constexpr std::initializer_list<const char*> rgw_sys_attrs =
+                                                         {RGW_ATTR_PG_VER,
+                                                          RGW_ATTR_SOURCE_ZONE,
+                                                          RGW_ATTR_ID_TAG,
+                                                          RGW_ATTR_TEMPURL_KEY1,
+                                                          RGW_ATTR_TEMPURL_KEY2,
+                                                          RGW_ATTR_UNIX1,
+                                                          RGW_ATTR_UNIX_KEY1
+  };
+
+  return std::find(rgw_sys_attrs.begin(), rgw_sys_attrs.end(), attr_name) != rgw_sys_attrs.end();
+}
+
+// Effective length of an xattr value: attrs are commonly stored with a
+// trailing NUL terminator, which must not be counted as payload.
+static size_t attr_len(const bufferlist& val)
+{
+  const size_t raw_len = val.length();
+  if (raw_len == 0 || val[raw_len - 1] != '\0') {
+    return raw_len;
+  }
+  return raw_len - 1;
+}
+
+struct es_obj_metadata {
+  const DoutPrefixProvider *dpp;
+  CephContext *cct;
+  ElasticConfigRef es_conf;
+  RGWBucketInfo bucket_info;
+  rgw_obj_key key;
+  ceph::real_time mtime;
+  uint64_t size;
+  map<string, bufferlist> attrs;
+  uint64_t versioned_epoch;
+
+  es_obj_metadata(CephContext *_cct, ElasticConfigRef _es_conf, const RGWBucketInfo& _bucket_info,
+                  const rgw_obj_key& _key, ceph::real_time& _mtime, uint64_t _size,
+                  map<string, bufferlist>& _attrs, uint64_t _versioned_epoch) : cct(_cct), es_conf(_es_conf), bucket_info(_bucket_info), key(_key),
+                                                     mtime(_mtime), size(_size), attrs(std::move(_attrs)), versioned_epoch(_versioned_epoch) {}
+
+  void dump(Formatter *f) const {
+    map<string, string> out_attrs;
+    map<string, string> custom_meta;
+    RGWAccessControlPolicy policy;
+    set<string> permissions;
+    RGWObjTags obj_tags;
+
+    for (auto i : attrs) {
+      const string& attr_name = i.first;
+      bufferlist& val = i.second;
+
+      if (!boost::algorithm::starts_with(attr_name, RGW_ATTR_PREFIX)) {
+        continue;
+      }
+
+      if (boost::algorithm::starts_with(attr_name, RGW_ATTR_META_PREFIX)) {
+        custom_meta.emplace(attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1),
+                            string(val.c_str(), attr_len(val)));
+        continue;
+      }
+
+      if (boost::algorithm::starts_with(attr_name, RGW_ATTR_CRYPT_PREFIX)) {
+        continue;
+      }
+
+      if (boost::algorithm::starts_with(attr_name, RGW_ATTR_OLH_PREFIX)) {
+        // skip versioned object olh info
+        continue;
+      }
+
+      if (attr_name == RGW_ATTR_ACL) {
+        try {
+          auto i = val.cbegin();
+          decode(policy, i);
+        } catch (buffer::error& err) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to decode acl for " << bucket_info.bucket << "/" << key << dendl;
+          continue;
+        }
+
+        const RGWAccessControlList& acl = policy.get_acl();
+
+        permissions.insert(policy.get_owner().get_id().to_str());
+        for (auto acliter : acl.get_grant_map()) {
+          const ACLGrant& grant = acliter.second;
+          if (grant.get_type().get_type() == ACL_TYPE_CANON_USER &&
+              ((uint32_t)grant.get_permission().get_permissions() & RGW_PERM_READ) != 0) {
+            rgw_user user;
+            if (grant.get_id(user)) {
+              permissions.insert(user.to_str());
+            }
+          }
+        }
+      } else if (attr_name == RGW_ATTR_TAGS) {
+        try {
+          auto tags_bl = val.cbegin();
+          decode(obj_tags, tags_bl);
+        } catch (buffer::error& err) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to decode obj tags for "
+                       << bucket_info.bucket << "/" << key << dendl;
+          continue;
+        }
+      } else if (attr_name == RGW_ATTR_COMPRESSION) {
+        RGWCompressionInfo cs_info;
+        try {
+          auto vals_bl = val.cbegin();
+          decode(cs_info, vals_bl);
+        } catch (buffer::error& err) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to decode compression attr for "
+                       << bucket_info.bucket << "/" << key << dendl;
+          continue;
+        }
+        out_attrs.emplace("compression",std::move(cs_info.compression_type));
+      } else {
+        if (!is_sys_attr(attr_name)) {
+          out_attrs.emplace(attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1),
+                            std::string(val.c_str(), attr_len(val)));
+        }
+      }
+    }
+    ::encode_json("bucket", bucket_info.bucket.name, f);
+    ::encode_json("name", key.name, f);
+    string instance = key.instance;
+    if (instance.empty())
+      instance = "null";
+    ::encode_json("instance", instance, f);
+    ::encode_json("versioned_epoch", versioned_epoch, f);
+    ::encode_json("owner", policy.get_owner(), f);
+    ::encode_json("permissions", permissions, f);
+    f->open_object_section("meta");
+    ::encode_json("size", size, f);
+
+    string mtime_str;
+    rgw_to_iso8601(mtime, &mtime_str);
+    ::encode_json("mtime", mtime_str, f);
+    for (auto i : out_attrs) {
+      ::encode_json(i.first.c_str(), i.second, f);
+    }
+    map<string, string> custom_str;
+    map<string, string> custom_int;
+    map<string, string> custom_date;
+
+    for (auto i : custom_meta) {
+      auto config = bucket_info.mdsearch_config.find(i.first);
+      if (config == bucket_info.mdsearch_config.end()) {
+        if (!es_conf->explicit_custom_meta) {
+          /* default custom meta is of type string */
+          custom_str[i.first] = i.second;
+        } else {
+          ldpp_dout(dpp, 20) << "custom meta entry key=" << i.first << " not found in bucket mdsearch config: " << bucket_info.mdsearch_config << dendl;
+        }
+        continue;
+      }
+      switch (config->second) {
+        case ESEntityTypeMap::ES_ENTITY_DATE:
+          custom_date[i.first] = i.second;
+          break;
+        case ESEntityTypeMap::ES_ENTITY_INT:
+          custom_int[i.first] = i.second;
+          break;
+        default:
+          custom_str[i.first] = i.second;
+      }
+    }
+
+    if (!custom_str.empty()) {
+      f->open_array_section("custom-string");
+      for (auto i : custom_str) {
+        f->open_object_section("entity");
+        ::encode_json("name", i.first.c_str(), f);
+        ::encode_json("value", i.second, f);
+        f->close_section();
+      }
+      f->close_section();
+    }
+    if (!custom_int.empty()) {
+      f->open_array_section("custom-int");
+      for (auto i : custom_int) {
+        f->open_object_section("entity");
+        ::encode_json("name", i.first.c_str(), f);
+        ::encode_json("value", i.second, f);
+        f->close_section();
+      }
+      f->close_section();
+    }
+    if (!custom_date.empty()) {
+      f->open_array_section("custom-date");
+      for (auto i : custom_date) {
+        /*
+         * try to exlicitly parse date field, otherwise elasticsearch could reject the whole doc,
+         * which will end up with failed sync
+         */
+        real_time t;
+        int r = parse_time(i.second.c_str(), &t);
+        if (r < 0) {
+          ldpp_dout(dpp, 20) << __func__ << "(): failed to parse time (" << i.second << "), skipping encoding of custom date attribute" << dendl;
+          continue;
+        }
+
+        string time_str;
+        rgw_to_iso8601(t, &time_str);
+
+        f->open_object_section("entity");
+        ::encode_json("name", i.first.c_str(), f);
+        ::encode_json("value", time_str.c_str(), f);
+        f->close_section();
+      }
+      f->close_section();
+    }
+    f->close_section(); // meta
+    const auto& m = obj_tags.get_tags();
+    if (m.size() > 0){
+      f->open_array_section("tagging");
+      for (const auto &it : m) {
+        f->open_object_section("tag");
+        ::encode_json("key", it.first, f);
+        ::encode_json("value",it.second, f);
+        f->close_section();
+      }
+      f->close_section(); // tagging
+    }
+  }
+};
+
+// Coroutine: GET "/" on the elasticsearch endpoint to discover server info
+// (notably the version) and cache the parsed result in conf->es_info.
+class RGWElasticGetESInfoCBCR : public RGWCoroutine {
+public:
+  RGWElasticGetESInfoCBCR(RGWDataSyncCtx *_sc, 
+                          ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
+                                                    sc(_sc), sync_env(_sc->env),
+                                                    conf(_conf) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      ldpp_dout(dpp, 5) << conf->id << ": get elasticsearch info for zone: " << sc->source_zone << dendl;
+      // async REST GET; the response is decoded straight into conf->es_info
+      yield call(new RGWReadRESTResourceCR<ESInfo> (sync_env->cct,
+                                                    conf->conn.get(),
+                                                    sync_env->http_manager,
+                                                    "/", nullptr /*params*/,
+                                                    &(conf->default_headers),
+                                                    &(conf->es_info)));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 5) << conf->id << ": get elasticsearch failed: " << retcode << dendl;
+        return set_cr_error(retcode);
+      }
+
+      ldpp_dout(dpp, 5) << conf->id << ": got elastic version=" << conf->es_info.get_version_str() << dendl;
+      return set_cr_done();
+    }
+    return 0;
+  }
+private:
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  ElasticConfigRef conf;
+};
+
+// Coroutine: PUT the index-creation request (settings + version-appropriate
+// mappings) to elasticsearch.  An "already exists" error from the server is
+// tolerated and treated as success (external initialization assumed).
+class RGWElasticPutIndexCBCR : public RGWCoroutine {
+public:
+  RGWElasticPutIndexCBCR(RGWDataSyncCtx *_sc,
+                         ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
+                                                   sc(_sc), sync_env(_sc->env),
+                                                   conf(_conf) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      ldpp_dout(dpp, 5) << conf->id << ": put elasticsearch index for zone: " << sc->source_zone << dendl;
+
+      yield {
+        string path = conf->get_index_path();
+        es_index_settings settings(conf->num_replicas, conf->num_shards);
+        std::unique_ptr<es_index_config_base> index_conf;
+
+        // pick the mapping dialect matching the server version discovered by
+        // RGWElasticGetESInfoCBCR
+        if (conf->es_info.version >= ES_V5) {
+          ldpp_dout(dpp, 0) << "elasticsearch: index mapping: version >= 5" << dendl;
+          index_conf.reset(new es_index_config<es_type_v5>(settings, conf->es_info.version));
+        } else {
+          ldpp_dout(dpp, 0) << "elasticsearch: index mapping: version < 5" << dendl;
+          index_conf.reset(new es_index_config<es_type_v2>(settings, conf->es_info.version));
+        }
+        call(new RGWPutRESTResourceCR<es_index_config_base, int, _err_response> (sc->cct,
+                                                             conf->conn.get(),
+                                                             sync_env->http_manager,
+                                                             path, nullptr /*params*/,
+                                                             &(conf->default_headers),
+                                                             *index_conf, nullptr, &err_response));
+      }
+      if (retcode < 0) {
+
+        // index-already-exists is benign; any other error is fatal
+        if (err_response.error.type != "index_already_exists_exception" &&
+                 err_response.error.type != "resource_already_exists_exception") {
+          ldpp_dout(dpp, 0) << "elasticsearch: failed to initialize index: response.type=" << err_response.error.type << " response.reason=" << err_response.error.reason << dendl;
+          return set_cr_error(retcode);
+        }
+
+        ldpp_dout(dpp, 0) << "elasticsearch: index already exists, assuming external initialization" << dendl;
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+
+private:
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  ElasticConfigRef conf;
+
+  // Decoded shape of an elasticsearch error response body.
+    struct _err_response {
+    struct err_reason {
+      vector<err_reason> root_cause;
+      string type;
+      string reason;
+      string index;
+
+      void decode_json(JSONObj *obj) {
+        JSONDecoder::decode_json("root_cause", root_cause, obj);
+        JSONDecoder::decode_json("type", type, obj);
+        JSONDecoder::decode_json("reason", reason, obj);
+        JSONDecoder::decode_json("index", index, obj);
+      }
+    } error;
+
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("error", error, obj);
+    }
+  } err_response;
+};
+
+// Coroutine: full backend initialization -- fetch server info, then create
+// (or verify) the index.  Runs the two steps sequentially, failing fast.
+class RGWElasticInitConfigCBCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  ElasticConfigRef conf;
+
+public:
+  RGWElasticInitConfigCBCR(RGWDataSyncCtx *_sc,
+                          ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
+                                                    sc(_sc), sync_env(_sc->env),
+                                                    conf(_conf) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+
+      // step 1: discover server version (needed to pick mapping dialect)
+      yield call(new RGWElasticGetESInfoCBCR(sc, conf));
+
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+
+      // step 2: create the index with version-appropriate mappings
+      yield call(new RGWElasticPutIndexCBCR(sc, conf));
+      if (retcode < 0) {
+          return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+
+};
+
+// Coroutine callback: after the remote object has been stat'ed (base class
+// populates size/mtime/attrs), build the es_obj_metadata document and PUT it
+// to the object's elasticsearch path.
+class RGWElasticHandleRemoteObjCBCR : public RGWStatRemoteObjCBCR {
+  rgw_bucket_sync_pipe sync_pipe;
+  ElasticConfigRef conf;
+  uint64_t versioned_epoch;
+public:
+  RGWElasticHandleRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                          rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
+                          ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+                                                                               sync_pipe(_sync_pipe), conf(_conf),
+                                                                               versioned_epoch(_versioned_epoch) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      ldpp_dout(dpp, 10) << ": stat of remote obj: z=" << sc->source_zone
+                               << " b=" << sync_pipe.info.source_bs.bucket << " k=" << key
+                               << " size=" << size << " mtime=" << mtime << dendl;
+
+      yield {
+        string path = conf->get_obj_path(sync_pipe.dest_bucket_info, key);
+        // note: attrs is moved into doc by the es_obj_metadata constructor
+        es_obj_metadata doc(sync_env->cct, conf, sync_pipe.dest_bucket_info, key, mtime, size, attrs, versioned_epoch);
+
+        call(new RGWPutRESTResourceCR<es_obj_metadata, int>(sync_env->cct, conf->conn.get(),
+                                                            sync_env->http_manager,
+                                                            path, nullptr /* params */,
+                                                            &(conf->default_headers),
+                                                            doc, nullptr /* result */));
+
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Coroutine wrapper: stats the remote object and hands the result to
+// RGWElasticHandleRemoteObjCBCR via allocate_callback().
+class RGWElasticHandleRemoteObjCR : public RGWCallStatRemoteObjCR {
+  rgw_bucket_sync_pipe sync_pipe;
+  ElasticConfigRef conf;
+  uint64_t versioned_epoch;
+public:
+  RGWElasticHandleRemoteObjCR(RGWDataSyncCtx *_sc,
+                        rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
+                        ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+                                                           sync_pipe(_sync_pipe),
+                                                           conf(_conf), versioned_epoch(_versioned_epoch) {
+  }
+
+  ~RGWElasticHandleRemoteObjCR() override {}
+
+  // Invoked by the base class once the remote stat completes.
+  RGWStatRemoteObjCBCR *allocate_callback() override {
+    return new RGWElasticHandleRemoteObjCBCR(sc, sync_pipe, key, conf, versioned_epoch);
+  }
+};
+
+// Coroutine: DELETE the elasticsearch document corresponding to a removed
+// RGW object.
+class RGWElasticRemoveRemoteObjCBCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw_bucket_sync_pipe sync_pipe;
+  rgw_obj_key key;
+  ceph::real_time mtime;
+  ElasticConfigRef conf;
+public:
+  RGWElasticRemoveRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                          rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, const ceph::real_time& _mtime,
+                          ElasticConfigRef _conf) : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+                                                        sync_pipe(_sync_pipe), key(_key),
+                                                        mtime(_mtime), conf(_conf) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      ldpp_dout(dpp, 10) << ": remove remote obj: z=" << sc->source_zone
+                               << " b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << dendl;
+      yield {
+        // same document path used when the object was indexed
+        string path = conf->get_obj_path(sync_pipe.dest_bucket_info, key);
+
+        call(new RGWDeleteRESTResourceCR(sync_env->cct, conf->conn.get(),
+                                         sync_env->http_manager,
+                                         path, nullptr /* params */));
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+
+};
+
+class RGWElasticDataSyncModule : public RGWDataSyncModule {
+  ElasticConfigRef conf;
+public:
+  RGWElasticDataSyncModule(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) : conf(std::make_shared<ElasticConfig>()) {
+    conf->init(cct, config);
+  }
+  ~RGWElasticDataSyncModule() override {}
+
+  void init(RGWDataSyncCtx *sc, uint64_t instance_id) override {
+    conf->init_instance(sc->env->svc->zone->get_realm(), instance_id);
+  }
+
+  RGWCoroutine *init_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) override {
+    ldpp_dout(dpp, 5) << conf->id << ": init" << dendl;
+    return new RGWElasticInitConfigCBCR(sc, conf);
+  }
+
+  RGWCoroutine *start_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) override {
+    ldpp_dout(dpp, 5) << conf->id << ": start_sync" << dendl;
+    // try to get elastic search version
+    return new RGWElasticGetESInfoCBCR(sc, conf);
+  }
+
+  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace) override {
+    ldpp_dout(dpp, 10) << conf->id << ": sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+    if (!conf->should_handle_operation(sync_pipe.dest_bucket_info)) {
+      ldpp_dout(dpp, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl;
+      return nullptr;
+    }
+    return new RGWElasticHandleRemoteObjCR(sc, sync_pipe, key, conf, versioned_epoch.value_or(0));
+  }
+  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+    /* versioned and versioned epoch params are useless in the elasticsearch backend case */
+    ldpp_dout(dpp, 10) << conf->id << ": rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    if (!conf->should_handle_operation(sync_pipe.dest_bucket_info)) {
+      ldpp_dout(dpp, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl;
+      return nullptr;
+    }
+    return new RGWElasticRemoveRemoteObjCBCR(sc, sync_pipe, key, mtime, conf);
+  }
+  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+    ldpp_dout(dpp, 10) << conf->id << ": create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+                            << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    ldpp_dout(dpp, 10) << conf->id << ": skipping operation (not handled)" << dendl;
+    return NULL;
+  }
+  RGWRESTConn *get_rest_conn() {
+    return conf->conn.get();
+  }
+
+  string get_index_path() {
+    return conf->get_index_path();
+  }
+
+  map<string, string>& get_request_headers() {
+    return conf->get_request_headers();
+  }
+};
+
+// Construct the per-instance data handler; make_unique replaces the bare
+// new (exception-safe, idiomatic since C++14).
+RGWElasticSyncModuleInstance::RGWElasticSyncModuleInstance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config)
+{
+  data_handler = std::make_unique<RGWElasticDataSyncModule>(dpp, cct, config);
+}
+
+// Borrowed pointer to the data handler; ownership stays with this instance.
+RGWDataSyncModule *RGWElasticSyncModuleInstance::get_data_handler()
+{
+  return data_handler.get();
+}
+
+// Borrowed REST connection to the elasticsearch endpoint (owned by the
+// handler's configuration).
+RGWRESTConn *RGWElasticSyncModuleInstance::get_rest_conn()
+{
+  return data_handler->get_rest_conn();
+}
+
+// Forward to the data handler's configured index path.
+string RGWElasticSyncModuleInstance::get_index_path() {
+  return data_handler->get_index_path();
+}
+
+// Forward to the data handler's default REST request headers.
+map<string, string>& RGWElasticSyncModuleInstance::get_request_headers() {
+  return data_handler->get_request_headers();
+}
+
+// Replace the S3 REST manager with the metadata-search variant; any other
+// dialect is passed through untouched.  NOTE: on the S3 path this function
+// takes ownership of (and deletes) the original manager.
+RGWRESTMgr *RGWElasticSyncModuleInstance::get_rest_filter(int dialect, RGWRESTMgr *orig) {
+  if (dialect != RGW_REST_S3) {
+    return orig;
+  }
+  delete orig;
+  return new RGWRESTMgr_MDSearch_S3();
+}
+
+// Factory entry point: hand the full config to the new instance, which
+// parses it (including "endpoint") itself.  The previously extracted
+// 'endpoint' local was unused and has been removed.
+int RGWElasticSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
+  instance->reset(new RGWElasticSyncModuleInstance(dpp, cct, config));
+  return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync_module_es.h b/src/rgw/driver/rados/rgw_sync_module_es.h
new file mode 100644 (file)
index 0000000..6c0c422
--- /dev/null
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_SYNC_MODULE_ES_H
+#define CEPH_RGW_SYNC_MODULE_ES_H
+
+#include "rgw_sync_module.h"
+
+// Elasticsearch field datatypes referenced when composing index mappings;
+// translated to wire names by es_type_to_str() in the .cc file.
+enum class ESType {
+  /* string datatypes */
+  String, /* Deprecated Since 5.X+ */
+  Text,
+  Keyword,
+
+  /* Numeric Types */
+  Long, Integer, Short, Byte, Double, Float, Half_Float, Scaled_Float,
+
+  /* Date Type */
+  Date,
+
+  /* Boolean */
+  Boolean,
+
+  /* Binary; Must Be Base64 Encoded */
+  Binary,
+
+  /* Range Types */
+  Integer_Range, Float_Range, Long_Range, Double_Range, Date_Range,
+
+  /* A Few Specialized Types */
+  Geo_Point,
+  Ip
+};
+
+
+// Sync-module factory for the elasticsearch metadata-search backend.
+class RGWElasticSyncModule : public RGWSyncModule {
+public:
+  RGWElasticSyncModule() {}
+  // Object data cannot be exported back out of this backend.
+  bool supports_data_export() override {
+    return false;
+  }
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
+
+class RGWElasticDataSyncModule;
+class RGWRESTConn;
+
+// Per-zone instance of the elasticsearch sync module.  Owns the data handler
+// and exposes its REST connection/index path to the metadata-search REST API.
+class RGWElasticSyncModuleInstance : public RGWSyncModuleInstance {
+  std::unique_ptr<RGWElasticDataSyncModule> data_handler;
+public:
+  RGWElasticSyncModuleInstance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config);
+  RGWDataSyncModule *get_data_handler() override;
+  // Swaps the S3 REST manager for the metadata-search variant.
+  RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) override;
+  RGWRESTConn *get_rest_conn();
+  std::string get_index_path();
+  std::map<std::string, std::string>& get_request_headers();
+  bool supports_user_writes() override {
+    return true;
+  }
+};
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_sync_module_es_rest.cc b/src/rgw/driver/rados/rgw_sync_module_es_rest.cc
new file mode 100644 (file)
index 0000000..db9d48a
--- /dev/null
@@ -0,0 +1,428 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync_module_es.h"
+#include "rgw_sync_module_es_rest.h"
+#include "rgw_es_query.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_sal_rados.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// One object document as stored in the ES index; decoded from the
+// "_source" element of a search hit (see es_search_response::obj_hit).
+struct es_index_obj_response {
+  string bucket;
+  rgw_obj_key key;
+  uint64_t versioned_epoch{0};
+  ACLOwner owner;
+  set<string> read_permissions;
+
+  struct {
+    uint64_t size{0};
+    ceph::real_time mtime;
+    string etag;
+    string content_type;
+    string storage_class;
+    map<string, string> custom_str;
+    map<string, int64_t> custom_int;
+    map<string, string> custom_date;
+
+    // custom metadata is indexed as a list of {name, value} pairs;
+    // this decodes one such pair
+    template <class T>
+    struct _custom_entry {
+      string name;
+      T value;
+      void decode_json(JSONObj *obj) {
+        JSONDecoder::decode_json("name", name, obj);
+        JSONDecoder::decode_json("value", value, obj);
+      }
+    };
+
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("size", size, obj);
+      string mtime_str;
+      JSONDecoder::decode_json("mtime", mtime_str, obj);
+      // NOTE(review): parse_time() result is ignored here; a malformed
+      // mtime string leaves 'mtime' unset
+      parse_time(mtime_str.c_str(), &mtime);
+      JSONDecoder::decode_json("etag", etag, obj);
+      JSONDecoder::decode_json("content_type", content_type, obj);
+      JSONDecoder::decode_json("storage_class", storage_class, obj);
+      // flatten the {name, value} entry lists into lookup maps
+      list<_custom_entry<string> > str_entries;
+      JSONDecoder::decode_json("custom-string", str_entries, obj);
+      for (auto& e : str_entries) {
+        custom_str[e.name] = e.value;
+      }
+      list<_custom_entry<int64_t> > int_entries;
+      JSONDecoder::decode_json("custom-int", int_entries, obj);
+      for (auto& e : int_entries) {
+        custom_int[e.name] = e.value;
+      }
+      list<_custom_entry<string> > date_entries;
+      JSONDecoder::decode_json("custom-date", date_entries, obj);
+      for (auto& e : date_entries) {
+        custom_date[e.name] = e.value;
+      }
+    }
+  } meta;
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("bucket", bucket, obj);
+    JSONDecoder::decode_json("name", key.name, obj);
+    JSONDecoder::decode_json("instance", key.instance, obj);
+    JSONDecoder::decode_json("versioned_epoch", versioned_epoch, obj);
+    JSONDecoder::decode_json("permissions", read_permissions, obj);
+    JSONDecoder::decode_json("owner", owner, obj);
+    JSONDecoder::decode_json("meta", meta, obj);
+  }
+};
+
+// Decoded Elasticsearch "_search" response; the nested structs mirror
+// the JSON layout (took/timed_out/_shards/hits.hits[]._source).
+struct es_search_response {
+  uint32_t took;
+  bool timed_out;
+  struct {
+    uint32_t total;
+    uint32_t successful;
+    uint32_t failed;
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("total", total, obj);
+      JSONDecoder::decode_json("successful", successful, obj);
+      JSONDecoder::decode_json("failed", failed, obj);
+    }
+  } shards;
+  // a single search hit; the object document itself is in "_source"
+  struct obj_hit {
+    string index;
+    string type;
+    string id;
+    // double score
+    es_index_obj_response source;
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("_index", index, obj);
+      JSONDecoder::decode_json("_type", type, obj);
+      JSONDecoder::decode_json("_id", id, obj);
+      JSONDecoder::decode_json("_source", source, obj);
+    }
+  };
+  struct {
+    uint32_t total;
+    // double max_score;
+    list<obj_hit> hits;
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("total", total, obj);
+      // JSONDecoder::decode_json("max_score", max_score, obj);
+      JSONDecoder::decode_json("hits", hits, obj);
+    }
+  } hits;
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("took", took, obj);
+    JSONDecoder::decode_json("timed_out", timed_out, obj);
+    JSONDecoder::decode_json("_shards", shards, obj);
+    JSONDecoder::decode_json("hits", hits, obj);
+  }
+};
+
+// Base op for the metadata-search API ("GET ?query=...").  Compiles the
+// user expression into an ES query, sends it to the module's REST
+// endpoint and decodes the result; the protocol-specific subclass
+// supplies get_params() and send_response().
+class RGWMetadataSearchOp : public RGWOp {
+  RGWSyncModuleInstanceRef sync_module_ref;
+  RGWElasticSyncModuleInstance *es_module;
+protected:
+  string expression;       // user-supplied search expression
+  string custom_prefix;    // dialect prefix for custom metadata fields
+#define MAX_KEYS_DEFAULT 100
+  uint64_t max_keys{MAX_KEYS_DEFAULT};
+  string marker_str;       // numeric result offset, as received
+  uint64_t marker{0};
+  string next_marker;
+  bool is_truncated{false};
+  string err;              // compile-error text, returned to the client
+
+  es_search_response response;
+
+public:
+  RGWMetadataSearchOp(const RGWSyncModuleInstanceRef& sync_module) : sync_module_ref(sync_module) {
+    // NOTE(review): assumes the zone's sync module is the ES module;
+    // confirm callers only construct this op for ES-backed zones
+    es_module = static_cast<RGWElasticSyncModuleInstance *>(sync_module_ref.get());
+  }
+
+  // no up-front permission check; non-system users are restricted in
+  // execute() by an implicit "permissions" query condition
+  int verify_permission(optional_yield) override {
+    return 0;
+  }
+  virtual int get_params() = 0;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "metadata_search"; }
+  virtual RGWOpType get_type() override { return RGW_OP_METADATA_SEARCH; }
+  virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+void RGWMetadataSearchOp::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Build the ES query from the user expression plus implicit conditions,
+// send it to "<index path>/_search", and decode the JSON reply into
+// 'response'.  Sets op_ret to -EINVAL on a bad expression or an
+// unparseable reply.
+void RGWMetadataSearchOp::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0)
+    return;
+
+  // implicit conditions: non-system users only see objects they can
+  // read, and bucket-scoped requests are limited to that bucket
+  list<pair<string, string> > conds;
+
+  if (!s->user->get_info().system) {
+    conds.push_back(make_pair("permissions", s->user->get_id().to_str()));
+  }
+
+  if (!s->bucket_name.empty()) {
+    conds.push_back(make_pair("bucket", s->bucket_name));
+  }
+
+  ESQueryCompiler es_query(expression, &conds, custom_prefix);
+  
+  // map the user-facing field names onto the indexed document fields
+  static map<string, string, ltstr_nocase> aliases = {
+                                  { "bucket", "bucket" }, /* forces lowercase */
+                                  { "name", "name" },
+                                  { "key", "name" },
+                                  { "instance", "instance" },
+                                  { "etag", "meta.etag" },
+                                  { "size", "meta.size" },
+                                  { "mtime", "meta.mtime" },
+                                  { "lastmodified", "meta.mtime" },
+                                  { "last_modified", "meta.mtime" },
+                                  { "contenttype", "meta.content_type" },
+                                  { "content_type", "meta.content_type" },
+                                  { "storageclass", "meta.storage_class" },
+                                  { "storage_class", "meta.storage_class" },
+  };
+  es_query.set_field_aliases(&aliases);
+
+  // entity types of the built-in (non-custom) document fields
+  static map<string, ESEntityTypeMap::EntityType> generic_map = { {"bucket", ESEntityTypeMap::ES_ENTITY_STR},
+                                                           {"name", ESEntityTypeMap::ES_ENTITY_STR},
+                                                           {"instance", ESEntityTypeMap::ES_ENTITY_STR},
+                                                           {"permissions", ESEntityTypeMap::ES_ENTITY_STR},
+                                                           {"meta.etag", ESEntityTypeMap::ES_ENTITY_STR},
+                                                           {"meta.content_type", ESEntityTypeMap::ES_ENTITY_STR},
+                                                           {"meta.mtime", ESEntityTypeMap::ES_ENTITY_DATE},
+                                                           {"meta.size", ESEntityTypeMap::ES_ENTITY_INT},
+                                                           {"meta.storage_class", ESEntityTypeMap::ES_ENTITY_STR} };
+  ESEntityTypeMap gm(generic_map);
+  es_query.set_generic_type_map(&gm);
+
+  // fields clients must not query directly
+  static set<string> restricted_fields = { {"permissions"} };
+  es_query.set_restricted_fields(&restricted_fields);
+
+  // custom metadata fields and their types come from the bucket's
+  // mdsearch configuration
+  map<string, ESEntityTypeMap::EntityType> custom_map;
+  for (auto& i : s->bucket->get_info().mdsearch_config) {
+    custom_map[i.first] = (ESEntityTypeMap::EntityType)i.second;
+  }
+
+  ESEntityTypeMap em(custom_map);
+  es_query.set_custom_type_map(&em);
+
+  bool valid = es_query.compile(&err);
+  if (!valid) {
+    ldpp_dout(this, 10) << "invalid query, failed generating request json" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // serialize the compiled query to JSON for the request payload
+  JSONFormatter f;
+  encode_json("root", es_query, &f);
+
+  RGWRESTConn *conn = es_module->get_rest_conn();
+
+  bufferlist in;
+  bufferlist out;
+
+  stringstream ss;
+
+  f.flush(ss);
+  in.append(ss.str());
+
+  // page via the ES "size"/"from" parameters; marker_str was validated
+  // as numeric in get_params()
+  string resource = es_module->get_index_path() + "/_search";
+  param_vec_t params;
+  static constexpr int BUFSIZE = 32;
+  char buf[BUFSIZE];
+  snprintf(buf, sizeof(buf), "%lld", (long long)max_keys);
+  params.push_back(param_pair_t("size", buf));
+  if (marker > 0) {
+    params.push_back(param_pair_t("from", marker_str.c_str()));
+  }
+  ldpp_dout(this, 20) << "sending request to elasticsearch, payload=" << string(in.c_str(), in.length()) << dendl;
+  auto& extra_headers = es_module->get_request_headers();
+  op_ret = conn->get_resource(s, resource, &params, &extra_headers,
+                              out, &in, nullptr, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to fetch resource (r=" << resource << ", ret=" << op_ret << ")" << dendl;
+    return;
+  }
+
+  ldpp_dout(this, 20) << "response: " << string(out.c_str(), out.length()) << dendl;
+
+  JSONParser jparser;
+  if (!jparser.parse(out.c_str(), out.length())) {
+    ldpp_dout(this, 0) << "ERROR: failed to parse elasticsearch response" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  try {
+    decode_json_obj(response, &jparser);
+  } catch (const JSONDecoder::err& e) {
+    ldpp_dout(this, 0) << "ERROR: failed to decode JSON input: " << e.what() << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+}
+
+// S3-dialect flavor of the metadata search op: parses the S3 query
+// string and renders the result listing as XML (or JSON, per request
+// format).
+class RGWMetadataSearch_ObjStore_S3 : public RGWMetadataSearchOp {
+public:
+  explicit RGWMetadataSearch_ObjStore_S3(const RGWSyncModuleInstanceRef& _sync_module) : RGWMetadataSearchOp(_sync_module) {
+    // S3 custom metadata attributes carry the x-amz-meta- prefix
+    custom_prefix = "x-amz-meta-";
+  }
+
+  // Read "query", "max-keys" and "marker" request args.  max_keys is
+  // capped at MAX_KEYS_MAX; marker must be a numeric result offset.
+  // Returns 0 or -EINVAL on a non-numeric argument.
+  int get_params() override {
+    expression = s->info.args.get("query");
+    bool exists;
+    string max_keys_str = s->info.args.get("max-keys", &exists);
+#define MAX_KEYS_MAX 10000
+    if (exists) {
+      string err;
+      max_keys = strict_strtoll(max_keys_str.c_str(), 10, &err);
+      if (!err.empty()) {
+        return -EINVAL;
+      }
+      if (max_keys > MAX_KEYS_MAX) {
+        max_keys = MAX_KEYS_MAX;
+      }
+    }
+    marker_str = s->info.args.get("marker", &exists);
+    if (exists) {
+      string err;
+      marker = strict_strtoll(marker_str.c_str(), 10, &err);
+      if (!err.empty()) {
+        return -EINVAL;
+      }
+    }
+    // precompute the follow-up marker: offset of the first result of
+    // the next page
+    uint64_t nm = marker + max_keys;
+    static constexpr int BUFSIZE = 32;
+    char buf[BUFSIZE];
+    snprintf(buf, sizeof(buf), "%lld", (long long)nm);
+    next_marker = buf;
+    return 0;
+  }
+  void send_response() override {
+    if (op_ret) {
+      // surface the query-compiler error text to the client
+      s->err.message = err;
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    // a full page implies there may be more results
+    is_truncated = (response.hits.hits.size() >= max_keys);
+
+    s->formatter->open_object_section("SearchMetadataResponse");
+    s->formatter->dump_string("Marker", marker_str);
+    s->formatter->dump_string("IsTruncated", (is_truncated ? "true" : "false"));
+    if (is_truncated) {
+      s->formatter->dump_string("NextMarker", next_marker);
+    }
+    if (s->format == RGWFormat::JSON) {
+      s->formatter->open_array_section("Objects");
+    }
+    for (auto& i : response.hits.hits) {
+      s->formatter->open_object_section("Contents");
+      es_index_obj_response& e = i.source;
+      s->formatter->dump_string("Bucket", e.bucket);
+      s->formatter->dump_string("Key", e.key.name);
+      string instance = (!e.key.instance.empty() ? e.key.instance : "null");
+      s->formatter->dump_string("Instance", instance.c_str());
+      s->formatter->dump_int("VersionedEpoch", e.versioned_epoch);
+      dump_time(s, "LastModified", e.meta.mtime);
+      s->formatter->dump_int("Size", e.meta.size);
+      s->formatter->dump_format("ETag", "\"%s\"", e.meta.etag.c_str());
+      s->formatter->dump_string("ContentType", e.meta.content_type.c_str());
+      s->formatter->dump_string("StorageClass", e.meta.storage_class.c_str());
+      dump_owner(s, e.owner.get_id(), e.owner.get_display_name());
+      // emit the three custom-metadata maps as uniform {Name, Value}
+      // entries
+      s->formatter->open_array_section("CustomMetadata");
+      for (auto& m : e.meta.custom_str) {
+        s->formatter->open_object_section("Entry");
+        s->formatter->dump_string("Name", m.first.c_str());
+        s->formatter->dump_string("Value", m.second);
+        s->formatter->close_section();
+      }
+      for (auto& m : e.meta.custom_int) {
+        s->formatter->open_object_section("Entry");
+        s->formatter->dump_string("Name", m.first.c_str());
+        s->formatter->dump_int("Value", m.second);
+        s->formatter->close_section();
+      }
+      for (auto& m : e.meta.custom_date) {
+        s->formatter->open_object_section("Entry");
+        s->formatter->dump_string("Name", m.first.c_str());
+        s->formatter->dump_string("Value", m.second);
+        s->formatter->close_section();
+      }
+      s->formatter->close_section();
+      // flush per entry to bound formatter memory on large result sets
+      rgw_flush_formatter(s, s->formatter);
+      s->formatter->close_section();
+    };
+    if (s->format == RGWFormat::JSON) {
+      s->formatter->close_section();
+    }
+    s->formatter->close_section();
+   rgw_flush_formatter_and_reset(s, s->formatter);
+  }
+};
+
+// Routes S3 requests for the metadata-search API: GET with a "query"
+// arg performs a search; GET with "mdsearch" on a bucket returns the
+// bucket's mdsearch configuration.  HEAD and POST are not served.
+class RGWHandler_REST_MDSearch_S3 : public RGWHandler_REST_S3 {
+protected:
+  RGWOp *op_get() override {
+    if (s->info.args.exists("query")) {
+      return new RGWMetadataSearch_ObjStore_S3(driver->get_sync_module());
+    }
+    if (!s->init_state.url_bucket.empty() &&
+        s->info.args.exists("mdsearch")) {
+      return new RGWGetBucketMetaSearch_ObjStore_S3;
+    }
+    return nullptr;
+  }
+  RGWOp *op_head() override {
+    return nullptr;
+  }
+  RGWOp *op_post() override {
+    return nullptr;
+  }
+public:
+  explicit RGWHandler_REST_MDSearch_S3(const rgw::auth::StrategyRegistry& auth_registry) : RGWHandler_REST_S3(auth_registry) {}
+  virtual ~RGWHandler_REST_MDSearch_S3() {}
+};
+
+
+// Hand out a metadata-search handler for service/bucket-scoped requests;
+// object-scoped requests are declined (nullptr) so other managers can
+// serve them.
+RGWHandler_REST* RGWRESTMgr_MDSearch_S3::get_handler(rgw::sal::Driver* driver,
+                                                    req_state* const s,
+                                                     const rgw::auth::StrategyRegistry& auth_registry,
+                                                     const std::string& frontend_prefix)
+{
+  int ret =
+    RGWHandler_REST_S3::init_from_header(driver, s,
+                                       RGWFormat::XML, true);
+  if (ret < 0) {
+    return nullptr;
+  }
+
+  if (!s->object->empty()) {
+    return nullptr;
+  }
+
+  RGWHandler_REST *handler = new RGWHandler_REST_MDSearch_S3(auth_registry);
+
+  ldpp_dout(s, 20) << __func__ << " handler=" << typeid(*handler).name()
+                   << dendl;
+  return handler;
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync_module_es_rest.h b/src/rgw/driver/rados/rgw_sync_module_es_rest.h
new file mode 100644 (file)
index 0000000..b18271a
--- /dev/null
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+
+class RGWElasticSyncModuleInstance;
+
+// REST manager that dispatches S3 metadata-search requests to
+// RGWHandler_REST_MDSearch_S3 (see rgw_sync_module_es_rest.cc).
+class RGWRESTMgr_MDSearch_S3 : public RGWRESTMgr {
+public:
+  explicit RGWRESTMgr_MDSearch_S3() {}
+
+  RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+                              req_state* s,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string& frontend_prefix) override;
+};
diff --git a/src/rgw/driver/rados/rgw_sync_module_log.cc b/src/rgw/driver/rados/rgw_sync_module_log.cc
new file mode 100644 (file)
index 0000000..a21604c
--- /dev/null
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_sync_module_log.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Callback coroutine for the "log" sync module: logs the stat result of
+// a remote object (size/mtime/attrs) instead of fetching it.
+class RGWLogStatRemoteObjCBCR : public RGWStatRemoteObjCBCR {
+public:
+  RGWLogStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                          rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWStatRemoteObjCBCR(_sc, _src_bucket, _key) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    ldpp_dout(dpp, 0) << "SYNC_LOG: stat of remote obj: z=" << sc->source_zone
+                            << " b=" << src_bucket << " k=" << key << " size=" << size << " mtime=" << mtime
+                            << " attrs=" << attrs << dendl;
+    return set_cr_done();
+  }
+
+};
+
+// Stat-remote-object coroutine that plugs the logging callback above
+// into the generic stat machinery.
+class RGWLogStatRemoteObjCR : public RGWCallStatRemoteObjCR {
+public:
+  RGWLogStatRemoteObjCR(RGWDataSyncCtx *_sc,
+                        rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCallStatRemoteObjCR(_sc, _src_bucket, _key) {
+  }
+
+  ~RGWLogStatRemoteObjCR() override {}
+
+  RGWStatRemoteObjCBCR *allocate_callback() override {
+    return new RGWLogStatRemoteObjCBCR(sc, src_bucket, key);
+  }
+};
+
+// Data-sync handler of the "log" module: a debugging module that only
+// logs sync events (object sync / removal / delete marker) and does not
+// replicate any data.
+class RGWLogDataSyncModule : public RGWDataSyncModule {
+  string prefix;  // log-line prefix from the module config
+public:
+  explicit RGWLogDataSyncModule(const string& _prefix) : prefix(_prefix) {}
+
+  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace) override {
+    ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+    return new RGWLogStatRemoteObjCR(sc, sync_pipe.info.source_bs.bucket, key);
+  }
+  // removal events are logged only; returning NULL means no work is
+  // scheduled
+  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+    ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    return NULL;
+  }
+  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+    ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+                            << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    return NULL;
+  }
+};
+
+// Instance wrapper that owns the data handler for one configured zone.
+class RGWLogSyncModuleInstance : public RGWSyncModuleInstance {
+  RGWLogDataSyncModule data_handler;
+public:
+  explicit RGWLogSyncModuleInstance(const string& prefix) : data_handler(prefix) {}
+  RGWDataSyncModule *get_data_handler() override {
+    return &data_handler;
+  }
+};
+
+// Factory entry point: "prefix" is the only recognized config key.
+int RGWLogSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
+  string prefix = config["prefix"];
+  instance->reset(new RGWLogSyncModuleInstance(prefix));
+  return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync_module_log.h b/src/rgw/driver/rados/rgw_sync_module_log.h
new file mode 100644 (file)
index 0000000..ecf3bb7
--- /dev/null
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_SYNC_MODULE_LOG_H
+#define CEPH_RGW_SYNC_MODULE_LOG_H
+
+#include "rgw_sync_module.h"
+
+// Factory for the "log" sync module, a debugging module that logs sync
+// events instead of replicating data (see rgw_sync_module_log.cc).
+class RGWLogSyncModule : public RGWSyncModule {
+public:
+  RGWLogSyncModule() {}
+  // log-only zones hold no object data, so they cannot export data
+  bool supports_data_export() override {
+    return false;
+  }
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_sync_trace.cc b/src/rgw/driver/rados/rgw_sync_trace.cc
new file mode 100644 (file)
index 0000000..b346835
--- /dev/null
@@ -0,0 +1,290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_SYNC_TRACE_H
+#define CEPH_RGW_SYNC_TRACE_H
+
+#include <regex>
+
+#include "common/debug.h"
+#include "common/ceph_json.h"
+
+#include "rgw_sync_trace.h"
+#include "rgw_rados.h"
+#include "rgw_worker.h"
+
+#define dout_context g_ceph_context
+
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+using namespace std;
+
+
+// Construct a trace node under 'parent'.  The per-node history ring is
+// sized by rgw_sync_trace_per_node_log_size; the display prefix is the
+// parent's prefix followed by "type[id]:".
+RGWSyncTraceNode::RGWSyncTraceNode(CephContext *_cct, uint64_t _handle,
+                                   const RGWSyncTraceNodeRef& _parent,
+                                   const string& _type, const string& _id) : cct(_cct),
+                                                                             parent(_parent),
+                                                                             type(_type),
+                                                                             id(_id),
+                                                                             handle(_handle),
+                                                                             history(cct->_conf->rgw_sync_trace_per_node_log_size)
+{
+  if (parent.get()) {
+    prefix = parent->get_prefix();
+  }
+
+  if (!type.empty()) {
+    prefix += type;
+    if (!id.empty()) {
+      prefix += "[" + id + "]";
+    }
+    prefix += ":";
+  }
+}
+
+// Record a status line: remember it as the current status, append it to
+// the bounded history, and emit it to the debug log.
+void RGWSyncTraceNode::log(int level, const string& s)
+{
+  status = s;
+  history.push_back(status);
+  /* dump output on either rgw_sync, or rgw -- but only once */
+  if (cct->_conf->subsys.should_gather(ceph_subsys_rgw_sync, level)) {
+    lsubdout(cct, rgw_sync,
+      ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl;
+  } else {
+    lsubdout(cct, rgw,
+      ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl;
+  }
+}
+
+
+// Background thread that periodically publishes the names of the active
+// sync-trace entries into the cluster service map ("current_sync").
+class RGWSyncTraceServiceMapThread : public RGWRadosThread {
+  RGWRados *store;
+  RGWSyncTraceManager *manager;
+
+  // update period, converted from seconds of config to milliseconds
+  uint64_t interval_msec() override {
+    return cct->_conf->rgw_sync_trace_servicemap_update_interval * 1000;
+  }
+public:
+  RGWSyncTraceServiceMapThread(RGWRados *_store, RGWSyncTraceManager *_manager)
+    : RGWRadosThread(_store, "sync-trace"), store(_store), manager(_manager) {}
+
+  int process(const DoutPrefixProvider *dpp) override;
+};
+
+// One update cycle: push the active trace names; failures are logged
+// but not propagated (best effort).
+int RGWSyncTraceServiceMapThread::process(const DoutPrefixProvider *dpp)
+{
+  map<string, string> status;
+  status["current_sync"] = manager->get_active_names();
+  int ret = store->update_service_map(dpp, std::move(status));
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "ERROR: update_service_map() returned ret=" << ret << dendl;
+  }
+  return 0;
+}
+
+// Create a new trace node under 'parent' and register it in the active
+// map, keyed by a freshly allocated handle.  The returned ref uses a
+// custom deleter so that dropping the last caller reference moves the
+// node into the completed ring via finish_node() instead of deleting it.
+RGWSyncTraceNodeRef RGWSyncTraceManager::add_node(const RGWSyncTraceNodeRef& parent,
+                                                  const std::string& type,
+                                                  const std::string& id)
+{
+  shunique_lock wl(lock, ceph::acquire_unique);
+  auto handle = alloc_handle();
+  RGWSyncTraceNodeRef& ref = nodes[handle];
+  ref.reset(new RGWSyncTraceNode(cct, handle, parent, type, id));
+  // return a separate shared_ptr that calls finish() on the node instead of
+  // deleting it. the lambda capture holds a reference to the original 'ref'
+  auto deleter = [ref, this] (RGWSyncTraceNode *node) { finish_node(node); };
+  return {ref.get(), deleter};
+}
+
+// Regex-match 'search_term' against the node prefix, current status,
+// and (optionally) the history ring.  An invalid regex is logged and
+// treated as no match.
+bool RGWSyncTraceNode::match(const string& search_term, bool search_history)
+{
+  try {
+    std::regex expr(search_term);
+    std::smatch m;
+
+    if (regex_search(prefix, m, expr)) {
+      return true;
+    }
+    if (regex_search(status, m,expr)) {
+      return true;
+    }
+    if (!search_history) {
+      return false;
+    }
+
+    for (auto h : history) {
+      if (regex_search(h, m, expr)) {
+        return true;
+      }
+    }
+  } catch (const std::regex_error& e) {
+    ldout(cct, 5) << "NOTICE: sync trace: bad expression: bad regex search term" << dendl;
+  }
+
+  return false;
+}
+
+// Start the service-map publisher thread for this manager.
+void RGWSyncTraceManager::init(RGWRados *store)
+{
+  service_map_thread = new RGWSyncTraceServiceMapThread(store, this);
+  service_map_thread->start();
+}
+
+// Tear down: unregister the admin socket commands, stop and delete the
+// publisher thread, then drop all trace nodes.
+RGWSyncTraceManager::~RGWSyncTraceManager()
+{
+  cct->get_admin_socket()->unregister_commands(this);
+  service_map_thread->stop();
+  delete service_map_thread;
+
+  nodes.clear();
+}
+
+// Register the four "sync trace ..." admin socket commands, all routed
+// to RGWSyncTraceManager::call().  Returns 0, or the first registration
+// error.
+int RGWSyncTraceManager::hook_to_admin_command()
+{
+  AdminSocket *admin_socket = cct->get_admin_socket();
+
+  admin_commands = { { "sync trace show name=search,type=CephString,req=false", "sync trace show [filter_str]: show current multisite tracing information" },
+                     { "sync trace history name=search,type=CephString,req=false", "sync trace history [filter_str]: show history of multisite tracing information" },
+                     { "sync trace active name=search,type=CephString,req=false", "show active multisite sync entities information" },
+                     { "sync trace active_short name=search,type=CephString,req=false", "show active multisite sync entities entries" } };
+  for (auto cmd : admin_commands) {
+    int r = admin_socket->register_command(cmd[0], this,
+                                           cmd[1]);
+    if (r < 0) {
+      lderr(cct) << "ERROR: fail to register admin socket command (r=" << r << ")" << dendl;
+      return r;
+    }
+  }
+  return 0;
+}
+
+// Dump a single trace node as JSON: its status line, and optionally the
+// full history ring.
+static void dump_node(RGWSyncTraceNode *entry, bool show_history, Formatter *f)
+{
+  f->open_object_section("entry");
+  ::encode_json("status", entry->to_str(), f);
+  if (show_history) {
+    f->open_array_section("history");
+    for (auto h : entry->get_history()) {
+      ::encode_json("entry", h, f);
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Return a JSON array of the resource names of all nodes flagged
+// RGW_SNS_FLAG_ACTIVE; consumed by the service-map publisher thread.
+string RGWSyncTraceManager::get_active_names()
+{
+  shunique_lock rl(lock, ceph::acquire_shared);
+
+  stringstream ss;
+  JSONFormatter f;
+
+  f.open_array_section("result");
+  for (auto n : nodes) {
+    auto& entry = n.second;
+
+    if (!entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
+      continue;
+    }
+    const string& name = entry->get_resource_name();
+    if (!name.empty()) {
+      ::encode_json("entry", name, &f);
+    }
+    f.flush(ss);
+  }
+  f.close_section();
+  f.flush(ss);
+
+  return ss.str();
+}
+
+// Admin socket entry point for the "sync trace ..." commands.  Mode is
+// derived from the command name (show / history / active / active_short)
+// and an optional "search" regex filters entries via match().  Dumps the
+// active ("running") nodes followed by the completed ring, under a
+// shared lock.
+int RGWSyncTraceManager::call(std::string_view command, const cmdmap_t& cmdmap,
+                             const bufferlist&,
+                             Formatter *f,
+                             std::ostream& ss,
+                             bufferlist& out) {
+
+  bool show_history = (command == "sync trace history");
+  bool show_short = (command == "sync trace active_short");
+  bool show_active = (command == "sync trace active") || show_short;
+
+  string search;
+
+  auto si = cmdmap.find("search");
+  if (si != cmdmap.end()) {
+    search = boost::get<string>(si->second);
+  }
+
+  shunique_lock rl(lock, ceph::acquire_shared);
+
+  f->open_object_section("result");
+  f->open_array_section("running");
+  for (auto n : nodes) {
+    auto& entry = n.second;
+
+    if (!search.empty() && !entry->match(search, show_history)) {
+      continue;
+    }
+    if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
+      continue;
+    }
+    if (show_short) {
+      // short mode lists only the resource names of active entries
+      const string& name = entry->get_resource_name();
+      if (!name.empty()) {
+        ::encode_json("entry", name, f);
+      }
+    } else {
+      dump_node(entry.get(), show_history, f);
+    }
+    f->flush(out);
+  }
+  f->close_section();
+
+  f->open_array_section("complete");
+  for (auto& entry : complete_nodes) {
+    if (!search.empty() && !entry->match(search, show_history)) {
+      continue;
+    }
+    if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
+      continue;
+    }
+    dump_node(entry.get(), show_history, f);
+    f->flush(out);
+  }
+  f->close_section();
+
+  f->close_section();
+
+  return 0;
+}
+
+// Called by the add_node() deleter when the last external reference to a
+// node is dropped: move the node from the active map into the bounded
+// 'complete_nodes' ring.  Idempotent — a node already finished (absent
+// from the map) is ignored.
+void RGWSyncTraceManager::finish_node(RGWSyncTraceNode *node)
+{
+  RGWSyncTraceNodeRef old_node;
+
+  {
+    shunique_lock wl(lock, ceph::acquire_unique);
+    if (!node) {
+      return;
+    }
+    auto iter = nodes.find(node->handle);
+    if (iter == nodes.end()) {
+      /* not found, already finished */
+      return;
+    }
+
+    if (complete_nodes.full()) {
+      /* take a reference to the entry that is going to be evicted,
+       * can't let it get evicted under lock held, otherwise
+       * it's a deadlock as it will call finish_node()
+       */
+      old_node = complete_nodes.front();
+    }
+
+    complete_nodes.push_back(iter->second);
+    nodes.erase(iter);
+  }
+  // old_node (if set) is released here, outside the lock
+};  // NOTE(review): stray ';' after the function body — harmless empty declaration
+
+#endif
+
diff --git a/src/rgw/driver/rados/rgw_sync_trace.h b/src/rgw/driver/rados/rgw_sync_trace.h
new file mode 100644 (file)
index 0000000..9617dac
--- /dev/null
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_SYNC_LOG_H
+#define CEPH_RGW_SYNC_LOG_H
+
+#include <atomic>
+
+#include "common/ceph_mutex.h"
+#include "common/shunique_lock.h"
+#include "common/admin_socket.h"
+
+#include <set>
+#include <ostream>
+#include <string>
+#include <shared_mutex>
+#include <boost/circular_buffer.hpp>
+
+#define SSTR(o) ({      \
+  std::stringstream ss; \
+  ss << o;              \
+  ss.str();             \
+})
+
+#define RGW_SNS_FLAG_ACTIVE   1
+#define RGW_SNS_FLAG_ERROR    2
+
+class RGWRados;
+class RGWSyncTraceManager;
+class RGWSyncTraceNode;
+class RGWSyncTraceServiceMapThread;
+
+using RGWSyncTraceNodeRef = std::shared_ptr<RGWSyncTraceNode>;
+
+// One node in the sync-trace tree.  Nodes are created only through
+// RGWSyncTraceManager::add_node() (the constructor is private); each
+// node carries a display prefix derived from its ancestry, state flags
+// (RGW_SNS_FLAG_*), a current status line, and a bounded history of
+// past status lines.
+class RGWSyncTraceNode final {
+  friend class RGWSyncTraceManager;
+
+  CephContext *cct;
+  // keeps the parent alive for as long as any child exists
+  RGWSyncTraceNodeRef parent;
+
+  uint16_t state{0};      // bitwise OR of RGW_SNS_FLAG_* values
+  std::string status;     // most recent line passed to log()
+
+  // NOTE(review): this mutex is not taken by any method visible in this
+  // file — confirm intended locking discipline
+  ceph::mutex lock = ceph::make_mutex("RGWSyncTraceNode::lock");
+
+  std::string type;
+  std::string id;
+
+  // "parent-prefix + type[id]:", built in the constructor
+  std::string prefix;
+
+  std::string resource_name;
+
+  // key of this node in RGWSyncTraceManager::nodes
+  uint64_t handle;
+
+  boost::circular_buffer<std::string> history;
+
+  // private constructor, create with RGWSyncTraceManager::add_node()
+  RGWSyncTraceNode(CephContext *_cct, uint64_t _handle,
+                   const RGWSyncTraceNodeRef& _parent,
+                   const std::string& _type, const std::string& _id);
+
+ public:
+  void set_resource_name(const std::string& s) {
+    resource_name = s;
+  }
+
+  const std::string& get_resource_name() {
+    return resource_name;
+  }
+
+  void set_flag(uint16_t s) {
+    state |= s;
+  }
+  void unset_flag(uint16_t s) {
+    state &= ~s;
+  }
+  // true iff every bit of 'f' is set
+  bool test_flags(uint16_t f) {
+    return (state & f) == f;
+  }
+  // record a status line and emit it to the debug log (see .cc)
+  void log(int level, const std::string& s);
+
+  std::string to_str() {
+    return prefix + " " + status;
+  }
+
+  const std::string& get_prefix() {
+    return prefix;
+  }
+
+  std::ostream& operator<<(std::ostream& os) { 
+    os << to_str();
+    return os;            
+  }
+
+  boost::circular_buffer<std::string>& get_history() {
+    return history;
+  }
+
+  // regex search over prefix/status (and history when requested)
+  bool match(const std::string& search_term, bool search_history);
+};
+
+// Owner of the sync-trace tree: tracks live nodes by handle plus a
+// bounded ring of completed nodes, serves the "sync trace ..." admin
+// socket commands, and (via RGWSyncTraceServiceMapThread) publishes the
+// active entries to the cluster service map.
+class RGWSyncTraceManager : public AdminSocketHook {
+  friend class RGWSyncTraceNode;
+
+  // guards 'nodes'/'complete_nodes'; readers take it shared, mutators
+  // take it unique (see shunique_lock usage in the .cc)
+  mutable std::shared_timed_mutex lock;
+  using shunique_lock = ceph::shunique_lock<decltype(lock)>;
+
+  CephContext *cct;
+  RGWSyncTraceServiceMapThread *service_map_thread{nullptr};
+
+  std::map<uint64_t, RGWSyncTraceNodeRef> nodes;       // active, by handle
+  std::size_t complete_nodes;  /* placeholder comment removed */
+  boost::circular_buffer<RGWSyncTraceNodeRef> complete_nodes; // finished, LRU-bounded
+
+  // monotonically increasing handle generator
+  std::atomic<uint64_t> count = { 0 };
+
+  // {command spec, hook, help text} triples registered on the admin socket
+  std::list<std::array<std::string, 3> > admin_commands;
+
+  uint64_t alloc_handle() {
+    return ++count;
+  }
+  // moves a node from 'nodes' into 'complete_nodes'; called by the
+  // deleter that add_node() attaches to returned refs
+  void finish_node(RGWSyncTraceNode *node);
+
+public:
+  RGWSyncTraceManager(CephContext *_cct, int max_lru) : cct(_cct), complete_nodes(max_lru) {}
+  ~RGWSyncTraceManager();
+
+  void init(RGWRados *store);
+
+  const RGWSyncTraceNodeRef root_node;
+
+  RGWSyncTraceNodeRef add_node(const RGWSyncTraceNodeRef& parent,
+                               const std::string& type,
+                               const std::string& id = "");
+
+  int hook_to_admin_command();
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+          const bufferlist&,
+          Formatter *f,
+          std::ostream& ss,
+          bufferlist& out) override;
+  std::string get_active_names();
+};
+
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_tools.cc b/src/rgw/driver/rados/rgw_tools.cc
new file mode 100644 (file)
index 0000000..5a8aefa
--- /dev/null
@@ -0,0 +1,414 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+#include "librados/librados_asio.h"
+
+#include "include/stringify.h"
+
+#include "rgw_tools.h"
+#include "rgw_acl_s3.h"
+#include "rgw_aio_throttle.h"
+#include "rgw_compression.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#define READ_CHUNK_LEN (512 * 1024)
+
+using namespace std;
+
+int rgw_init_ioctx(const DoutPrefixProvider *dpp,
+                   librados::Rados *rados, const rgw_pool& pool,
+                   librados::IoCtx& ioctx, bool create,
+                  bool mostly_omap)
+{
+  int r = rados->ioctx_create(pool.name.c_str(), ioctx);
+  if (r == -ENOENT && create) {
+    r = rados->pool_create(pool.name.c_str());
+    if (r == -ERANGE) {
+      ldpp_dout(dpp, 0)
+        << __func__
+        << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
+        << " (this can be due to a pool or placement group misconfiguration, e.g."
+        << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
+        << dendl;
+    }
+    if (r < 0 && r != -EEXIST) {
+      return r;
+    }
+
+    r = rados->ioctx_create(pool.name.c_str(), ioctx);
+    if (r < 0) {
+      return r;
+    }
+
+    r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
+    if (r < 0 && r != -EOPNOTSUPP) {
+      return r;
+    }
+
+    if (mostly_omap) {
+      // set pg_autoscale_bias
+      bufferlist inbl;
+      float bias = g_conf().get_val<double>("rgw_rados_pool_autoscale_bias");
+      int r = rados->mon_command(
+       "{\"prefix\": \"osd pool set\", \"pool\": \"" +
+       pool.name + "\", \"var\": \"pg_autoscale_bias\", \"val\": \"" +
+       stringify(bias) + "\"}",
+       inbl, NULL, NULL);
+      if (r < 0) {
+       ldpp_dout(dpp, 10) << __func__ << " warning: failed to set pg_autoscale_bias on "
+                << pool.name << dendl;
+      }
+      // set recovery_priority
+      int p = g_conf().get_val<uint64_t>("rgw_rados_pool_recovery_priority");
+      r = rados->mon_command(
+       "{\"prefix\": \"osd pool set\", \"pool\": \"" +
+       pool.name + "\", \"var\": \"recovery_priority\": \"" +
+       stringify(p) + "\"}",
+       inbl, NULL, NULL);
+      if (r < 0) {
+       ldpp_dout(dpp, 10) << __func__ << " warning: failed to set recovery_priority on "
+                << pool.name << dendl;
+      }
+    }
+  } else if (r < 0) {
+    return r;
+  }
+  if (!pool.ns.empty()) {
+    ioctx.set_namespace(pool.ns);
+  }
+  return 0;
+}
+
+// Returns a stable, function-local sentinel address. Callers pass this to
+// rgw_put_system_obj() (which compares pointers against it) to request
+// that existing attributes be left untouched.
+map<string, bufferlist>* no_change_attrs() {
+  static map<string, bufferlist> sentinel;
+  return &sentinel;
+}
+
+// Write a system object at pool/oid. If pattrs == no_change_attrs() the
+// payload is written via write_data(), leaving stored attributes as-is;
+// otherwise the supplied attrs are written with the data (a null pattrs
+// becomes an empty set, i.e. attributes are cleared).
+int rgw_put_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+                       const rgw_pool& pool, const string& oid, bufferlist& data, bool exclusive,
+                       RGWObjVersionTracker *objv_tracker, real_time set_mtime, optional_yield y, map<string, bufferlist> *pattrs)
+{
+  map<string,bufferlist> no_attrs;
+  if (!pattrs) {
+    pattrs = &no_attrs;
+  }
+
+  rgw_raw_obj obj(pool, oid);
+
+  auto sysobj = svc_sysobj->get_obj(obj);
+  int ret;
+
+  // pointer comparison against the sentinel chooses between the two
+  // write modes; see no_change_attrs()
+  if (pattrs != no_change_attrs()) {
+    ret = sysobj.wop()
+      .set_objv_tracker(objv_tracker)
+      .set_exclusive(exclusive)
+      .set_mtime(set_mtime)
+      .set_attrs(*pattrs)
+      .write(dpp, data, y);
+  } else {
+    ret = sysobj.wop()
+      .set_objv_tracker(objv_tracker)
+      .set_exclusive(exclusive)
+      .set_mtime(set_mtime)
+      .write_data(dpp, data, y);
+  }
+
+  return ret;
+}
+
+// Stat a system object: fills *pattrs and *pmtime when non-null.
+// Returns the result of the underlying read-op stat.
+int rgw_stat_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+                        const rgw_pool& pool, const std::string& key,
+                        RGWObjVersionTracker *objv_tracker,
+                       real_time *pmtime, optional_yield y,
+                       std::map<std::string, bufferlist> *pattrs)
+{
+  rgw_raw_obj obj(pool, key);
+  auto sysobj = svc_sysobj->get_obj(obj);
+  return sysobj.rop()
+               .set_attrs(pattrs)
+               .set_last_mod(pmtime)
+               .stat(y, dpp);
+}
+
+
+// Read a system object into bl, optionally returning attrs/mtime and
+// populating cache_info; refresh_version forces re-read when the cached
+// version is older. Returns the read-op result.
+int rgw_get_system_obj(RGWSI_SysObj* svc_sysobj, const rgw_pool& pool, const string& key, bufferlist& bl,
+                       RGWObjVersionTracker *objv_tracker, real_time *pmtime, optional_yield y,
+                       const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs,
+                       rgw_cache_entry_info *cache_info,
+                      boost::optional<obj_version> refresh_version, bool raw_attrs)
+{
+  const rgw_raw_obj obj(pool, key);
+  auto sysobj = svc_sysobj->get_obj(obj);
+  auto rop = sysobj.rop();
+  return rop.set_attrs(pattrs)
+            .set_last_mod(pmtime)
+            .set_objv_tracker(objv_tracker)
+            .set_raw_attrs(raw_attrs)
+            .set_cache_info(cache_info)
+            .set_refresh_version(refresh_version)
+            .read(dpp, &bl, y);
+}
+
+int rgw_delete_system_obj(const DoutPrefixProvider *dpp, 
+                          RGWSI_SysObj *sysobj_svc, const rgw_pool& pool, const string& oid,
+                          RGWObjVersionTracker *objv_tracker, optional_yield y)
+{
+  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid});
+  rgw_raw_obj obj(pool, oid);
+  return sysobj.wop()
+               .set_objv_tracker(objv_tracker)
+               .remove(dpp, y);
+}
+
+// Execute a read operation; with a valid optional_yield the call is issued
+// asynchronously and the coroutine is suspended instead of blocking.
+// Returns 0 on success or the negated error code.
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                      librados::ObjectReadOperation *op, bufferlist* pbl,
+                      optional_yield y, int flags)
+{
+  // given a yield_context, call async_operate() to yield the coroutine instead
+  // of blocking
+  if (y) {
+    auto& context = y.get_io_context();
+    auto& yield = y.get_yield_context();
+    boost::system::error_code ec;
+    auto bl = librados::async_operate(
+      context, ioctx, oid, op, flags, yield[ec]);
+    if (pbl) {
+      *pbl = std::move(bl);
+    }
+    return -ec.value();
+  }
+  // work on asio threads should be asynchronous, so warn when they block
+  if (is_asio_thread) {
+    ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
+  }
+  return ioctx.operate(oid, op, nullptr, flags);
+}
+
+// Execute a write operation; with a valid optional_yield the call is issued
+// asynchronously via async_operate, otherwise it blocks (warning logged on
+// asio threads). Returns 0 on success or the negated error code.
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                      librados::ObjectWriteOperation *op, optional_yield y,
+                     int flags)
+{
+  if (y) {
+    auto& context = y.get_io_context();
+    auto& yield = y.get_yield_context();
+    boost::system::error_code ec;
+    librados::async_operate(context, ioctx, oid, op, flags, yield[ec]);
+    return -ec.value();
+  }
+  if (is_asio_thread) {
+    ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
+  }
+  return ioctx.operate(oid, op, flags);
+}
+
+// Send a watch/notify message on oid, collecting replies into *pbl.
+// Asynchronous when given a valid optional_yield, blocking otherwise
+// (warning logged on asio threads). Returns 0 or the negated error code.
+int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                     bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl,
+                     optional_yield y)
+{
+  if (y) {
+    auto& context = y.get_io_context();
+    auto& yield = y.get_yield_context();
+    boost::system::error_code ec;
+    auto reply = librados::async_notify(context, ioctx, oid,
+                                        bl, timeout_ms, yield[ec]);
+    if (pbl) {
+      *pbl = std::move(reply);
+    }
+    return -ec.value();
+  }
+  if (is_asio_thread) {
+    ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
+  }
+  return ioctx.notify2(oid, bl, timeout_ms, pbl);
+}
+
+void rgw_filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
+                        map<string, bufferlist> *attrset)
+{
+  attrset->clear();
+  map<string, bufferlist>::iterator iter;
+  for (iter = unfiltered_attrset.lower_bound(check_prefix);
+       iter != unfiltered_attrset.end(); ++iter) {
+    if (!boost::algorithm::starts_with(iter->first, check_prefix))
+      break;
+    (*attrset)[iter->first] = iter->second;
+  }
+}
+
+// Wraps a SAL driver for simple data access; the driver pointer is
+// borrowed, not owned.
+RGWDataAccess::RGWDataAccess(rgw::sal::Driver* _driver) : driver(_driver)
+{
+}
+
+
+// Decode the bucket's ACL policy from its RGW_ATTR_ACL attribute, if one
+// is present in attrs. Returns 0 when the attribute is absent or decodes
+// cleanly, -EIO when the stored blob is corrupt.
+int RGWDataAccess::Bucket::finish_init()
+{
+  const auto acl = attrs.find(RGW_ATTR_ACL);
+  if (acl == attrs.end()) {
+    return 0;
+  }
+
+  bufferlist::const_iterator p = acl->second.begin();
+  try {
+    policy.decode(p);
+  } catch (buffer::error& err) {
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Look the bucket up through the SAL driver by tenant/name, capture its
+// info, mtime and attrs, then decode the ACL via finish_init().
+int RGWDataAccess::Bucket::init(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  int ret = sd->driver->get_bucket(dpp, nullptr, tenant, name, &bucket, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  bucket_info = bucket->get_info();
+  mtime = bucket->get_modification_time();
+  attrs = bucket->get_attrs();
+
+  return finish_init();
+}
+
+// Initialize from already-fetched bucket info and attrs (no lookup),
+// then decode the ACL via finish_init().
+int RGWDataAccess::Bucket::init(const RGWBucketInfo& _bucket_info,
+                               const map<string, bufferlist>& _attrs)
+{
+  bucket_info = _bucket_info;
+  attrs = _attrs;
+
+  return finish_init();
+}
+
+// Hand out an Object handle bound to this bucket and key. Object's ctor
+// is private; Bucket is a friend, so it is constructed directly here.
+// Always succeeds (returns 0).
+int RGWDataAccess::Bucket::get_object(const rgw_obj_key& key,
+                                     ObjectRef *obj) {
+  ObjectRef ref(new Object(sd, shared_from_this(), key));
+  *obj = std::move(ref);
+  return 0;
+}
+
+// Upload 'data' as this object through the driver's atomic writer,
+// optionally compressing, computing/propagating the ETag, and attaching
+// the ACL. May add RGW_ATTR_ETAG / RGW_ATTR_ACL to 'attrs'.
+// Returns 0 on success or a negative error from the writer pipeline.
+int RGWDataAccess::Object::put(bufferlist& data,
+                              map<string, bufferlist>& attrs,
+                               const DoutPrefixProvider *dpp,
+                               optional_yield y)
+{
+  rgw::sal::Driver* driver = sd->driver;
+  CephContext *cct = driver->ctx();
+
+  string tag;
+  append_rand_alpha(cct, tag, tag, 32);
+
+  RGWBucketInfo& bucket_info = bucket->bucket_info;
+
+  rgw::BlockingAioThrottle aio(driver->ctx()->_conf->rgw_put_obj_min_window_size);
+
+  // NOTE(review): the return value of get_bucket() is ignored here —
+  // confirm a failure cannot leave 'b' null before get_object()
+  std::unique_ptr<rgw::sal::Bucket> b;
+  driver->get_bucket(NULL, bucket_info, &b);
+  std::unique_ptr<rgw::sal::Object> obj = b->get_object(key);
+
+  auto& owner = bucket->policy.get_owner();
+
+  string req_id = driver->zone_unique_id(driver->get_new_req_id());
+
+  std::unique_ptr<rgw::sal::Writer> processor;
+  processor = driver->get_atomic_writer(dpp, y, std::move(obj),
+                                      owner.get_id(),
+                                      nullptr, olh_epoch, req_id);
+
+  int ret = processor->prepare(y);
+  if (ret < 0)
+    return ret;
+
+  // filter chain head; compression is inserted in front when configured
+  rgw::sal::DataProcessor *filter = processor.get();
+
+  CompressorRef plugin;
+  boost::optional<RGWPutObj_Compress> compressor;
+
+  const auto& compression_type = driver->get_compression_type(bucket_info.placement_rule);
+  if (compression_type != "none") {
+    plugin = Compressor::create(driver->ctx(), compression_type);
+    if (!plugin) {
+      // missing plugin is non-fatal: the object is stored uncompressed
+      ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
+        << compression_type << dendl;
+    } else {
+      compressor.emplace(driver->ctx(), plugin, filter);
+      filter = &*compressor;
+    }
+  }
+
+  off_t ofs = 0;
+  auto obj_size = data.length();
+
+  RGWMD5Etag etag_calc;
+
+  // feed the payload through the filter chain in rgw_max_chunk_size
+  // pieces, hashing each chunk for the ETag as we go; splice() consumes
+  // 'data', so the loop ends when it is drained
+  do {
+    size_t read_len = std::min(data.length(), (unsigned int)cct->_conf->rgw_max_chunk_size);
+
+    bufferlist bl;
+
+    data.splice(0, read_len, &bl);
+    etag_calc.update(bl);
+
+    ret = filter->process(std::move(bl), ofs);
+    if (ret < 0)
+      return ret;
+
+    ofs += read_len;
+  } while (data.length() > 0);
+
+  // empty chunk signals end-of-data to the processor chain
+  ret = filter->process({}, ofs);
+  if (ret < 0) {
+    return ret;
+  }
+  // a caller-supplied RGW_ATTR_ETAG wins over the computed one
+  bool has_etag_attr = false;
+  auto iter = attrs.find(RGW_ATTR_ETAG);
+  if (iter != attrs.end()) {
+    bufferlist& bl = iter->second;
+    etag = bl.to_str();
+    has_etag_attr = true;
+  }
+
+  // no explicit policy set: fall back to a canned private ACL owned by
+  // the bucket owner
+  if (!aclbl) {
+    RGWAccessControlPolicy_S3 policy(cct);
+
+    policy.create_canned(bucket->policy.get_owner(), bucket->policy.get_owner(), string()); /* default private policy */
+
+    policy.encode(aclbl.emplace());
+  }
+
+  if (etag.empty()) {
+    etag_calc.finish(&etag);
+  }
+
+  if (!has_etag_attr) {
+    bufferlist etagbl;
+    etagbl.append(etag);
+    attrs[RGW_ATTR_ETAG] = etagbl;
+  }
+  attrs[RGW_ATTR_ACL] = *aclbl;
+
+  string *puser_data = nullptr;
+  if (user_data) {
+    puser_data = &(*user_data);
+  }
+
+  return processor->complete(obj_size, etag,
+                           &mtime, mtime,
+                           attrs, delete_at,
+                            nullptr, nullptr,
+                            puser_data,
+                            nullptr, nullptr, y);
+}
+
+// Serialize the given ACL policy into aclbl; put() attaches it to the
+// object. Replaces any previously set policy blob.
+void RGWDataAccess::Object::set_policy(const RGWAccessControlPolicy& policy)
+{
+  auto& bl = aclbl.emplace();
+  policy.encode(bl);
+}
+
+// Complete an AioCompletion with result r by invoking librados'
+// complete-and-safe callback on its internal completion object.
+void rgw_complete_aio_completion(librados::AioCompletion* c, int r) {
+  auto pc = c->pc;
+  librados::CB_AioCompleteAndSafe cb(pc);
+  cb(r);
+}
diff --git a/src/rgw/driver/rados/rgw_tools.h b/src/rgw/driver/rados/rgw_tools.h
new file mode 100644 (file)
index 0000000..6aeb9b8
--- /dev/null
@@ -0,0 +1,277 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_TOOLS_H
+#define CEPH_RGW_TOOLS_H
+
+#include <string>
+
+#include "include/types.h"
+#include "include/ceph_hash.h"
+
+#include "common/ceph_time.h"
+
+#include "rgw_common.h"
+#include "rgw_sal_fwd.h"
+
+class RGWSI_SysObj;
+
+class RGWRados;
+struct RGWObjVersionTracker;
+class optional_yield;
+
+struct obj_version;
+
+
+int rgw_init_ioctx(const DoutPrefixProvider *dpp,
+                   librados::Rados *rados, const rgw_pool& pool,
+                   librados::IoCtx& ioctx,
+                  bool create = false,
+                  bool mostly_omap = false);
+
+#define RGW_NO_SHARD -1
+
+#define RGW_SHARDS_PRIME_0 7877
+#define RGW_SHARDS_PRIME_1 65521
+
+extern const std::string MP_META_SUFFIX;
+
+// Upper bound on shard counts anywhere in rgw (the larger hashing prime).
+inline int rgw_shards_max()
+{
+  return RGW_SHARDS_PRIME_1;
+}
+
+// only called by rgw_shard_id and rgw_bucket_shard_index
+static inline int rgw_shards_mod(unsigned hval, int max_shards)
+{
+  // fold hval through the smallest prime that still exceeds max_shards,
+  // so shard assignments stay stable as max_shards grows within a tier
+  const unsigned prime = (max_shards <= RGW_SHARDS_PRIME_0)
+      ? RGW_SHARDS_PRIME_0
+      : RGW_SHARDS_PRIME_1;
+  return hval % prime % max_shards;
+}
+
+// used for logging and tagging
+// used for logging and tagging
+// Map key to a shard in [0, max_shards) via the linux string hash.
+inline int rgw_shard_id(const std::string& key, int max_shards)
+{
+  return rgw_shards_mod(ceph_str_hash_linux(key.c_str(), key.size()),
+                       max_shards);
+}
+
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& key, std::string& name, int *shard_id);
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name);
+void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name);
+
+int rgw_put_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+                       const rgw_pool& pool, const std::string& oid,
+                       bufferlist& data, bool exclusive,
+                       RGWObjVersionTracker *objv_tracker,
+                       real_time set_mtime, optional_yield y,
+                       std::map<std::string, bufferlist> *pattrs = nullptr);
+int rgw_get_system_obj(RGWSI_SysObj* svc_sysobj, const rgw_pool& pool,
+                       const std::string& key, bufferlist& bl,
+                       RGWObjVersionTracker *objv_tracker, real_time *pmtime,
+                       optional_yield y, const DoutPrefixProvider *dpp,
+                       std::map<std::string, bufferlist> *pattrs = nullptr,
+                       rgw_cache_entry_info *cache_info = nullptr,
+                      boost::optional<obj_version> refresh_version = boost::none,
+                       bool raw_attrs=false);
+int rgw_delete_system_obj(const DoutPrefixProvider *dpp, 
+                          RGWSI_SysObj *sysobj_svc, const rgw_pool& pool, const std::string& oid,
+                          RGWObjVersionTracker *objv_tracker, optional_yield y);
+int rgw_stat_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+                        const rgw_pool& pool, const std::string& key,
+                        RGWObjVersionTracker *objv_tracker,
+                        real_time *pmtime, optional_yield y,
+                        std::map<std::string, bufferlist> *pattrs = nullptr);
+
+const char *rgw_find_mime_by_ext(std::string& ext);
+
+void rgw_filter_attrset(std::map<std::string, bufferlist>& unfiltered_attrset, const std::string& check_prefix,
+                        std::map<std::string, bufferlist> *attrset);
+
+/// indicates whether the current thread is in boost::asio::io_context::run(),
+/// used to log warnings if synchronous librados calls are made
+extern thread_local bool is_asio_thread;
+
+/// perform the rados operation, using the yield context when given
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                      librados::ObjectReadOperation *op, bufferlist* pbl,
+                      optional_yield y, int flags = 0);
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                      librados::ObjectWriteOperation *op, optional_yield y,
+                     int flags = 0);
+int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                     bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl,
+                     optional_yield y);
+
+int rgw_tools_init(const DoutPrefixProvider *dpp, CephContext *cct);
+void rgw_tools_cleanup();
+
+// Incremental ETag builder over hash H with digest size S bytes.
+// finish() renders the digest as a lowercase hex string.
+template<class H, size_t S>
+class RGWEtag
+{
+  H hash;
+
+public:
+  RGWEtag() {
+    if constexpr (std::is_same_v<H, MD5>) {
+      // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+      hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+    }
+  }
+
+  // Fold len bytes of buf into the running digest.
+  void update(const char *buf, size_t len) {
+    hash.Update((const unsigned char *)buf, len);
+  }
+
+  void update(bufferlist& bl) {
+    if (bl.length() > 0) {
+      update(bl.c_str(), bl.length());
+    }
+  }
+
+  void update(const std::string& s) {
+    if (!s.empty()) {
+      update(s.c_str(), s.size());
+    }
+  }
+  // Finalize the digest and write its hex form into *etag.
+  void finish(std::string *etag) {
+    char etag_buf[S];
+    char etag_buf_str[S * 2 + 16];
+
+    hash.Final((unsigned char *)etag_buf);
+    buf_to_hex((const unsigned char *)etag_buf, S,
+              etag_buf_str);
+
+    *etag = etag_buf_str;
+  }
+};
+
+using RGWMD5Etag = RGWEtag<MD5, CEPH_CRYPTO_MD5_DIGESTSIZE>;
+
+// Convenience facade over a SAL driver for bucket lookup and simple
+// object puts (used e.g. by tooling). Buckets and Objects are handed out
+// as shared_ptrs created through the get_bucket/get_object factories.
+class RGWDataAccess
+{
+  rgw::sal::Driver* driver;
+
+public:
+  RGWDataAccess(rgw::sal::Driver* _driver);
+
+  class Object;
+  class Bucket;
+
+  using BucketRef = std::shared_ptr<Bucket>;
+  using ObjectRef = std::shared_ptr<Object>;
+
+  class Bucket : public std::enable_shared_from_this<Bucket> {
+    friend class RGWDataAccess;
+    friend class Object;
+
+    RGWDataAccess *sd{nullptr};
+    RGWBucketInfo bucket_info;
+    std::string tenant;
+    std::string name;
+    std::string bucket_id;
+    ceph::real_time mtime;
+    std::map<std::string, bufferlist> attrs;
+
+    RGWAccessControlPolicy policy;
+    // decode the ACL policy out of attrs; shared tail of both init()s
+    int finish_init();
+    
+    // ctors are private: instances come from RGWDataAccess::get_bucket()
+    Bucket(RGWDataAccess *_sd,
+          const std::string& _tenant,
+          const std::string& _name,
+          const std::string& _bucket_id) : sd(_sd),
+                                       tenant(_tenant),
+                                       name(_name),
+                                      bucket_id(_bucket_id) {}
+    Bucket(RGWDataAccess *_sd) : sd(_sd) {}
+    int init(const DoutPrefixProvider *dpp, optional_yield y);
+    int init(const RGWBucketInfo& _bucket_info, const std::map<std::string, bufferlist>& _attrs);
+  public:
+    int get_object(const rgw_obj_key& key,
+                  ObjectRef *obj);
+
+  };
+
+
+  class Object {
+    RGWDataAccess *sd{nullptr};
+    BucketRef bucket;
+    rgw_obj_key key;
+
+    ceph::real_time mtime;
+    std::string etag;
+    uint64_t olh_epoch{0};
+    ceph::real_time delete_at;
+    std::optional<std::string> user_data;
+
+    // serialized ACL; populated by set_policy() or defaulted in put()
+    std::optional<bufferlist> aclbl;
+
+    // private: instances come from Bucket::get_object()
+    Object(RGWDataAccess *_sd,
+           BucketRef&& _bucket,
+           const rgw_obj_key& _key) : sd(_sd),
+                                      bucket(_bucket),
+                                      key(_key) {}
+  public:
+    int put(bufferlist& data, std::map<std::string, bufferlist>& attrs, const DoutPrefixProvider *dpp, optional_yield y); /* might modify attrs */
+
+    void set_mtime(const ceph::real_time& _mtime) {
+      mtime = _mtime;
+    }
+
+    void set_etag(const std::string& _etag) {
+      etag = _etag;
+    }
+
+    void set_olh_epoch(uint64_t epoch) {
+      olh_epoch = epoch;
+    }
+
+    void set_delete_at(ceph::real_time _delete_at) {
+      delete_at = _delete_at;
+    }
+
+    void set_user_data(const std::string& _user_data) {
+      user_data = _user_data;
+    }
+
+    void set_policy(const RGWAccessControlPolicy& policy);
+
+    friend class Bucket;
+  };
+
+  // Construct a Bucket handle and initialize it by lookup.
+  int get_bucket(const DoutPrefixProvider *dpp, 
+                 const std::string& tenant,
+                const std::string name,
+                const std::string bucket_id,
+                BucketRef *bucket,
+                optional_yield y) {
+    bucket->reset(new Bucket(this, tenant, name, bucket_id));
+    return (*bucket)->init(dpp, y);
+  }
+
+  // Construct a Bucket handle from already-fetched info/attrs (no lookup).
+  int get_bucket(const RGWBucketInfo& bucket_info,
+                const std::map<std::string, bufferlist>& attrs,
+                BucketRef *bucket) {
+    bucket->reset(new Bucket(this));
+    return (*bucket)->init(bucket_info, attrs);
+  }
+  friend class Bucket;
+  friend class Object;
+};
+
+using RGWDataAccessRef = std::shared_ptr<RGWDataAccess>;
+
+/// Complete an AioCompletion. To return error values or otherwise
+/// satisfy the caller. Useful for making complicated asynchronous
+/// calls and error handling.
+void rgw_complete_aio_completion(librados::AioCompletion* c, int r);
+
+/// This returns a static, non-NULL pointer, recognized only by
+/// rgw_put_system_obj(). When supplied instead of the attributes, the
+/// attributes will be unmodified.
+///
+// (Currently providing nullptr will wipe all attributes.)
+
+std::map<std::string, ceph::buffer::list>* no_change_attrs();
+#endif
diff --git a/src/rgw/driver/rados/rgw_trim_bilog.cc b/src/rgw/driver/rados/rgw_trim_bilog.cc
new file mode 100644 (file)
index 0000000..6ddda5d
--- /dev/null
@@ -0,0 +1,1445 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * Author: Casey Bodley <cbodley@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#include <mutex>
+#include <boost/circular_buffer.hpp>
+#include <boost/container/flat_map.hpp>
+
+#include "include/scope_guard.h"
+#include "common/bounded_key_counter.h"
+#include "common/errno.h"
+#include "rgw_trim_bilog.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_cr_tools.h"
+#include "rgw_data_sync.h"
+#include "rgw_metadata.h"
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_sync.h"
+#include "rgw_bucket.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_meta.h"
+#include "services/svc_bilog_rados.h"
+
+#include <boost/asio/yield.hpp>
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "trim: ")
+
+using namespace std;
+
+using rgw::BucketTrimConfig;
+using BucketChangeCounter = BoundedKeyCounter<std::string, int>;
+
+const std::string rgw::BucketTrimStatus::oid = "bilog.trim";
+using rgw::BucketTrimStatus;
+
+
+// watch/notify api for gateways to coordinate about which buckets to trim
+enum TrimNotifyType {
+  NotifyTrimCounters = 0,
+  NotifyTrimComplete,
+};
+WRITE_RAW_ENCODER(TrimNotifyType);
+
+// Dispatch target for one TrimNotifyType; implementations decode their
+// request from 'input' and encode a reply into 'output'.
+struct TrimNotifyHandler {
+  virtual ~TrimNotifyHandler() = default;
+
+  virtual void handle(bufferlist::const_iterator& input, bufferlist& output) = 0;
+};
+
+/// api to share the bucket trim counters between gateways in the same zone.
+/// each gateway will process different datalog shards, so the gateway that runs
+/// the trim process needs to accumulate their counters
+struct TrimCounters {
+  /// counter for a single bucket
+  struct BucketCounter {
+    std::string bucket; //< bucket instance metadata key
+    int count{0};
+
+    BucketCounter() = default;
+    BucketCounter(const std::string& bucket, int count)
+      : bucket(bucket), count(count) {}
+
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& p);
+  };
+  using Vector = std::vector<BucketCounter>;
+
+  /// request bucket trim counters from peer gateways
+  struct Request {
+    uint16_t max_buckets; //< maximum number of bucket counters to return
+
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& p);
+  };
+
+  /// return the current bucket trim counters
+  struct Response {
+    Vector bucket_counters;
+
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& p);
+  };
+
+  /// server interface to query the hottest buckets
+  struct Server {
+    virtual ~Server() = default;
+
+    virtual void get_bucket_counters(int count, Vector& counters) = 0;
+    virtual void reset_bucket_counters() = 0;
+  };
+
+  /// notify handler: answers NotifyTrimCounters requests using 'server'
+  class Handler : public TrimNotifyHandler {
+    Server *const server;
+   public:
+    explicit Handler(Server *server) : server(server) {}
+
+    void handle(bufferlist::const_iterator& input, bufferlist& output) override;
+  };
+};
+// Debug formatting: "<bucket>:<count>".
+std::ostream& operator<<(std::ostream& out, const TrimCounters::BucketCounter& rhs)
+{
+  out << rhs.bucket << ":" << rhs.count;
+  return out;
+}
+
+void TrimCounters::BucketCounter::encode(bufferlist& bl) const
+{
+  using ceph::encode;
+  // no versioning to save space
+  encode(bucket, bl);
+  encode(count, bl);
+}
+void TrimCounters::BucketCounter::decode(bufferlist::const_iterator& p)
+{
+  using ceph::decode;
+  decode(bucket, p);
+  decode(count, p);
+}
+WRITE_CLASS_ENCODER(TrimCounters::BucketCounter);
+
+// Request/Response use the standard versioned encoding envelope.
+void TrimCounters::Request::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(max_buckets, bl);
+  ENCODE_FINISH(bl);
+}
+void TrimCounters::Request::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  decode(max_buckets, p);
+  DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimCounters::Request);
+
+void TrimCounters::Response::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(bucket_counters, bl);
+  ENCODE_FINISH(bl);
+}
+void TrimCounters::Response::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  decode(bucket_counters, p);
+  DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimCounters::Response);
+
+// Answer a NotifyTrimCounters request: return up to min(max_buckets, 128)
+// of the hottest bucket counters from the server.
+void TrimCounters::Handler::handle(bufferlist::const_iterator& input,
+                                   bufferlist& output)
+{
+  Request request;
+  decode(request, input);
+  // cap the reply size regardless of what the peer asked for
+  auto count = std::min<uint16_t>(request.max_buckets, 128);
+
+  Response response;
+  server->get_bucket_counters(count, response.bucket_counters);
+  encode(response, output);
+}
+
+/// api to notify peer gateways that trim has completed and their bucket change
+/// counters can be reset
+struct TrimComplete {
+  // empty payloads: the notification itself carries all the meaning
+  struct Request {
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& p);
+  };
+  struct Response {
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& p);
+  };
+
+  /// server interface to reset bucket counters
+  using Server = TrimCounters::Server;
+
+  /// notify handler: resets the server's counters on NotifyTrimComplete
+  class Handler : public TrimNotifyHandler {
+    Server *const server;
+   public:
+    explicit Handler(Server *server) : server(server) {}
+
+    void handle(bufferlist::const_iterator& input, bufferlist& output) override;
+  };
+};
+
+// Empty versioned envelopes — kept so the wire format can grow fields
+// compatibly later.
+void TrimComplete::Request::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ENCODE_FINISH(bl);
+}
+void TrimComplete::Request::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimComplete::Request);
+
+void TrimComplete::Response::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ENCODE_FINISH(bl);
+}
+void TrimComplete::Response::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimComplete::Response);
+
+// Reset the local bucket counters and acknowledge with an empty response.
+void TrimComplete::Handler::handle(bufferlist::const_iterator& input,
+                                   bufferlist& output)
+{
+  Request request;
+  decode(request, input);
+
+  server->reset_bucket_counters();
+
+  Response response;
+  encode(response, output);
+}
+
+
+/// rados watcher for bucket trim notifications
+class BucketTrimWatcher : public librados::WatchCtx2 {
+  rgw::sal::RadosStore* const store;
+  const rgw_raw_obj& obj;
+  rgw_rados_ref ref;
+  uint64_t handle{0};
+
+  using HandlerPtr = std::unique_ptr<TrimNotifyHandler>;
+  boost::container::flat_map<TrimNotifyType, HandlerPtr> handlers;
+
+ public:
+  BucketTrimWatcher(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+                    TrimCounters::Server *counters)
+    : store(store), obj(obj) {
+    handlers.emplace(NotifyTrimCounters, new TrimCounters::Handler(counters));
+    handlers.emplace(NotifyTrimComplete, new TrimComplete::Handler(counters));
+  }
+
+  ~BucketTrimWatcher() {
+    stop();
+  }
+
+  int start(const DoutPrefixProvider *dpp) {
+    int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+    if (r < 0) {
+      return r;
+    }
+
+    // register a watch on the realm's control object
+    r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
+    if (r == -ENOENT) {
+      constexpr bool exclusive = true;
+      r = ref.pool.ioctx().create(ref.obj.oid, exclusive);
+      if (r == -EEXIST || r == 0) {
+        r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
+      }
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << "Failed to watch " << ref.obj
+          << " with " << cpp_strerror(-r) << dendl;
+      ref.pool.ioctx().close();
+      return r;
+    }
+
+    ldpp_dout(dpp, 10) << "Watching " << ref.obj.oid << dendl;
+    return 0;
+  }
+
+  int restart() {
+    int r = ref.pool.ioctx().unwatch2(handle);
+    if (r < 0) {
+      lderr(store->ctx()) << "Failed to unwatch on " << ref.obj
+          << " with " << cpp_strerror(-r) << dendl;
+    }
+    r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
+    if (r < 0) {
+      lderr(store->ctx()) << "Failed to restart watch on " << ref.obj
+          << " with " << cpp_strerror(-r) << dendl;
+      ref.pool.ioctx().close();
+    }
+    return r;
+  }
+
+  void stop() {
+    if (handle) {
+      ref.pool.ioctx().unwatch2(handle);
+      ref.pool.ioctx().close();
+    }
+  }
+
+  /// respond to bucket trim notifications
+  void handle_notify(uint64_t notify_id, uint64_t cookie,
+                     uint64_t notifier_id, bufferlist& bl) override {
+    if (cookie != handle) {
+      return;
+    }
+    bufferlist reply;
+    try {
+      auto p = bl.cbegin();
+      TrimNotifyType type;
+      decode(type, p);
+
+      auto handler = handlers.find(type);
+      if (handler != handlers.end()) {
+        handler->second->handle(p, reply);
+      } else {
+        lderr(store->ctx()) << "no handler for notify type " << type << dendl;
+      }
+    } catch (const buffer::error& e) {
+      lderr(store->ctx()) << "Failed to decode notification: " << e.what() << dendl;
+    }
+    ref.pool.ioctx().notify_ack(ref.obj.oid, notify_id, cookie, reply);
+  }
+
+  /// reestablish the watch if it gets disconnected
+  void handle_error(uint64_t cookie, int err) override {
+    if (cookie != handle) {
+      return;
+    }
+    if (err == -ENOTCONN) {
+      ldout(store->ctx(), 4) << "Disconnected watch on " << ref.obj << dendl;
+      restart();
+    }
+  }
+};
+
+
+/// Interface to communicate with the trim manager about completed operations
+struct BucketTrimObserver {
+  virtual ~BucketTrimObserver() = default;
+
+  /// called when a bucket instance's bilog trim finished successfully
+  virtual void on_bucket_trimmed(std::string&& bucket_instance) = 0;
+  /// returns true if this bucket instance was trimmed recently enough
+  /// that it can be skipped this round
+  virtual bool trimmed_recently(const std::string_view& bucket_instance) = 0;
+};
+
+/// trim each bilog shard to the given marker, while limiting the number of
+/// concurrent requests
+class BucketTrimShardCollectCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* const store;
+  const RGWBucketInfo& bucket_info;
+  rgw::bucket_index_layout_generation generation; //< index layout being trimmed
+  const std::vector<std::string>& markers; //< shard markers to trim
+  size_t i{0}; //< index of current shard marker
+
+  /// collect child results; a missing shard object is not an error
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to trim bilog shard: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  BucketTrimShardCollectCR(const DoutPrefixProvider *dpp,
+                           rgw::sal::RadosStore* store, const RGWBucketInfo& bucket_info,
+                          const rgw::bucket_index_layout_generation& generation,
+                           const std::vector<std::string>& markers)
+    : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS),
+      dpp(dpp), store(store), bucket_info(bucket_info),
+      generation(generation), markers(markers)
+  {}
+  bool spawn_next() override;
+};
+
+bool BucketTrimShardCollectCR::spawn_next()
+{
+  while (i < markers.size()) {
+    const auto& marker = markers[i];
+    const auto shard_id = i++;
+
+    // skip empty markers
+    if (!marker.empty()) {
+      ldpp_dout(dpp, 10) << "trimming bilog shard " << shard_id
+          << " of " << bucket_info.bucket << " at marker " << marker << dendl;
+      spawn(new RGWRadosBILogTrimCR(dpp, store, bucket_info, shard_id,
+                                    generation, std::string{}, marker),
+            false);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Delete a BI generation, limiting the number of requests in flight.
+class BucketCleanIndexCollectCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* const store;
+  const RGWBucketInfo& bucket_info;
+  rgw::bucket_index_layout_generation index; //< index generation to delete
+  uint32_t shard = 0; //< next shard to remove
+  const uint32_t num_shards = rgw::num_shards(index);
+
+  /// collect child results; a shard object already gone is not an error
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "clean index: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  BucketCleanIndexCollectCR(const DoutPrefixProvider *dpp,
+                           rgw::sal::RadosStore* store,
+                           const RGWBucketInfo& bucket_info,
+                           rgw::bucket_index_layout_generation index)
+    : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS),
+      dpp(dpp), store(store), bucket_info(bucket_info),
+      index(index)
+  {}
+  /// spawn removal of the next index shard object
+  bool spawn_next() override {
+    if (shard < num_shards) {
+      RGWRados::BucketShard bs(store->getRados());
+      // NOTE(review): the return value of bs.init() is ignored here —
+      // presumably a failure would surface via the remove CR; confirm.
+      bs.init(dpp, bucket_info, index, shard);
+      spawn(new RGWRadosRemoveOidCR(store, std::move(bs.bucket_obj), nullptr),
+           false);
+      ++shard;
+      return true;
+    } else {
+      return false;
+    }
+  }
+};
+
+
+/// trim the bilog of all of the given bucket instance's shards
+class BucketTrimInstanceCR : public RGWCoroutine {
+  static constexpr auto MAX_RETRIES = 25u;
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  BucketTrimObserver *const observer;
+  std::string bucket_instance;
+  rgw_bucket_get_sync_policy_params get_policy_params;
+  std::shared_ptr<rgw_bucket_get_sync_policy_result> source_policy;
+  rgw_bucket bucket;
+  const std::string& zone_id; //< my zone id
+  RGWBucketInfo _bucket_info;
+  const RGWBucketInfo *pbucket_info; //< pointer to bucket instance info to locate bucket indices
+  int child_ret = 0;
+  const DoutPrefixProvider *dpp;
+public:
+  /// one peer's reply: the log generation it is syncing plus its
+  /// per-shard incremental sync status
+  struct StatusShards {
+    uint64_t generation = 0;
+    std::vector<rgw_bucket_shard_sync_info> shards;
+  };
+private:
+  std::vector<StatusShards> peer_status; //< sync status for each peer
+  std::vector<std::string> min_markers; //< min marker per shard
+
+  /// The log generation to trim
+  rgw::bucket_log_layout_generation totrim;
+
+  /// Generation to be cleaned/New bucket info (if any)
+  std::optional<std::pair<RGWBucketInfo,
+                         rgw::bucket_log_layout_generation>> clean_info;
+  /// Maximum number of times to attempt to put bucket info
+  unsigned retries = 0;
+
+  /// Pick the lowest log generation any peer is still consuming (or the
+  /// bucket's current generation when there are no peers) and store its
+  /// layout in 'totrim'. Returns -ENOENT if no layout matches it.
+  int take_min_generation() {
+    // Initialize the min_generation to the bucket's current
+    // generation, used in case we have no peers.
+    auto min_generation = pbucket_info->layout.logs.back().gen;
+
+    // Determine the minimum generation
+    if (auto m = std::min_element(peer_status.begin(),
+                                 peer_status.end(),
+                                 [](const StatusShards& l,
+                                    const StatusShards& r) {
+                                   return l.generation < r.generation;
+                                 }); m != peer_status.end()) {
+      min_generation = m->generation;
+    }
+
+    auto& logs = pbucket_info->layout.logs;
+    auto log = std::find_if(logs.begin(), logs.end(),
+                           rgw::matches_gen(min_generation));
+    if (log == logs.end()) {
+      ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << "ERROR: No log layout for min_generation="
+                       << min_generation << dendl;
+      return -ENOENT;
+    }
+
+    totrim = *log;
+    return 0;
+  }
+
+  /// If there is a generation below the minimum, prepare to clean it up.
+  /// On success, 'clean_info' holds a copy of the bucket info with the
+  /// oldest log generation erased, plus that generation's layout.
+  int maybe_remove_generation() {
+    if (clean_info)
+      return 0;
+
+
+    if (pbucket_info->layout.logs.front().gen < totrim.gen) {
+      clean_info = {*pbucket_info, {}};
+      auto log = clean_info->first.layout.logs.cbegin();
+      clean_info->second = *log;
+
+      // refuse to delete the last remaining generation; that would leave
+      // the bucket without any bilog layout
+      if (clean_info->first.layout.logs.size() == 1) {
+       ldpp_dout(dpp, -1)
+         << "Critical error! Attempt to remove only log generation! "
+         << "log.gen=" << log->gen << ", totrim.gen=" << totrim.gen
+         << dendl;
+       return -EIO;
+      }
+      clean_info->first.layout.logs.erase(log);
+    }
+    return 0;
+  }
+
+ public:
+  BucketTrimInstanceCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                       BucketTrimObserver *observer,
+                       const std::string& bucket_instance,
+                       const DoutPrefixProvider *dpp)
+    : RGWCoroutine(store->ctx()), store(store),
+      http(http), observer(observer),
+      bucket_instance(bucket_instance),
+      zone_id(store->svc()->zone->get_zone().id),
+      dpp(dpp) {
+    // split "bucket:instance" key into its rgw_bucket; shard id is unused
+    rgw_bucket_parse_bucket_key(cct, bucket_instance, &bucket, nullptr);
+    source_policy = make_shared<rgw_bucket_get_sync_policy_result>();
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+namespace {
+/// populate the status with the minimum stable marker of each shard
+/// (peers on a generation newer than min_generation are ignored; note that
+/// the markers are moved out of the peers' StatusShards entries)
+int take_min_status(
+  CephContext *cct,
+  const uint64_t min_generation,
+  std::vector<BucketTrimInstanceCR::StatusShards>::const_iterator first,
+  std::vector<BucketTrimInstanceCR::StatusShards>::const_iterator last,
+  std::vector<std::string> *status) {
+  for (auto peer = first; peer != last; ++peer) {
+    // Peers on later generations don't get a say in the matter
+    if (peer->generation > min_generation) {
+      continue;
+    }
+    if (peer->shards.size() != status->size()) {
+      // all peers must agree on the number of shards
+      return -EINVAL;
+    }
+
+    auto m = status->begin();
+    for (auto& shard : peer->shards) {
+      auto& marker = *m++;
+      // always take the first marker, or any later marker that's smaller
+      if (peer == first || marker > shard.inc_marker.position) {
+       marker = std::move(shard.inc_marker.position);
+      }
+    }
+  }
+  return 0;
+}
+}
+
+/// Parse a peer's bucket-index status reply. Prefers the v2 format
+/// (which carries the incremental generation); falls back to the legacy
+/// shard-list-only format, implying generation 0.
+template<>
+inline int parse_decode_json<BucketTrimInstanceCR::StatusShards>(
+  BucketTrimInstanceCR::StatusShards& s, bufferlist& bl)
+{
+  JSONParser p;
+  if (!p.parse(bl.c_str(), bl.length())) {
+    return -EINVAL;
+  }
+
+  try {
+    bilog_status_v2 v;
+    decode_json_obj(v, &p);
+    s.generation = v.sync_status.incremental_gen;
+    s.shards = std::move(v.inc_status);
+  } catch (JSONDecoder::err& e) {
+    try {
+      // Fall back if we're talking to an old node that can't give v2
+      // output.
+      s.generation = 0;
+      decode_json_obj(s.shards, &p);
+    } catch (JSONDecoder::err& e) {
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+
+/// Coroutine body: fetch the bucket's sync policy, query every syncing
+/// peer for its bilog sync status, then either delete a fully-consumed
+/// old log generation OR trim the shards of the minimum generation.
+/// (Stackless coroutine: statement order and the yield/retcode protocol
+/// are significant — locals do not survive across yields.)
+int BucketTrimInstanceCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    ldpp_dout(dpp, 4) << "starting trim on bucket=" << bucket_instance << dendl;
+
+    get_policy_params.zone = zone_id;
+    get_policy_params.bucket = bucket;
+    yield call(new RGWBucketGetSyncPolicyHandlerCR(store->svc()->rados->get_async_processor(),
+                                                   store,
+                                                   get_policy_params,
+                                                   source_policy,
+                                                   dpp));
+    if (retcode < 0) {
+      if (retcode != -ENOENT) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to fetch policy handler for bucket=" << bucket << dendl;
+      }
+
+      return set_cr_error(retcode);
+    }
+
+    if (auto& opt_bucket_info = source_policy->policy_handler->get_bucket_info();
+        opt_bucket_info) {
+      pbucket_info = &(*opt_bucket_info);
+    } else {
+      /* this shouldn't really happen */
+      return set_cr_error(-ENOENT);
+    }
+
+    if (pbucket_info->layout.logs.empty()) {
+      return set_cr_done(); // no bilogs to trim
+    }
+
+    // query peers for sync status
+    set_status("fetching sync status from relevant peers");
+    yield {
+      const auto& all_dests = source_policy->policy_handler->get_all_dests();
+
+      // collect the unique destination zone ids (all_dests is keyed by
+      // zone, so duplicates are adjacent)
+      vector<rgw_zone_id> zids;
+      rgw_zone_id last_zid;
+      for (auto& diter : all_dests) {
+        const auto& zid = diter.first;
+        if (zid == last_zid) {
+          continue;
+        }
+        last_zid = zid;
+        zids.push_back(zid);
+      }
+
+      peer_status.resize(zids.size());
+
+      auto& zone_conn_map = store->svc()->zone->get_zone_conn_map();
+
+      auto p = peer_status.begin();
+      for (auto& zid : zids) {
+        // query data sync status from each sync peer
+        rgw_http_param_pair params[] = {
+          { "type", "bucket-index" },
+          { "status", nullptr },
+          { "options", "merge" },
+          { "bucket", bucket_instance.c_str() }, /* equal to source-bucket when `options==merge` and source-bucket
+                                                    param is not provided */
+          { "source-zone", zone_id.c_str() },
+          { "version", "2" },
+          { nullptr, nullptr }
+        };
+
+        auto ziter = zone_conn_map.find(zid);
+        if (ziter == zone_conn_map.end()) {
+          ldpp_dout(dpp, 0) << "WARNING: no connection to zone " << zid << ", can't trim bucket: " << bucket << dendl;
+          return set_cr_error(-ECANCELED);
+        }
+
+       using StatusCR = RGWReadRESTResourceCR<StatusShards>;
+        spawn(new StatusCR(cct, ziter->second, http, "/admin/log/", params, &*p),
+              false);
+        ++p;
+      }
+    }
+    // wait for a response from each peer. all must respond to attempt trim
+    while (num_spawned()) {
+      yield wait_for_child();
+      collect(&child_ret, nullptr);
+      if (child_ret < 0) {
+        drain_all();
+        return set_cr_error(child_ret);
+      }
+    }
+
+    // Determine the minimum generation
+    retcode = take_min_generation();
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "failed to find minimum generation" << dendl;
+      return set_cr_error(retcode);
+    }
+    retcode = maybe_remove_generation();
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "error removing old generation from log: "
+                       << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+
+    if (clean_info) {
+      // a generation below the minimum exists: delete its index objects,
+      // then persist bucket info without that generation
+      if (clean_info->second.layout.type != rgw::BucketLogType::InIndex) {
+       ldpp_dout(dpp, 0) << "Unable to convert log of unknown type "
+                         << clean_info->second.layout.type
+                         << " to rgw::bucket_index_layout_generation " << dendl;
+       return set_cr_error(-EINVAL);
+      }
+
+      yield call(new BucketCleanIndexCollectCR(dpp, store, clean_info->first,
+                                              clean_info->second.layout.in_index));
+      if (retcode < 0) {
+       ldpp_dout(dpp, 0) << "failed to remove previous generation: "
+                         << cpp_strerror(retcode) << dendl;
+       return set_cr_error(retcode);
+      }
+      // retry loop for racing writers of the bucket instance info
+      while (clean_info && retries < MAX_RETRIES) {
+       yield call(new RGWPutBucketInstanceInfoCR(
+                    store->svc()->rados->get_async_processor(),
+                    store, clean_info->first, false, {},
+                    no_change_attrs(), dpp));
+
+       // Raced, try again.
+       if (retcode == -ECANCELED) {
+         yield call(new RGWGetBucketInstanceInfoCR(
+                      store->svc()->rados->get_async_processor(),
+                      store, clean_info->first.bucket,
+                      &(clean_info->first), nullptr, dpp));
+         if (retcode < 0) {
+           ldpp_dout(dpp, 0) << "failed to get bucket info: "
+                             << cpp_strerror(retcode) << dendl;
+           return set_cr_error(retcode);
+         }
+         // if the refreshed info still carries the generation we deleted,
+         // erase it again and retry the put
+         if (clean_info->first.layout.logs.front().gen ==
+             clean_info->second.gen) {
+           clean_info->first.layout.logs.erase(
+             clean_info->first.layout.logs.begin());
+           ++retries;
+           continue;
+         }
+         // Raced, but someone else did what we needed to.
+         retcode = 0;
+       }
+
+       if (retcode < 0) {
+         ldpp_dout(dpp, 0) << "failed to put bucket info: "
+                           << cpp_strerror(retcode) << dendl;
+         return set_cr_error(retcode);
+       }
+       clean_info = std::nullopt;
+      }
+    } else {
+      if (totrim.layout.type != rgw::BucketLogType::InIndex) {
+       ldpp_dout(dpp, 0) << "Unable to convert log of unknown type "
+                         << totrim.layout.type
+                         << " to rgw::bucket_index_layout_generation " << dendl;
+       return set_cr_error(-EINVAL);
+      }
+      // To avoid hammering the OSD too hard, either trim old
+      // generations OR trim the current one.
+
+      // determine the minimum marker for each shard
+
+      // initialize each shard with the maximum marker, which is only used when
+      // there are no peers syncing from us
+      min_markers.assign(std::max(1u, rgw::num_shards(totrim.layout.in_index)),
+                        RGWSyncLogTrimCR::max_marker);
+
+
+      retcode = take_min_status(cct, totrim.gen, peer_status.cbegin(),
+                               peer_status.cend(), &min_markers);
+      if (retcode < 0) {
+       ldpp_dout(dpp, 4) << "failed to correlate bucket sync status from peers" << dendl;
+       return set_cr_error(retcode);
+      }
+
+      // trim shards with a ShardCollectCR
+      ldpp_dout(dpp, 10) << "trimming bilogs for bucket=" << pbucket_info->bucket
+                        << " markers=" << min_markers << ", shards=" << min_markers.size() << dendl;
+      set_status("trimming bilog shards");
+      yield call(new BucketTrimShardCollectCR(dpp, store, *pbucket_info, totrim.layout.in_index,
+                                             min_markers));
+      // ENODATA just means there were no keys to trim
+      if (retcode == -ENODATA) {
+       retcode = 0;
+      }
+      if (retcode < 0) {
+       ldpp_dout(dpp, 4) << "failed to trim bilog shards: "
+                         << cpp_strerror(retcode) << dendl;
+       return set_cr_error(retcode);
+      }
+    }
+
+    observer->on_bucket_trimmed(std::move(bucket_instance));
+    return set_cr_done();
+  }
+  return 0;
+}
+
+/// trim each bucket instance while limiting the number of concurrent operations
+
+class BucketTrimInstanceCollectCR : public RGWShardCollectCR {
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  BucketTrimObserver *const observer;
+  std::vector<std::string>::const_iterator bucket; //< next instance to trim
+  std::vector<std::string>::const_iterator end;
+  const DoutPrefixProvider *dpp;
+
+  /// collect child results; a vanished bucket instance is not an error
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to trim bucket instance: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  BucketTrimInstanceCollectCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                              BucketTrimObserver *observer,
+                              const std::vector<std::string>& buckets,
+                              int max_concurrent,
+                              const DoutPrefixProvider *dpp)
+    : RGWShardCollectCR(store->ctx(), max_concurrent),
+      store(store), http(http), observer(observer),
+      bucket(buckets.begin()), end(buckets.end()),
+      dpp(dpp)
+  {}
+  bool spawn_next() override;
+};
+
+/// Launch a BucketTrimInstanceCR for the next bucket instance, if any
+/// remain; returns false when the list is exhausted.
+bool BucketTrimInstanceCollectCR::spawn_next()
+{
+  if (bucket != end) {
+    spawn(new BucketTrimInstanceCR(store, http, observer, *bucket, dpp), false);
+    ++bucket;
+    return true;
+  }
+  return false;
+}
+
+/// correlate the replies from each peer gateway into the given counter
+int accumulate_peer_counters(bufferlist& bl, BucketChangeCounter& counter)
+{
+  counter.clear();
+
+  try {
+    // decode notify responses
+    auto p = bl.cbegin();
+    std::map<std::pair<uint64_t, uint64_t>, bufferlist> replies;
+    std::set<std::pair<uint64_t, uint64_t>> timeouts;
+    decode(replies, p);
+    decode(timeouts, p);
+
+    for (auto& peer : replies) {
+      auto q = peer.second.cbegin();
+      TrimCounters::Response response;
+      decode(response, q);
+      for (const auto& b : response.bucket_counters) {
+        counter.insert(b.bucket, b.count);
+      }
+    }
+  } catch (const buffer::error& e) {
+    return -EIO;
+  }
+  return 0;
+}
+
+/// metadata callback has the signature bool(string&& key, string&& marker);
+/// returning false stops the listing early
+using MetadataListCallback = std::function<bool(std::string&&, std::string&&)>;
+
+/// lists metadata keys, passing each to a callback until it returns false.
+/// on reaching the end, it will restart at the beginning and list up to the
+/// initial marker
+class AsyncMetadataList : public RGWAsyncRadosRequest {
+  CephContext *const cct;
+  RGWMetadataManager *const mgr;
+  const std::string section;      //< metadata section to list
+  const std::string start_marker; //< where the (wrap-around) listing starts
+  MetadataListCallback callback;  //< invoked per key; false stops listing
+
+  int _send_request(const DoutPrefixProvider *dpp) override;
+ public:
+  AsyncMetadataList(CephContext *cct, RGWCoroutine *caller,
+                    RGWAioCompletionNotifier *cn, RGWMetadataManager *mgr,
+                    const std::string& section, const std::string& start_marker,
+                    const MetadataListCallback& callback)
+    : RGWAsyncRadosRequest(caller, cn), cct(cct), mgr(mgr),
+      section(section), start_marker(start_marker), callback(callback)
+  {}
+};
+
+/// List metadata keys starting at 'start_marker', invoking the callback
+/// for each key until it returns false. If the end is reached, wrap
+/// around to the beginning and continue up to the original marker, so
+/// every key is visited at most once regardless of the starting point.
+int AsyncMetadataList::_send_request(const DoutPrefixProvider *dpp)
+{
+  void* handle = nullptr;
+  std::list<std::string> keys;
+  bool truncated{false};
+  std::string marker;
+
+  // start a listing at the given marker
+  int r = mgr->list_keys_init(dpp, section, start_marker, &handle);
+  if (r == -EINVAL) {
+    // restart with empty marker below
+  } else if (r < 0) {
+    ldpp_dout(dpp, 10) << "failed to init metadata listing: "
+        << cpp_strerror(r) << dendl;
+    return r;
+  } else {
+    ldpp_dout(dpp, 20) << "starting metadata listing at " << start_marker << dendl;
+
+    // release the handle when scope exits
+    auto g = make_scope_guard([=, this] { mgr->list_keys_complete(handle); });
+
+    do {
+      // get the next key and marker (one key at a time so the marker
+      // passed to the callback corresponds exactly to that key)
+      r = mgr->list_keys_next(dpp, handle, 1, keys, &truncated);
+      if (r < 0) {
+        ldpp_dout(dpp, 10) << "failed to list metadata: "
+            << cpp_strerror(r) << dendl;
+        return r;
+      }
+      marker = mgr->get_marker(handle);
+
+      if (!keys.empty()) {
+        ceph_assert(keys.size() == 1);
+        auto& key = keys.front();
+        // callback returning false means the caller has enough keys
+        if (!callback(std::move(key), std::move(marker))) {
+          return 0;
+        }
+      }
+    } while (truncated);
+
+    if (start_marker.empty()) {
+      // already listed all keys
+      return 0;
+    }
+  }
+
+  // restart the listing from the beginning (empty marker)
+  handle = nullptr;
+
+  r = mgr->list_keys_init(dpp, section, "", &handle);
+  if (r < 0) {
+    ldpp_dout(dpp, 10) << "failed to restart metadata listing: "
+        << cpp_strerror(r) << dendl;
+    return r;
+  }
+  ldpp_dout(dpp, 20) << "restarting metadata listing" << dendl;
+
+  // release the handle when scope exits
+  auto g = make_scope_guard([=, this] { mgr->list_keys_complete(handle); });
+  do {
+    // get the next key and marker
+    r = mgr->list_keys_next(dpp, handle, 1, keys, &truncated);
+    if (r < 0) {
+      ldpp_dout(dpp, 10) << "failed to list metadata: "
+          << cpp_strerror(r) << dendl;
+      return r;
+    }
+    marker = mgr->get_marker(handle);
+
+    if (!keys.empty()) {
+      ceph_assert(keys.size() == 1);
+      auto& key = keys.front();
+      // stop at original marker
+      if (marker > start_marker) {
+        return 0;
+      }
+      if (!callback(std::move(key), std::move(marker))) {
+        return 0;
+      }
+    }
+  } while (truncated);
+
+  return 0;
+}
+
+/// coroutine wrapper for AsyncMetadataList
+class MetadataListCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *const async_rados;
+  RGWMetadataManager *const mgr;
+  const std::string& section;
+  const std::string& start_marker;
+  MetadataListCallback callback;
+  RGWAsyncRadosRequest *req{nullptr}; //< in-flight async request, if any
+ public:
+  MetadataListCR(CephContext *cct, RGWAsyncRadosProcessor *async_rados,
+                 RGWMetadataManager *mgr, const std::string& section,
+                 const std::string& start_marker,
+                 const MetadataListCallback& callback)
+    : RGWSimpleCoroutine(cct), async_rados(async_rados), mgr(mgr),
+      section(section), start_marker(start_marker), callback(callback)
+  {}
+  ~MetadataListCR() override {
+    request_cleanup();
+  }
+
+  /// queue the async listing on the rados processor
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new AsyncMetadataList(cct, this, stack->create_completion_notifier(),
+                                mgr, section, start_marker, callback);
+    async_rados->queue(req);
+    return 0;
+  }
+  /// propagate the async request's final status
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = nullptr;
+    }
+  }
+};
+
+/// One full trim pass: select "hot" buckets via watch/notify counters plus
+/// "cold" buckets from the metadata listing, trim them, persist the updated
+/// marker, and notify peers of completion.
+class BucketTrimCR : public RGWCoroutine {
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  const BucketTrimConfig& config;
+  BucketTrimObserver *const observer;
+  const rgw_raw_obj& obj; //< trim status/notify object
+  ceph::mono_time start_time;
+  bufferlist notify_replies;
+  BucketChangeCounter counter;
+  std::vector<std::string> buckets; //< buckets selected for trim
+  BucketTrimStatus status;
+  RGWObjVersionTracker objv; //< version tracker for trim status object
+  std::string last_cold_marker; //< position for next trim marker
+  const DoutPrefixProvider *dpp;
+
+  static const std::string section; //< metadata section for bucket instances
+ public:
+  BucketTrimCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+               const BucketTrimConfig& config, BucketTrimObserver *observer,
+               const rgw_raw_obj& obj, const DoutPrefixProvider *dpp)
+    : RGWCoroutine(store->ctx()), store(store), http(http), config(config),
+      observer(observer), obj(obj), counter(config.counter_size), dpp(dpp)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+const std::string BucketTrimCR::section{"bucket.instance"};
+
+/// Coroutine body for one trim pass. (Stackless coroutine: locals do not
+/// survive across yields; state lives in the members declared above.)
+int BucketTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    start_time = ceph::mono_clock::now();
+
+    if (config.buckets_per_interval) {
+      // query watch/notify for hot buckets
+      ldpp_dout(dpp, 10) << "fetching active bucket counters" << dendl;
+      set_status("fetching active bucket counters");
+      yield {
+        // request the top bucket counters from each peer gateway
+        const TrimNotifyType type = NotifyTrimCounters;
+        TrimCounters::Request request{32};
+        bufferlist bl;
+        encode(type, bl);
+        encode(request, bl);
+        call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms,
+                                  &notify_replies));
+      }
+      if (retcode < 0) {
+        ldpp_dout(dpp, 10) << "failed to fetch peer bucket counters" << dendl;
+        return set_cr_error(retcode);
+      }
+
+      // select the hottest buckets for trim
+      retcode = accumulate_peer_counters(notify_replies, counter);
+      if (retcode < 0) {
+        ldout(cct, 4) << "failed to correlate peer bucket counters" << dendl;
+        return set_cr_error(retcode);
+      }
+      buckets.reserve(config.buckets_per_interval);
+
+      // leave room for at least min_cold_buckets_per_interval cold buckets
+      const int max_count = config.buckets_per_interval -
+                            config.min_cold_buckets_per_interval;
+      counter.get_highest(max_count,
+        [this] (const std::string& bucket, int count) {
+          buckets.push_back(bucket);
+        });
+    }
+
+    if (buckets.size() < config.buckets_per_interval) {
+      // read BucketTrimStatus for marker position
+      set_status("reading trim status");
+      using ReadStatus = RGWSimpleRadosReadCR<BucketTrimStatus>;
+      yield call(new ReadStatus(dpp, store->svc()->rados->get_async_processor(), store->svc()->sysobj, obj,
+                                &status, true, &objv));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 10) << "failed to read bilog trim status: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+      if (status.marker == "MAX") {
+        status.marker.clear(); // restart at the beginning
+      }
+      ldpp_dout(dpp, 10) << "listing cold buckets from marker="
+          << status.marker << dendl;
+
+      set_status("listing cold buckets for trim");
+      yield {
+        // capture a reference so 'this' remains valid in the callback
+        auto ref = boost::intrusive_ptr<RGWCoroutine>{this};
+        // list cold buckets to consider for trim
+        auto cb = [this, ref] (std::string&& bucket, std::string&& marker) {
+          // filter out keys that we trimmed recently
+          if (observer->trimmed_recently(bucket)) {
+            return true;
+          }
+          // filter out active buckets that we've already selected
+          auto i = std::find(buckets.begin(), buckets.end(), bucket);
+          if (i != buckets.end()) {
+            return true;
+          }
+          buckets.emplace_back(std::move(bucket));
+          // remember the last cold bucket spawned to update the status marker
+          last_cold_marker = std::move(marker);
+          // return true if there's room for more
+          return buckets.size() < config.buckets_per_interval;
+        };
+
+        call(new MetadataListCR(cct, store->svc()->rados->get_async_processor(),
+                                store->ctl()->meta.mgr,
+                                section, status.marker, cb));
+      }
+      if (retcode < 0) {
+        ldout(cct, 4) << "failed to list bucket instance metadata: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+    }
+
+    // trim bucket instances with limited concurrency
+    set_status("trimming buckets");
+    ldpp_dout(dpp, 4) << "collected " << buckets.size() << " buckets for trim" << dendl;
+    yield call(new BucketTrimInstanceCollectCR(store, http, observer, buckets,
+                                               config.concurrent_buckets, dpp));
+    // ignore errors from individual buckets
+
+    // write updated trim status
+    if (!last_cold_marker.empty() && status.marker != last_cold_marker) {
+      set_status("writing updated trim status");
+      status.marker = std::move(last_cold_marker);
+      ldpp_dout(dpp, 20) << "writing bucket trim marker=" << status.marker << dendl;
+      using WriteStatus = RGWSimpleRadosWriteCR<BucketTrimStatus>;
+      yield call(new WriteStatus(dpp, store->svc()->rados->get_async_processor(), store->svc()->sysobj, obj,
+                                 status, &objv));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 4) << "failed to write updated trim status: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+    }
+
+    // notify peers that trim completed
+    set_status("trim completed");
+    yield {
+      const TrimNotifyType type = NotifyTrimComplete;
+      TrimComplete::Request request;
+      bufferlist bl;
+      encode(type, bl);
+      encode(request, bl);
+      call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms,
+                                nullptr));
+    }
+    if (retcode < 0) {
+      ldout(cct, 10) << "failed to notify peers of trim completion" << dendl;
+      return set_cr_error(retcode);
+    }
+
+    ldpp_dout(dpp, 4) << "bucket index log processing completed in "
+        << ceph::mono_clock::now() - start_time << dendl;
+    return set_cr_done();
+  }
+  return 0;
+}
+
+/// Periodically acquire the trim lock and run a BucketTrimCR pass.
+class BucketTrimPollCR : public RGWCoroutine {
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  const BucketTrimConfig& config;
+  BucketTrimObserver *const observer;
+  const rgw_raw_obj& obj; //< object used for the trim lock and status
+  const std::string name{"trim"}; //< lock name
+  const std::string cookie; //< random cookie identifying our lock owner
+  const DoutPrefixProvider *dpp;
+
+ public:
+  BucketTrimPollCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                   const BucketTrimConfig& config,
+                   BucketTrimObserver *observer, const rgw_raw_obj& obj,
+                   const DoutPrefixProvider *dpp)
+    : RGWCoroutine(store->ctx()), store(store), http(http),
+      config(config), observer(observer), obj(obj),
+      cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
+      dpp(dpp) {}
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+/// Endless poll loop: sleep for the trim interval, take the trim lock,
+/// run one trim pass. On success the lock is deliberately NOT released,
+/// so no other gateway trims until the lock's interval-long duration
+/// expires; on failure it is unlocked so peers can try sooner.
+int BucketTrimPollCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    for (;;) {
+      set_status("sleeping");
+      wait(utime_t{static_cast<time_t>(config.trim_interval_sec), 0});
+
+      // prevent others from trimming for our entire wait interval
+      set_status("acquiring trim lock");
+      yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
+                                          obj, name, cookie,
+                                          config.trim_interval_sec));
+      if (retcode < 0) {
+        ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
+        continue;
+      }
+
+      set_status("trimming");
+      yield call(new BucketTrimCR(store, http, config, observer, obj, dpp));
+      if (retcode < 0) {
+        // on errors, unlock so other gateways can try
+        set_status("unlocking");
+        yield call(new RGWSimpleRadosUnlockCR(store->svc()->rados->get_async_processor(), store,
+                                              obj, name, cookie));
+      }
+    }
+  }
+  return 0;
+}
+
+/// tracks a bounded list of events with timestamps. old events can be expired,
+/// and recent events can be searched by key. expiration depends on events being
+/// inserted in temporal order
+template <typename T, typename Clock = ceph::coarse_mono_clock>
+class RecentEventList {
+ public:
+  using clock_type = Clock;
+  using time_point = typename clock_type::time_point;
+
+  RecentEventList(size_t max_size, const ceph::timespan& max_duration)
+    : events(max_size), max_duration(max_duration)
+  {}
+
+  /// insert an event at the given point in time. this time must be at least as
+  /// recent as the last inserted event. once the buffer reaches max_size, the
+  /// oldest event is overwritten (boost::circular_buffer semantics)
+  void insert(T&& value, const time_point& now) {
+    // ceph_assert(events.empty() || now >= events.back().time)
+    events.push_back(Event{std::move(value), now});
+  }
+
+  /// performs a linear search for an event matching the given key, whose type
+  /// U can be any that provides operator==(U, T)
+  template <typename U>
+  bool lookup(const U& key) const {
+    for (const auto& event : events) {
+      if (key == event.value) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /// remove events that are no longer recent compared to the given point in
+  /// time. relies on the temporal-order invariant: stops at the first event
+  /// that is still recent
+  void expire_old(const time_point& now) {
+    const auto expired_before = now - max_duration;
+    while (!events.empty() && events.front().time < expired_before) {
+      events.pop_front();
+    }
+  }
+
+ private:
+  /// a stored value paired with its insertion time
+  struct Event {
+    T value;
+    time_point time;
+  };
+  boost::circular_buffer<Event> events;
+  const ceph::timespan max_duration;
+};
+
+namespace rgw {
+
+// read bucket trim configuration from ceph context
+void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config)
+{
+  const auto& conf = cct->_conf;
+
+  config.trim_interval_sec =
+      conf.get_val<int64_t>("rgw_sync_log_trim_interval");
+  config.counter_size = 512; // fixed; not exposed as a config option
+  config.buckets_per_interval =
+      conf.get_val<int64_t>("rgw_sync_log_trim_max_buckets");
+  config.min_cold_buckets_per_interval =
+      conf.get_val<int64_t>("rgw_sync_log_trim_min_cold_buckets");
+  config.concurrent_buckets =
+      conf.get_val<int64_t>("rgw_sync_log_trim_concurrent_buckets");
+  config.notify_timeout_ms = 10000; // fixed 10s timeout for notify replies
+  config.recent_size = 128; // fixed bound on the recently-trimmed list
+  config.recent_duration = std::chrono::hours(2);
+}
+
+class BucketTrimManager::Impl : public TrimCounters::Server,
+                                public BucketTrimObserver {
+ public:
+   rgw::sal::RadosStore* const store;
+  const BucketTrimConfig config;
+
+  const rgw_raw_obj status_obj;
+
+  /// count frequency of bucket instance entries in the data changes log
+  BucketChangeCounter counter;
+
+  using RecentlyTrimmedBucketList = RecentEventList<std::string>;
+  using clock_type = RecentlyTrimmedBucketList::clock_type;
+  /// track recently trimmed buckets to focus trim activity elsewhere
+  RecentlyTrimmedBucketList trimmed;
+
+  /// serve the bucket trim watch/notify api
+  BucketTrimWatcher watcher;
+
+  /// protect data shared between data sync, trim, and watch/notify threads
+  std::mutex mutex;
+
+  Impl(rgw::sal::RadosStore* store, const BucketTrimConfig& config)
+    : store(store), config(config),
+      status_obj(store->svc()->zone->get_zone_params().log_pool, BucketTrimStatus::oid),
+      counter(config.counter_size),
+      trimmed(config.recent_size, config.recent_duration),
+      watcher(store, status_obj, this)
+  {}
+
+  /// TrimCounters::Server interface for watch/notify api
+  void get_bucket_counters(int count, TrimCounters::Vector& buckets) {
+    buckets.reserve(count);
+    std::lock_guard<std::mutex> lock(mutex);
+    counter.get_highest(count, [&buckets] (const std::string& key, int count) {
+                          buckets.emplace_back(key, count);
+                        });
+    ldout(store->ctx(), 20) << "get_bucket_counters: " << buckets << dendl;
+  }
+
+  void reset_bucket_counters() override {
+    ldout(store->ctx(), 20) << "bucket trim completed" << dendl;
+    std::lock_guard<std::mutex> lock(mutex);
+    counter.clear();
+    trimmed.expire_old(clock_type::now());
+  }
+
+  /// BucketTrimObserver interface to remember successfully-trimmed buckets
+  void on_bucket_trimmed(std::string&& bucket_instance) override {
+    ldout(store->ctx(), 20) << "trimmed bucket instance " << bucket_instance << dendl;
+    std::lock_guard<std::mutex> lock(mutex);
+    trimmed.insert(std::move(bucket_instance), clock_type::now());
+  }
+
+  bool trimmed_recently(const std::string_view& bucket_instance) override {
+    std::lock_guard<std::mutex> lock(mutex);
+    return trimmed.lookup(bucket_instance);
+  }
+};
+
+// BucketTrimManager holds all state in Impl (pimpl idiom)
+BucketTrimManager::BucketTrimManager(rgw::sal::RadosStore* store,
+                                     const BucketTrimConfig& config)
+  : impl(new Impl(store, config))
+{
+}
+// defaulted here, where unique_ptr's deleter sees the complete Impl type
+BucketTrimManager::~BucketTrimManager() = default;
+
+// start the watch/notify handler on the trim status object
+int BucketTrimManager::init()
+{
+  return impl->watcher.start(this);
+}
+
+// count a change to the given bucket instance, unless it was trimmed recently
+void BucketTrimManager::on_bucket_changed(const std::string_view& bucket)
+{
+  std::lock_guard<std::mutex> lock(impl->mutex);
+  // filter recently trimmed bucket instances out of bucket change counter
+  if (impl->trimmed.lookup(bucket)) {
+    return;
+  }
+  impl->counter.insert(std::string(bucket));
+}
+
+// create the polling coroutine that runs a trim pass every trim interval
+RGWCoroutine* BucketTrimManager::create_bucket_trim_cr(RGWHTTPManager *http)
+{
+  return new BucketTrimPollCR(impl->store, http, impl->config,
+                              impl.get(), impl->status_obj, this);
+}
+
+// create a single trim pass for radosgw-admin (no polling loop, no lock)
+RGWCoroutine* BucketTrimManager::create_admin_bucket_trim_cr(RGWHTTPManager *http)
+{
+  // return the trim coroutine without any polling
+  return new BucketTrimCR(impl->store, http, impl->config,
+                          impl.get(), impl->status_obj, this);
+}
+
+// DoutPrefixProvider interface
+CephContext* BucketTrimManager::get_cct() const
+{
+  return impl->store->ctx();
+}
+
+// DoutPrefixProvider interface: logging subsystem for prefixed output
+unsigned BucketTrimManager::get_subsys() const
+{
+  return dout_subsys;
+}
+
+// DoutPrefixProvider interface: prefix for log lines from this manager
+std::ostream& BucketTrimManager::gen_prefix(std::ostream& out) const
+{
+  return out << "rgw bucket trim manager: ";
+}
+
+} // namespace rgw
+
+int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store,
+              RGWBucketInfo& bucket_info, uint64_t gen, int shard_id,
+              std::string_view start_marker, std::string_view end_marker)
+{
+  auto& logs = bucket_info.layout.logs;
+  auto log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(gen));
+  if (log == logs.end()) {
+    ldpp_dout(p, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                   << "ERROR: no log layout with gen=" << gen << dendl;
+    return -ENOENT;
+  }
+
+  auto log_layout = *log;
+
+  auto r = store->svc()->bilog_rados->log_trim(p, bucket_info, log_layout, shard_id, start_marker, end_marker);
+  if (r < 0) {
+    ldpp_dout(p, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                   << "ERROR: bilog_rados->log_trim returned r=" << r << dendl;
+  }
+  return r;
+}
diff --git a/src/rgw/driver/rados/rgw_trim_bilog.h b/src/rgw/driver/rados/rgw_trim_bilog.h
new file mode 100644 (file)
index 0000000..5b9c4cd
--- /dev/null
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * Author: Casey Bodley <cbodley@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef RGW_SYNC_LOG_TRIM_H
+#define RGW_SYNC_LOG_TRIM_H
+
+#include <memory>
+#include <string_view>
+
+#include "include/common_fwd.h"
+#include "include/encoding.h"
+#include "common/ceph_time.h"
+#include "common/dout.h"
+#include "rgw_common.h"
+
+class RGWCoroutine;
+class RGWHTTPManager;
+
+namespace rgw {
+
+namespace sal {
+  class RadosStore;
+}
+
+/// Interface to inform the trim process about which buckets are most active
+struct BucketChangeObserver {
+  virtual ~BucketChangeObserver() = default;
+
+  /// called with the bucket instance for each observed change
+  virtual void on_bucket_changed(const std::string_view& bucket_instance) = 0;
+};
+
+/// Configuration for BucketTrimManager, filled in by configure_bucket_trim()
+struct BucketTrimConfig {
+  /// time interval in seconds between bucket trim attempts
+  uint32_t trim_interval_sec{0};
+  /// maximum number of buckets to track with BucketChangeObserver
+  size_t counter_size{0};
+  /// maximum number of buckets to process each trim interval
+  uint32_t buckets_per_interval{0};
+  /// minimum number of buckets to choose from the global bucket instance list
+  uint32_t min_cold_buckets_per_interval{0};
+  /// maximum number of buckets to process in parallel
+  uint32_t concurrent_buckets{0};
+  /// timeout in ms for bucket trim notify replies
+  uint64_t notify_timeout_ms{0};
+  /// maximum number of recently trimmed buckets to remember (should be small
+  /// enough for a linear search)
+  size_t recent_size{0};
+  /// maximum duration to consider a trim as 'recent' (should be some multiple
+  /// of the trim interval, at least)
+  ceph::timespan recent_duration{0};
+};
+
+/// fill out the BucketTrimConfig from the ceph context
+void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config);
+
+/// Determines the buckets on which to focus trim activity, using two sources of
+/// input: the frequency of entries read from the data changes log, and a global
+/// listing of the bucket.instance metadata. This allows us to trim active
+/// buckets quickly, while also ensuring that all buckets will eventually trim
+class BucketTrimManager : public BucketChangeObserver, public DoutPrefixProvider {
+  class Impl;
+  std::unique_ptr<Impl> impl; ///< pimpl: implementation details are private
+ public:
+  BucketTrimManager(sal::RadosStore *store, const BucketTrimConfig& config);
+  ~BucketTrimManager();
+
+  /// initialize the trim manager before running any of the coroutines below
+  int init();
+
+  /// increment a counter for the given bucket instance
+  void on_bucket_changed(const std::string_view& bucket_instance) override;
+
+  /// create a coroutine to run the bucket trim process every trim interval
+  RGWCoroutine* create_bucket_trim_cr(RGWHTTPManager *http);
+
+  /// create a coroutine to trim buckets directly via radosgw-admin
+  RGWCoroutine* create_admin_bucket_trim_cr(RGWHTTPManager *http);
+
+  /// DoutPrefixProvider interface for prefixed log output
+  CephContext *get_cct() const override;
+  unsigned get_subsys() const;
+  std::ostream& gen_prefix(std::ostream& out) const;
+};
+
+/// provides persistent storage for the trim manager's current position in the
+/// list of bucket instance metadata
+struct BucketTrimStatus {
+  std::string marker; ///< metadata key of current bucket instance
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(marker, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& p) {
+    DECODE_START(1, p);
+    decode(marker, p);
+    DECODE_FINISH(p);
+  }
+
+  /// name of the rados object that stores this status
+  static const std::string oid;
+};
+
+} // namespace rgw
+
+WRITE_CLASS_ENCODER(rgw::BucketTrimStatus);
+
+int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store,
+              RGWBucketInfo& bucket_info, uint64_t gen, int shard_id,
+              std::string_view start_marker, std::string_view end_marker);
+
+#endif // RGW_SYNC_LOG_TRIM_H
diff --git a/src/rgw/driver/rados/rgw_trim_datalog.cc b/src/rgw/driver/rados/rgw_trim_datalog.cc
new file mode 100644 (file)
index 0000000..72a1600
--- /dev/null
@@ -0,0 +1,252 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <vector>
+#include <string>
+
+#include "common/errno.h"
+
+#include "rgw_trim_datalog.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_datalog.h"
+#include "rgw_data_sync.h"
+#include "rgw_zone.h"
+#include "rgw_bucket.h"
+
+#include "services/svc_zone.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "data trim: ")
+
+namespace {
+
+/// async coroutine that trims one data log shard up to 'marker', recording
+/// the marker as fully trimmed once the shard reports nothing left to trim
+class DatalogTrimImplCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+  int shard;
+  std::string marker; ///< trim up to this marker
+  std::string* last_trim_marker; ///< updated when the shard is fully trimmed
+
+ public:
+  DatalogTrimImplCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, int shard,
+                   const std::string& marker, std::string* last_trim_marker)
+  : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), shard(shard),
+    marker(marker), last_trim_marker(last_trim_marker) {
+    set_description() << "Datalog trim shard=" << shard
+                     << " marker=" << marker;
+  }
+
+  /// issue the async trim request
+  int send_request(const DoutPrefixProvider *dpp) override {
+    set_status() << "sending request";
+    cn = stack->create_completion_notifier();
+    return store->svc()->datalog_rados->trim_entries(dpp, shard, marker,
+                                                    cn->completion());
+  }
+  int request_complete() override {
+    int r = cn->completion()->get_return_value();
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << "(): trim of shard=" << shard
+                 << " marker=" << marker << " returned r=" << r << dendl;
+
+    set_status() << "request complete; ret=" << r;
+    // any result other than -ENODATA (including success) is returned as-is
+    if (r != -ENODATA) {
+      return r;
+    }
+    // nothing left to trim, update last_trim_marker; -ENODATA itself is
+    // swallowed and reported as success
+    if (*last_trim_marker < marker &&
+       marker != store->svc()->datalog_rados->max_marker()) {
+      *last_trim_marker = marker;
+    }
+    return 0;
+  }
+};
+
+/// return the marker that it's safe to trim up to
+const std::string& get_stable_marker(const rgw_data_sync_marker& m)
+{
+  // during a full sync, 'marker' tracks the full-sync listing; the
+  // incremental log position is kept in next_step_marker instead
+  return m.state == m.FullSync ? m.next_step_marker : m.marker;
+}
+
+/// populate the container starting with 'dest' with the minimum stable marker
+/// of each shard for all of the peers in [first, last)
+/// NOTE(review): assumes the range at 'dest' has at least as many elements as
+/// each peer's sync_markers, and that peers iterate shards in the same order;
+/// there is no bounds check on 'm'
+template <typename IterIn, typename IterOut>
+void take_min_markers(IterIn first, IterIn last, IterOut dest)
+{
+  if (first == last) {
+    return;
+  }
+  for (auto p = first; p != last; ++p) {
+    // restart at 'dest' for each peer and merge shard-by-shard
+    auto m = dest;
+    for (auto &shard : p->sync_markers) {
+      const auto& stable = get_stable_marker(shard.second);
+      if (*m > stable) {
+        *m = stable;
+      }
+      ++m;
+    }
+  }
+}
+
+} // anonymous namespace
+
+/// coroutine that queries the data sync status of every peer zone, computes
+/// the minimum stable marker per shard, and trims each shard up to it
+class DataLogTrimCR : public RGWCoroutine {
+  using TrimCR = DatalogTrimImplCR;
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  RGWHTTPManager *http;
+  const int num_shards;
+  const std::string& zone_id; ///< my zone id
+  std::vector<rgw_data_sync_status> peer_status; ///< sync status for each peer
+  std::vector<std::string> min_shard_markers; ///< min marker per shard
+  std::vector<std::string>& last_trim; ///< last trimmed marker per shard
+  int ret{0};
+
+ public:
+  DataLogTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                   int num_shards, std::vector<std::string>& last_trim)
+    : RGWCoroutine(store->ctx()), dpp(dpp), store(store), http(http),
+      num_shards(num_shards),
+      zone_id(store->svc()->zone->get_zone().id),
+      peer_status(store->svc()->zone->get_zone_data_notify_to_map().size()),
+      // seed each shard with max_marker so take_min_markers() can only lower it
+      min_shard_markers(num_shards,
+                       std::string(store->svc()->datalog_rados->max_marker())),
+      last_trim(last_trim)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int DataLogTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    ldpp_dout(dpp, 10) << "fetching sync status for zone " << zone_id << dendl;
+    set_status("fetching sync status");
+    yield {
+      // query data sync status from each sync peer
+      rgw_http_param_pair params[] = {
+        { "type", "data" },
+        { "status", nullptr },
+        { "source-zone", zone_id.c_str() },
+        { nullptr, nullptr }
+      };
+
+      auto p = peer_status.begin();
+      for (auto& c : store->svc()->zone->get_zone_data_notify_to_map()) {
+        ldpp_dout(dpp, 20) << "query sync status from " << c.first << dendl;
+        using StatusCR = RGWReadRESTResourceCR<rgw_data_sync_status>;
+        spawn(new StatusCR(cct, c.second, http, "/admin/log/", params, &*p),
+              false);
+        ++p;
+      }
+    }
+
+    // must get a successful reply from all peers to consider trimming
+    ret = 0;
+    while (ret == 0 && num_spawned() > 0) {
+      yield wait_for_child();
+      collect_next(&ret);
+    }
+    drain_all();
+
+    if (ret < 0) {
+      ldpp_dout(dpp, 4) << "failed to fetch sync status from all peers" << dendl;
+      return set_cr_error(ret);
+    }
+
+    ldpp_dout(dpp, 10) << "trimming log shards" << dendl;
+    set_status("trimming log shards");
+    yield {
+      // determine the minimum marker for each shard
+      take_min_markers(peer_status.begin(), peer_status.end(),
+                       min_shard_markers.begin());
+
+      for (int i = 0; i < num_shards; i++) {
+        const auto& m = min_shard_markers[i];
+        // skip shards already trimmed up to the stable marker
+        if (m <= last_trim[i]) {
+          continue;
+        }
+        ldpp_dout(dpp, 10) << "trimming log shard " << i
+            << " at marker=" << m
+            << " last_trim=" << last_trim[i] << dendl;
+        spawn(new TrimCR(dpp, store, i, m, &last_trim[i]),
+              true);
+      }
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// factory for a single data log trim pass, used by radosgw-admin
+RGWCoroutine* create_admin_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+                                            RGWHTTPManager *http,
+                                            int num_shards,
+                                            std::vector<std::string>& markers)
+{
+  return new DataLogTrimCR(dpp, store, http, num_shards, markers);
+}
+
+/// coroutine that periodically acquires the 'data_trim' lock and runs a
+/// DataLogTrimCR pass while holding it
+class DataLogTrimPollCR : public RGWCoroutine {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  RGWHTTPManager *http;
+  const int num_shards;
+  const utime_t interval; ///< polling interval
+  const std::string lock_oid; ///< use first data log shard for lock
+  const std::string lock_cookie; ///< random cookie identifying this locker
+  std::vector<std::string> last_trim; ///< last trimmed marker per shard
+
+ public:
+  DataLogTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                    int num_shards, utime_t interval)
+    : RGWCoroutine(store->ctx()), dpp(dpp), store(store), http(http),
+      num_shards(num_shards), interval(interval),
+      lock_oid(store->svc()->datalog_rados->get_oid(0, 0)),
+      lock_cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
+      last_trim(num_shards)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int DataLogTrimPollCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    for (;;) {
+      set_status("sleeping");
+      wait(interval);
+
+      // request a 'data_trim' lock that covers the entire wait interval to
+      // prevent other gateways from attempting to trim for the duration
+      set_status("acquiring trim lock");
+      yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
+                                          rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, lock_oid),
+                                          "data_trim", lock_cookie,
+                                          interval.sec()));
+      if (retcode < 0) {
+        // if the lock is already held, go back to sleep and try again later
+        ldpp_dout(dpp, 4) << "failed to lock " << lock_oid << ", trying again in "
+            << interval.sec() << "s" << dendl;
+        continue;
+      }
+
+      set_status("trimming");
+      // the trim result is not checked; the next interval simply retries
+      yield call(new DataLogTrimCR(dpp, store, http, num_shards, last_trim));
+
+      // note that the lock is not released. this is intentional, as it avoids
+      // duplicating this work in other gateways
+    }
+  }
+  return 0;
+}
+
+// factory for the polling data log trim coroutine
+RGWCoroutine* create_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+                                      RGWHTTPManager *http,
+                                      int num_shards, utime_t interval)
+{
+  return new DataLogTrimPollCR(dpp, store, http, num_shards, interval);
+}
diff --git a/src/rgw/driver/rados/rgw_trim_datalog.h b/src/rgw/driver/rados/rgw_trim_datalog.h
new file mode 100644 (file)
index 0000000..9f5bf72
--- /dev/null
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "common/dout.h"
+
+class RGWCoroutine;
+class RGWRados;
+class RGWHTTPManager;
+class utime_t;
+namespace rgw { namespace sal {
+  class RadosStore;
+} }
+
+// DataLogTrimCR factory function
+extern RGWCoroutine* create_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+                                             RGWHTTPManager *http,
+                                             int num_shards, utime_t interval);
+
+// factory function for datalog trim via radosgw-admin
+RGWCoroutine* create_admin_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+                                            RGWHTTPManager *http,
+                                            int num_shards,
+                                            std::vector<std::string>& markers);
diff --git a/src/rgw/driver/rados/rgw_trim_mdlog.cc b/src/rgw/driver/rados/rgw_trim_mdlog.cc
new file mode 100644 (file)
index 0000000..d8e1959
--- /dev/null
@@ -0,0 +1,795 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_trim_mdlog.h"
+#include "rgw_sync.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_zone.h"
+#include "services/svc_zone.h"
+#include "services/svc_meta.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_cls.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "meta trim: ")
+
+/// purge all log shards for the given mdlog
+class PurgeLogShardsCR : public RGWShardCollectCR {
+  rgw::sal::RadosStore* const store;
+  const RGWMetadataLog* mdlog;
+  const int num_shards;
+  rgw_raw_obj obj; ///< reused to hold each shard's oid
+  int i{0}; ///< next shard to remove
+
+  static constexpr int max_concurrent = 16;
+
+  /// treat ENOENT (shard object may already be gone) as success
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to remove mdlog shard: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  PurgeLogShardsCR(rgw::sal::RadosStore* store, const RGWMetadataLog* mdlog,
+                   const rgw_pool& pool, int num_shards)
+    : RGWShardCollectCR(store->ctx(), max_concurrent),
+      store(store), mdlog(mdlog), num_shards(num_shards), obj(pool, "")
+  {}
+
+  /// spawn removal of the next shard object; false once all are spawned
+  bool spawn_next() override {
+    if (i == num_shards) {
+      return false;
+    }
+    mdlog->get_shard_oid(i++, obj.oid);
+    spawn(new RGWRadosRemoveCR(store, obj), false);
+    return true;
+  }
+};
+
+using Cursor = RGWPeriodHistory::Cursor;
+
+/// purge mdlogs from the oldest up to (but not including) the given realm_epoch
+class PurgePeriodLogsCR : public RGWCoroutine {
+  struct Svc {
+    RGWSI_Zone *zone;
+    RGWSI_MDLog *mdlog;
+  } svc;
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* const store;
+  RGWMetadataManager *const metadata;
+  RGWObjVersionTracker objv; ///< version tracker for the mdlog history reads/trims
+  Cursor cursor; ///< period history cursor, starting at the oldest log period
+  epoch_t realm_epoch; ///< purge up to (but not including) this epoch
+  epoch_t *last_trim_epoch; ///< update last trim on success
+
+ public:
+  PurgePeriodLogsCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, epoch_t realm_epoch, epoch_t *last_trim)
+    : RGWCoroutine(store->ctx()), dpp(dpp), store(store), metadata(store->ctl()->meta.mgr),
+      realm_epoch(realm_epoch), last_trim_epoch(last_trim) {
+    svc.zone = store->svc()->zone;
+    svc.mdlog = store->svc()->mdlog;
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int PurgePeriodLogsCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    // read our current oldest log period
+    yield call(svc.mdlog->read_oldest_log_period_cr(dpp, &cursor, &objv));
+    if (retcode < 0) {
+      return set_cr_error(retcode);
+    }
+    ceph_assert(cursor);
+    ldpp_dout(dpp, 20) << "oldest log realm_epoch=" << cursor.get_epoch()
+        << " period=" << cursor.get_period().get_id() << dendl;
+
+    // trim -up to- the given realm_epoch
+    while (cursor.get_epoch() < realm_epoch) {
+      ldpp_dout(dpp, 4) << "purging log shards for realm_epoch=" << cursor.get_epoch()
+          << " period=" << cursor.get_period().get_id() << dendl;
+      yield {
+        // remove every shard object of this period's mdlog
+        const auto mdlog = svc.mdlog->get_log(cursor.get_period().get_id());
+        const auto& pool = svc.zone->get_zone_params().log_pool;
+        auto num_shards = cct->_conf->rgw_md_log_max_shards;
+        call(new PurgeLogShardsCR(store, mdlog, pool, num_shards));
+      }
+      if (retcode < 0) {
+        ldpp_dout(dpp, 1) << "failed to remove log shards: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+      ldpp_dout(dpp, 10) << "removed log shards for realm_epoch=" << cursor.get_epoch()
+          << " period=" << cursor.get_period().get_id() << dendl;
+
+      // update our mdlog history
+      yield call(svc.mdlog->trim_log_period_cr(dpp, cursor, &objv));
+      if (retcode == -ENOENT) {
+        // must have raced to update mdlog history. return success and allow the
+        // winner to continue purging
+        ldpp_dout(dpp, 10) << "already removed log shards for realm_epoch=" << cursor.get_epoch()
+            << " period=" << cursor.get_period().get_id() << dendl;
+        return set_cr_done();
+      } else if (retcode < 0) {
+        ldpp_dout(dpp, 1) << "failed to remove log shards for realm_epoch="
+            << cursor.get_epoch() << " period=" << cursor.get_period().get_id()
+            << " with: " << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      // remember the newest epoch we've purged
+      if (*last_trim_epoch < cursor.get_epoch()) {
+        *last_trim_epoch = cursor.get_epoch();
+      }
+
+      ceph_assert(cursor.has_next()); // get_current() should always come after
+      cursor.next();
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+namespace {
+
+using connection_map = std::map<std::string, std::unique_ptr<RGWRESTConn>>;
+
+/// construct a RGWRESTConn for each zone in the realm
+/// note: iterates every zone of every zonegroup, including the local zone
+template <typename Zonegroups>
+connection_map make_peer_connections(rgw::sal::RadosStore* store,
+                                     const Zonegroups& zonegroups)
+{
+  connection_map connections;
+  for (auto& g : zonegroups) {
+    for (auto& z : g.second.zones) {
+      std::unique_ptr<RGWRESTConn> conn{
+        new RGWRESTConn(store->ctx(), store, z.first.id, z.second.endpoints, g.second.api_name)};
+      connections.emplace(z.first.id, std::move(conn));
+    }
+  }
+  return connections;
+}
+
+/// return the marker that it's safe to trim up to
+const std::string& get_stable_marker(const rgw_meta_sync_marker& m)
+{
+  // during a full sync, 'marker' tracks the full-sync listing; the
+  // incremental log position is kept in next_step_marker instead
+  return m.state == m.FullSync ? m.next_step_marker : m.marker;
+}
+
+/// comparison operator for take_min_status(): orders sync markers by their
+/// stable (safe-to-trim) marker
+bool operator<(const rgw_meta_sync_marker& lhs, const rgw_meta_sync_marker& rhs)
+{
+  // sort by stable marker
+  return get_stable_marker(lhs) < get_stable_marker(rhs);
+}
+
+/// populate the status with the minimum stable marker of each shard for any
+/// peer whose realm_epoch matches the minimum realm_epoch in the input
+template <typename Iter>
+int take_min_status(CephContext *cct, Iter first, Iter last,
+                    rgw_meta_sync_status *status)
+{
+  if (first == last) {
+    return -EINVAL;
+  }
+  const size_t num_shards = cct->_conf->rgw_md_log_max_shards;
+
+  // start above any real epoch so the first peer always wins
+  status->sync_info.realm_epoch = std::numeric_limits<epoch_t>::max();
+  for (auto p = first; p != last; ++p) {
+    // validate peer's shard count
+    if (p->sync_markers.size() != num_shards) {
+      ldout(cct, 1) << "take_min_status got peer status with "
+          << p->sync_markers.size() << " shards, expected "
+          << num_shards << dendl;
+      return -EINVAL;
+    }
+    if (p->sync_info.realm_epoch < status->sync_info.realm_epoch) {
+      // earlier epoch, take its entire status (moves from *p)
+      *status = std::move(*p);
+    } else if (p->sync_info.realm_epoch == status->sync_info.realm_epoch) {
+      // same epoch, take any earlier markers. both maps hold num_shards
+      // entries (checked above), so the iterators advance in lockstep
+      auto m = status->sync_markers.begin();
+      for (auto& shard : p->sync_markers) {
+        if (shard.second < m->second) {
+          m->second = std::move(shard.second);
+        }
+        ++m;
+      }
+    }
+  }
+  return 0;
+}
+
+/// common state shared by the metadata trim coroutines
+struct TrimEnv {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  int num_shards;
+  const rgw_zone_id& zone;
+  Cursor current; ///< cursor to current period
+  epoch_t last_trim_epoch{0}; ///< epoch of last mdlog that was purged
+
+  TrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+    : dpp(dpp), store(store), http(http), num_shards(num_shards),
+      zone(store->svc()->zone->zone_id()),
+      current(store->svc()->mdlog->get_period_history()->get_current())
+  {}
+};
+
+/// trim state for the master, which trims against its peers' sync status
+struct MasterTrimEnv : public TrimEnv {
+  connection_map connections; ///< peer connections
+  std::vector<rgw_meta_sync_status> peer_status; ///< sync status for each peer
+  /// last trim marker for each shard, only applies to current period's mdlog
+  std::vector<std::string> last_trim_markers;
+
+  MasterTrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+    : TrimEnv(dpp, store, http, num_shards),
+      last_trim_markers(num_shards)
+  {
+    auto& period = current.get_period();
+    connections = make_peer_connections(store, period.get_map().zonegroups);
+    connections.erase(zone.id); // exclude our own zone from the peer set
+    peer_status.resize(connections.size());
+  }
+};
+
+/// trim state for a non-master peer zone
+struct PeerTrimEnv : public TrimEnv {
+  /// last trim timestamp for each shard, only applies to current period's mdlog
+  std::vector<ceph::real_time> last_trim_timestamps;
+
+  PeerTrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+    : TrimEnv(dpp, store, http, num_shards),
+      last_trim_timestamps(num_shards)
+  {}
+
+  /// change the shard count; resize() keeps the existing timestamps
+  void set_num_shards(int num_shards) {
+    this->num_shards = num_shards;
+    last_trim_timestamps.resize(num_shards);
+  }
+};
+
+} // anonymous namespace
+
+
+/// spawn a trim cr for each shard that needs it, while limiting the number
+/// of concurrent shards
+class MetaMasterTrimShardCollectCR : public RGWShardCollectCR {
+ private:
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+  MasterTrimEnv& env;
+  RGWMetadataLog *mdlog;
+  int shard_id{0}; ///< next shard to consider
+  std::string oid; ///< reused to hold each shard's oid
+  const rgw_meta_sync_status& sync_status;
+
+  /// treat ENOENT (shard may have nothing to trim) as success
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to trim mdlog shard: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  MetaMasterTrimShardCollectCR(MasterTrimEnv& env, RGWMetadataLog *mdlog,
+                               const rgw_meta_sync_status& sync_status)
+    : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+      env(env), mdlog(mdlog), sync_status(sync_status)
+  {}
+
+  bool spawn_next() override;
+};
+
+/// find the next shard whose stable marker is ahead of its last-trimmed
+/// marker and spawn a trim cr for it; skips shards with nothing to trim
+bool MetaMasterTrimShardCollectCR::spawn_next()
+{
+  while (shard_id < env.num_shards) {
+    auto m = sync_status.sync_markers.find(shard_id);
+    if (m == sync_status.sync_markers.end()) {
+      // no peer status for this shard; nothing to compare against
+      shard_id++;
+      continue;
+    }
+    auto& stable = get_stable_marker(m->second);
+    auto& last_trim = env.last_trim_markers[shard_id];
+
+    if (stable <= last_trim) {
+      // already trimmed
+      ldpp_dout(env.dpp, 20) << "skipping log shard " << shard_id
+          << " at marker=" << stable
+          << " last_trim=" << last_trim
+          << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
+      shard_id++;
+      continue;
+    }
+
+    mdlog->get_shard_oid(shard_id, oid);
+
+    ldpp_dout(env.dpp, 10) << "trimming log shard " << shard_id
+        << " at marker=" << stable
+        << " last_trim=" << last_trim
+        << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
+    spawn(new RGWSyncLogTrimCR(env.dpp, env.store, oid, stable, &last_trim), false);
+    shard_id++;
+    return true;
+  }
+  return false;
+}
+
+/// spawn rest requests to read each peer's sync status
class MetaMasterStatusCollectCR : public RGWShardCollectCR {
  /// cap on concurrent status requests
  static constexpr int MAX_CONCURRENT_SHARDS = 16;

  MasterTrimEnv& env;
  connection_map::iterator c;                    // next peer to query
  std::vector<rgw_meta_sync_status>::iterator s; // slot receiving its reply

  /// Treat ENOENT as non-fatal; log any other status-fetch failure.
  int handle_result(int r) override {
    if (r == -ENOENT) { // ENOENT is not a fatal error
      return 0;
    }
    if (r < 0) {
      ldout(cct, 4) << "failed to fetch metadata sync status: "
          << cpp_strerror(r) << dendl;
    }
    return r;
  }
 public:
  explicit MetaMasterStatusCollectCR(MasterTrimEnv& env)
    : RGWCoroutine-ignored /* see below */
+
/// Coroutine driving one master-side trim pass; see operate() for the flow.
class MetaMasterTrimCR : public RGWCoroutine {
  MasterTrimEnv& env;
  rgw_meta_sync_status min_status; //< minimum sync status of all peers
  int ret{0}; //< result of take_min_status() in operate()

 public:
  explicit MetaMasterTrimCR(MasterTrimEnv& env)
    : RGWCoroutine(env.store->ctx()), env(env)
  {}

  int operate(const DoutPrefixProvider *dpp) override;
};
+
/// One trim pass on the metadata master: collect each peer's sync status,
/// compute the minimum, purge mdlog periods older than the minimum realm
/// epoch, and — if all peers are on the current period — trim the current
/// mdlog's shards up to the minimum stable markers.
int MetaMasterTrimCR::operate(const DoutPrefixProvider *dpp)
{
  reenter(this) {
    // TODO: detect this and fail before we spawn the trim thread?
    if (env.connections.empty()) {
      ldpp_dout(dpp, 4) << "no peers, exiting" << dendl;
      return set_cr_done();
    }

    ldpp_dout(dpp, 10) << "fetching sync status for zone " << env.zone << dendl;
    // query mdlog sync status from peers
    yield call(new MetaMasterStatusCollectCR(env));

    // must get a successful reply from all peers to consider trimming
    // NOTE(review): this tests the member 'ret', but the collect CR's result
    // is delivered via 'retcode'; 'ret' is first assigned by take_min_status()
    // below. Confirm whether 'retcode' was intended here.
    if (ret < 0) {
      ldpp_dout(dpp, 4) << "failed to fetch sync status from all peers" << dendl;
      return set_cr_error(ret);
    }

    // determine the minimum epoch and markers
    ret = take_min_status(env.store->ctx(), env.peer_status.begin(),
                          env.peer_status.end(), &min_status);
    if (ret < 0) {
      ldpp_dout(dpp, 4) << "failed to calculate min sync status from peers" << dendl;
      return set_cr_error(ret);
    }
    yield {
      auto store = env.store;
      auto epoch = min_status.sync_info.realm_epoch;
      ldpp_dout(dpp, 4) << "realm epoch min=" << epoch
          << " current=" << env.current.get_epoch()<< dendl;
      if (epoch > env.last_trim_epoch + 1) {
        // delete any prior mdlog periods
        spawn(new PurgePeriodLogsCR(dpp, store, epoch, &env.last_trim_epoch), true);
      } else {
        ldpp_dout(dpp, 10) << "mdlogs already purged up to realm_epoch "
            << env.last_trim_epoch << dendl;
      }

      // if realm_epoch == current, trim mdlog based on markers
      if (epoch == env.current.get_epoch()) {
        auto mdlog = store->svc()->mdlog->get_log(env.current.get_period().get_id());
        spawn(new MetaMasterTrimShardCollectCR(env, mdlog, min_status), true);
      }
    }
    // ignore any errors during purge/trim because we want to hold the lock open
    return set_cr_done();
  }
  return 0;
}
+
+
+/// read the first entry of the master's mdlog shard and trim to that position
class MetaPeerTrimShardCR : public RGWCoroutine {
  RGWMetaSyncEnv& env;
  RGWMetadataLog *mdlog;
  const std::string& period_id; // reference: caller must outlive this CR
  const int shard_id;
  RGWMetadataLogInfo info; // shard info fetched from the master
  ceph::real_time stable; //< safe timestamp to trim, according to master
  ceph::real_time *last_trim; //< last trimmed timestamp, updated on trim
  rgw_mdlog_shard_data result; //< result from master's mdlog listing

 public:
  MetaPeerTrimShardCR(RGWMetaSyncEnv& env, RGWMetadataLog *mdlog,
                      const std::string& period_id, int shard_id,
                      ceph::real_time *last_trim)
    : RGWCoroutine(env.store->ctx()), env(env), mdlog(mdlog),
      period_id(period_id), shard_id(shard_id), last_trim(last_trim)
  {}

  int operate(const DoutPrefixProvider *dpp) override;
};
+
/// Determine the master's safe trim point for one mdlog shard and trim our
/// local copy up to it. The safe point is one second before the master's
/// oldest entry; an empty master shard falls back to the shard info's
/// last_update, re-verified with a second listing to guard against races.
int MetaPeerTrimShardCR::operate(const DoutPrefixProvider *dpp)
{
  reenter(this) {
    // query master's first mdlog entry for this shard
    yield call(create_list_remote_mdlog_shard_cr(&env, period_id, shard_id,
                                                 "", 1, &result));
    if (retcode < 0) {
      ldpp_dout(dpp, 5) << "failed to read first entry from master's mdlog shard "
          << shard_id << " for period " << period_id
          << ": " << cpp_strerror(retcode) << dendl;
      return set_cr_error(retcode);
    }
    if (result.entries.empty()) {
      // if there are no mdlog entries, we don't have a timestamp to compare. we
      // can't just trim everything, because there could be racing updates since
      // this empty reply. query the mdlog shard info to read its max timestamp,
      // then retry the listing to make sure it's still empty before trimming to
      // that
      ldpp_dout(dpp, 10) << "empty master mdlog shard " << shard_id
          << ", reading last timestamp from shard info" << dendl;
      // read the mdlog shard info for the last timestamp
      yield call(create_read_remote_mdlog_shard_info_cr(&env, period_id, shard_id, &info));
      if (retcode < 0) {
        ldpp_dout(dpp, 5) << "failed to read info from master's mdlog shard "
            << shard_id << " for period " << period_id
            << ": " << cpp_strerror(retcode) << dendl;
        return set_cr_error(retcode);
      }
      if (ceph::real_clock::is_zero(info.last_update)) {
        return set_cr_done(); // nothing to trim
      }
      ldpp_dout(dpp, 10) << "got mdlog shard info with last update="
          << info.last_update << dendl;
      // re-read the master's first mdlog entry to make sure it hasn't changed
      yield call(create_list_remote_mdlog_shard_cr(&env, period_id, shard_id,
                                                   "", 1, &result));
      if (retcode < 0) {
        ldpp_dout(dpp, 5) << "failed to read first entry from master's mdlog shard "
            << shard_id << " for period " << period_id
            << ": " << cpp_strerror(retcode) << dendl;
        return set_cr_error(retcode);
      }
      // if the mdlog is still empty, trim to max marker
      if (result.entries.empty()) {
        stable = info.last_update;
      } else {
        stable = result.entries.front().timestamp;

        // can only trim -up to- master's first timestamp, so subtract a second.
        // (this is why we use timestamps instead of markers for the peers)
        stable -= std::chrono::seconds(1);
      }
    } else {
      stable = result.entries.front().timestamp;
      stable -= std::chrono::seconds(1);
    }

    if (stable <= *last_trim) {
      // nothing new to trim since the last pass
      ldpp_dout(dpp, 10) << "skipping log shard " << shard_id
          << " at timestamp=" << stable
          << " last_trim=" << *last_trim << dendl;
      return set_cr_done();
    }

    ldpp_dout(dpp, 10) << "trimming log shard " << shard_id
        << " at timestamp=" << stable
        << " last_trim=" << *last_trim << dendl;
    yield {
      std::string oid;
      mdlog->get_shard_oid(shard_id, oid);
      call(new RGWRadosTimelogTrimCR(dpp, env.store, oid, real_time{}, stable, "", ""));
    }
    // ENODATA (nothing in the trim range) is not an error
    if (retcode < 0 && retcode != -ENODATA) {
      ldpp_dout(dpp, 1) << "failed to trim mdlog shard " << shard_id
          << ": " << cpp_strerror(retcode) << dendl;
      return set_cr_error(retcode);
    }
    *last_trim = stable;
    return set_cr_done();
  }
  return 0;
}
+
/// Trim every shard of the current period's mdlog against the master's
/// positions, limiting the number of shards trimmed concurrently.
class MetaPeerTrimShardCollectCR : public RGWShardCollectCR {
  static constexpr int MAX_CONCURRENT_SHARDS = 16;

  PeerTrimEnv& env;
  RGWMetadataLog *mdlog;
  const std::string& period_id;
  RGWMetaSyncEnv meta_env; //< for RGWListRemoteMDLogShardCR
  int shard_id{0}; // next shard to spawn

  /// Treat ENOENT as success; log any other per-shard trim failure.
  int handle_result(int r) override {
    if (r == -ENOENT) { // ENOENT is not a fatal error
      return 0;
    }
    if (r < 0) {
      ldout(cct, 4) << "failed to trim mdlog shard: " << cpp_strerror(r) << dendl;
    }
    return r;
  }
 public:
  MetaPeerTrimShardCollectCR(PeerTrimEnv& env, RGWMetadataLog *mdlog)
    : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
      env(env), mdlog(mdlog), period_id(env.current.get_period().get_id())
  {
    // sync env used by the per-shard CRs to talk to the master zone
    meta_env.init(env.dpp, cct, env.store, env.store->svc()->zone->get_master_conn(),
                  env.store->svc()->rados->get_async_processor(), env.http, nullptr,
                  env.store->getRados()->get_sync_tracer());
  }

  bool spawn_next() override;
};
+
+bool MetaPeerTrimShardCollectCR::spawn_next()
+{
+  if (shard_id >= env.num_shards) {
+    return false;
+  }
+  auto& last_trim = env.last_trim_timestamps[shard_id];
+  spawn(new MetaPeerTrimShardCR(meta_env, mdlog, period_id, shard_id, &last_trim),
+        false);
+  shard_id++;
+  return true;
+}
+
/// Coroutine driving one peer-side trim pass; see operate() for the flow.
class MetaPeerTrimCR : public RGWCoroutine {
  PeerTrimEnv& env;
  rgw_mdlog_info mdlog_info; //< master's mdlog info

 public:
  explicit MetaPeerTrimCR(PeerTrimEnv& env) : RGWCoroutine(env.store->ctx()), env(env) {}

  int operate(const DoutPrefixProvider *dpp) override;
};
+
/// One trim pass on a peer zone: fetch the master's mdlog info, adopt its
/// shard count, purge mdlog periods older than the master's realm epoch,
/// and — if we're on the master's current period — trim shards against the
/// master's positions.
int MetaPeerTrimCR::operate(const DoutPrefixProvider *dpp)
{
  reenter(this) {
    ldpp_dout(dpp, 10) << "fetching master mdlog info" << dendl;
    yield {
      // query mdlog_info from master for oldest_log_period
      rgw_http_param_pair params[] = {
        { "type", "metadata" },
        { nullptr, nullptr }
      };

      using LogInfoCR = RGWReadRESTResourceCR<rgw_mdlog_info>;
      call(new LogInfoCR(cct, env.store->svc()->zone->get_master_conn(), env.http,
                         "/admin/log/", params, &mdlog_info));
    }
    if (retcode < 0) {
      ldpp_dout(dpp, 4) << "failed to read mdlog info from master" << dendl;
      return set_cr_error(retcode);
    }
    // use master's shard count instead
    env.set_num_shards(mdlog_info.num_shards);

    if (mdlog_info.realm_epoch > env.last_trim_epoch + 1) {
      // delete any prior mdlog periods
      yield call(new PurgePeriodLogsCR(dpp, env.store, mdlog_info.realm_epoch,
                                       &env.last_trim_epoch));
    } else {
      ldpp_dout(dpp, 10) << "mdlogs already purged through realm_epoch "
          << env.last_trim_epoch << dendl;
    }

    // if realm_epoch == current, trim mdlog based on master's markers
    if (mdlog_info.realm_epoch == env.current.get_epoch()) {
      yield {
        auto mdlog = env.store->svc()->mdlog->get_log(env.current.get_period().get_id());
        call(new MetaPeerTrimShardCollectCR(env, mdlog));
        // ignore any errors during purge/trim because we want to hold the lock open
      }
    }
    return set_cr_done();
  }
  return 0;
}
+
/// Base polling coroutine: sleep for an interval, take the cluster-wide
/// "meta_trim" lock, then run the trim pass supplied by alloc_cr().
class MetaTrimPollCR : public RGWCoroutine {
  rgw::sal::RadosStore* const store;
  const utime_t interval; //< polling interval
  const rgw_raw_obj obj;  //< lock object (mdlog history oid in the log pool)
  const std::string name{"meta_trim"}; //< lock name
  const std::string cookie; //< random cookie identifying this locker

 protected:
  /// allocate the coroutine to run within the lease
  virtual RGWCoroutine* alloc_cr() = 0;

 public:
  MetaTrimPollCR(rgw::sal::RadosStore* store, utime_t interval)
    : RGWCoroutine(store->ctx()), store(store), interval(interval),
      obj(store->svc()->zone->get_zone_params().log_pool, RGWMetadataLogHistory::oid),
      cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct))
  {}

  int operate(const DoutPrefixProvider *dpp) override;
};
+
/// Endless poll loop: sleep, take the trim lock for the whole interval, run
/// the trim pass. The lock is released early only on failure; on success it
/// is left to expire so other gateways cannot re-trim within the interval.
int MetaTrimPollCR::operate(const DoutPrefixProvider *dpp)
{
  reenter(this) {
    for (;;) {
      set_status("sleeping");
      wait(interval);

      // prevent others from trimming for our entire wait interval
      set_status("acquiring trim lock");
      yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
                                          obj, name, cookie, interval.sec()));
      if (retcode < 0) {
        // another gateway holds the lock; try again next interval
        ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
        continue;
      }

      set_status("trimming");
      yield call(alloc_cr());

      if (retcode < 0) {
        // on errors, unlock so other gateways can try
        set_status("unlocking");
        yield call(new RGWSimpleRadosUnlockCR(store->svc()->rados->get_async_processor(), store,
                                              obj, name, cookie));
      }
    }
  }
  return 0;
}
+
/// Polling trim loop for the metadata master zone: each pass trims against
/// the peers' reported sync status.
class MetaMasterTrimPollCR : public MetaTrimPollCR  {
  MasterTrimEnv env; //< trim state to share between calls
  RGWCoroutine* alloc_cr() override {
    return new MetaMasterTrimCR(env);
  }
 public:
  MetaMasterTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
                       int num_shards, utime_t interval)
    : MetaTrimPollCR(store, interval),
      env(dpp, store, http, num_shards)
  {}
};
+
/// Polling trim loop for a non-master zone: each pass trims against the
/// master's mdlog positions.
class MetaPeerTrimPollCR : public MetaTrimPollCR {
  PeerTrimEnv env; //< trim state to share between calls
  RGWCoroutine* alloc_cr() override {
    return new MetaPeerTrimCR(env);
  }
 public:
  MetaPeerTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
                     int num_shards, utime_t interval)
    : MetaTrimPollCR(store, interval),
      env(dpp, store, http, num_shards)
  {}
};
+
+namespace {
+bool sanity_check_endpoints(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store) {
+  bool retval = true;
+  auto current = store->svc()->mdlog->get_period_history()->get_current();
+  const auto& period = current.get_period();
+  for (const auto& [_, zonegroup] : period.get_map().zonegroups) {
+    if (zonegroup.endpoints.empty()) {
+      ldpp_dout(dpp, -1)
+       << __PRETTY_FUNCTION__ << ":" << __LINE__
+       << " WARNING: Cluster is is misconfigured! "
+       << " Zonegroup " << zonegroup.get_name()
+       << " (" << zonegroup.get_id() << ") in Realm "
+       << period.get_realm_name() << " ( " << period.get_realm() << ") "
+       << " has no endpoints!" << dendl;
+    }
+    for (const auto& [_, zone] : zonegroup.zones) {
+      if (zone.endpoints.empty()) {
+       ldpp_dout(dpp, -1)
+         << __PRETTY_FUNCTION__ << ":" << __LINE__
+         << " ERROR: Cluster is is misconfigured! "
+         << " Zone " << zone.name << " (" << zone.id << ") in Zonegroup "
+         << zonegroup.get_name() << " ( " << zonegroup.get_id()
+         << ") in Realm " << period.get_realm_name()
+         << " ( " << period.get_realm() << ") "
+         << " has no endpoints! Trimming is impossible." << dendl;
+       retval = false;
+      }
+    }
+  }
+  return retval;
+}
+}
+
+RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                                      int num_shards, utime_t interval)
+{
+  if (!sanity_check_endpoints(dpp, store)) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " ERROR: Cluster is is misconfigured! Refusing to trim." << dendl;
+      return nullptr;
+  }
+  if (store->svc()->zone->is_meta_master()) {
+    return new MetaMasterTrimPollCR(dpp, store, http, num_shards, interval);
+  }
+  return new MetaPeerTrimPollCR(dpp, store, http, num_shards, interval);
+}
+
+
/// One-shot master-side trim for radosgw-admin. The private MasterTrimEnv
/// base is listed first so it is constructed before (and can be passed to)
/// the MetaMasterTrimCR base.
struct MetaMasterAdminTrimCR : private MasterTrimEnv, public MetaMasterTrimCR {
  MetaMasterAdminTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
    : MasterTrimEnv(dpp, store, http, num_shards),
      MetaMasterTrimCR(*static_cast<MasterTrimEnv*>(this))
  {}
};
+
/// One-shot peer-side trim for radosgw-admin. The private PeerTrimEnv base
/// is constructed first so it can be handed to the MetaPeerTrimCR base.
struct MetaPeerAdminTrimCR : private PeerTrimEnv, public MetaPeerTrimCR {
  MetaPeerAdminTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
    : PeerTrimEnv(dpp, store, http, num_shards),
      MetaPeerTrimCR(*static_cast<PeerTrimEnv*>(this))
  {}
};
+
+RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+                                            RGWHTTPManager *http,
+                                            int num_shards)
+{
+  if (!sanity_check_endpoints(dpp, store)) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " ERROR: Cluster is is misconfigured! Refusing to trim." << dendl;
+      return nullptr;
+  }
+  if (store->svc()->zone->is_meta_master()) {
+    return new MetaMasterAdminTrimCR(dpp, store, http, num_shards);
+  }
+  return new MetaPeerAdminTrimCR(dpp, store, http, num_shards);
+}
diff --git a/src/rgw/driver/rados/rgw_trim_mdlog.h b/src/rgw/driver/rados/rgw_trim_mdlog.h
new file mode 100644 (file)
index 0000000..1dba861
--- /dev/null
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+class RGWCoroutine;
+class DoutPrefixProvider;
+class RGWRados;
+class RGWHTTPManager;
+class utime_t;
+namespace rgw { namespace sal {
+  class RadosStore;
+} }
+
+// MetaLogTrimCR factory function
+RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp,
+                                      rgw::sal::RadosStore* store,
+                                      RGWHTTPManager *http,
+                                      int num_shards, utime_t interval);
+
+// factory function for mdlog trim via radosgw-admin
+RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp,
+                                            rgw::sal::RadosStore* store,
+                                            RGWHTTPManager *http,
+                                            int num_shards);
diff --git a/src/rgw/driver/rados/rgw_user.cc b/src/rgw/driver/rados/rgw_user.cc
new file mode 100644 (file)
index 0000000..7c36a52
--- /dev/null
@@ -0,0 +1,2768 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_user.h"
+
+#include "rgw_bucket.h"
+
+#include "services/svc_user.h"
+#include "services/svc_meta.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+extern void op_type_to_str(uint32_t mask, char *buf, int len);
+
+static string key_type_to_str(int key_type) {
+  switch (key_type) {
+    case KEY_TYPE_SWIFT:
+      return "swift";
+      break;
+
+    default:
+      return "s3";
+      break;
+  }
+}
+
// True for characters that never need percent-encoding in a URL:
// alphanumerics plus '-', '.', '_', '~' (RFC 3986 "unreserved" set).
// NOTE(review): isalnum() on a negative char is UB — callers appear to pass
// ASCII key material only; confirm.
static bool char_is_unreserved_url(char c)
{
  if (isalnum(c)) {
    return true;
  }
  return c == '-' || c == '.' || c == '_' || c == '~';
}
+
+static bool validate_access_key(string& key)
+{
+  const char *p = key.c_str();
+  while (*p) {
+    if (!char_is_unreserved_url(*p))
+      return false;
+    p++;
+  }
+  return true;
+}
+
// Copy an error message into *sink, but only when a sink was provided and
// there is actually something to report.
static void set_err_msg(std::string *sink, std::string msg)
{
  if (sink != nullptr && !msg.empty()) {
    *sink = msg;
  }
}
+
+/*
+ * Dump either the full user info or a subset to a formatter.
+ *
+ * NOTE: It is the caller's responsibility to ensure that the
+ * formatter is flushed at the correct time.
+ */
+
+static void dump_subusers_info(Formatter *f, RGWUserInfo &info)
+{
+  map<string, RGWSubUser>::iterator uiter;
+
+  f->open_array_section("subusers");
+  for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) {
+    RGWSubUser& u = uiter->second;
+    f->open_object_section("user");
+    string s;
+    info.user_id.to_str(s);
+    f->dump_format("id", "%s:%s", s.c_str(), u.name.c_str());
+    char buf[256];
+    rgw_perm_to_str(u.perm_mask, buf, sizeof(buf));
+    f->dump_string("permissions", buf);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+static void dump_access_keys_info(Formatter *f, RGWUserInfo &info)
+{
+  map<string, RGWAccessKey>::iterator kiter;
+  f->open_array_section("keys");
+  for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) {
+    RGWAccessKey& k = kiter->second;
+    const char *sep = (k.subuser.empty() ? "" : ":");
+    const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
+    f->open_object_section("key");
+    string s;
+    info.user_id.to_str(s);
+    f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser);
+    f->dump_string("access_key", k.id);
+    f->dump_string("secret_key", k.key);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+static void dump_swift_keys_info(Formatter *f, RGWUserInfo &info)
+{
+  map<string, RGWAccessKey>::iterator kiter;
+  f->open_array_section("swift_keys");
+  for (kiter = info.swift_keys.begin(); kiter != info.swift_keys.end(); ++kiter) {
+    RGWAccessKey& k = kiter->second;
+    const char *sep = (k.subuser.empty() ? "" : ":");
+    const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
+    f->open_object_section("key");
+    string s;
+    info.user_id.to_str(s);
+    f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser);
+    f->dump_string("secret_key", k.key);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+static void dump_user_info(Formatter *f, RGWUserInfo &info,
+                           RGWStorageStats *stats = NULL)
+{
+  f->open_object_section("user_info");
+  encode_json("tenant", info.user_id.tenant, f);
+  encode_json("user_id", info.user_id.id, f);
+  encode_json("display_name", info.display_name, f);
+  encode_json("email", info.user_email, f);
+  encode_json("suspended", (int)info.suspended, f);
+  encode_json("max_buckets", (int)info.max_buckets, f);
+
+  dump_subusers_info(f, info);
+  dump_access_keys_info(f, info);
+  dump_swift_keys_info(f, info);
+
+  encode_json("caps", info.caps, f);
+
+  char buf[256];
+  op_type_to_str(info.op_mask, buf, sizeof(buf));
+  encode_json("op_mask", (const char *)buf, f);
+  encode_json("system", (bool)info.system, f);
+  encode_json("admin", (bool)info.admin, f);
+  encode_json("default_placement", info.default_placement.name, f);
+  encode_json("default_storage_class", info.default_placement.storage_class, f);
+  encode_json("placement_tags", info.placement_tags, f);
+  encode_json("bucket_quota", info.quota.bucket_quota, f);
+  encode_json("user_quota", info.quota.user_quota, f);
+  encode_json("temp_url_keys", info.temp_url_keys, f);
+
+  string user_source_type;
+  switch ((RGWIdentityType)info.type) {
+  case TYPE_RGW:
+    user_source_type = "rgw";
+    break;
+  case TYPE_KEYSTONE:
+    user_source_type = "keystone";
+    break;
+  case TYPE_LDAP:
+    user_source_type = "ldap";
+    break;
+  case TYPE_NONE:
+    user_source_type = "none";
+    break;
+  default:
+    user_source_type = "none";
+    break;
+  }
+  encode_json("type", user_source_type, f);
+  encode_json("mfa_ids", info.mfa_ids, f);
+  if (stats) {
+    encode_json("stats", *stats, f);
+  }
+  f->close_section();
+}
+
+static int user_add_helper(RGWUserAdminOpState& op_state, std::string *err_msg)
+{
+  int ret = 0;
+  const rgw_user& uid = op_state.get_user_id();
+  std::string user_email = op_state.get_user_email();
+  std::string display_name = op_state.get_display_name();
+
+  // fail if the user exists already
+  if (op_state.has_existing_user()) {
+    if (op_state.found_by_email) {
+      set_err_msg(err_msg, "email: " + user_email +
+          " is the email address of an existing user");
+      ret = -ERR_EMAIL_EXIST;
+    } else if (op_state.found_by_key) {
+      set_err_msg(err_msg, "duplicate key provided");
+      ret = -ERR_KEY_EXIST;
+    } else {
+      set_err_msg(err_msg, "user: " + uid.to_str() + " exists");
+      ret = -EEXIST;
+    }
+    return ret;
+  }
+
+  // fail if the user_info has already been populated
+  if (op_state.is_populated()) {
+    set_err_msg(err_msg, "cannot overwrite already populated user");
+    return -EEXIST;
+  }
+
+  // fail if the display name was not included
+  if (display_name.empty()) {
+    set_err_msg(err_msg, "no display name specified");
+    return -EINVAL;
+  }
+
+  return ret;
+}
+
+RGWAccessKeyPool::RGWAccessKeyPool(RGWUser* usr)
+{
+  if (!usr) {
+    return;
+  }
+
+  user = usr;
+
+  driver = user->get_driver();
+}
+
+int RGWAccessKeyPool::init(RGWUserAdminOpState& op_state)
+{
+  if (!op_state.is_initialized()) {
+    keys_allowed = false;
+    return -EINVAL;
+  }
+
+  const rgw_user& uid = op_state.get_user_id();
+  if (uid.compare(RGW_USER_ANON_ID) == 0) {
+    keys_allowed = false;
+    return -EINVAL;
+  }
+
+  swift_keys = op_state.get_swift_keys();
+  access_keys = op_state.get_access_keys();
+
+  keys_allowed = true;
+
+  return 0;
+}
+
// Start from the anonymous user as a placeholder; set_user_id() /
// set_user_info() later replace the identity with the real one.
RGWUserAdminOpState::RGWUserAdminOpState(rgw::sal::Driver* driver)
{
  user = driver->get_user(rgw_user(RGW_USER_ANON_ID));
}
+
+void RGWUserAdminOpState::set_user_id(const rgw_user& id)
+{
+  if (id.empty())
+    return;
+
+  user->get_info().user_id = id;
+}
+
+void RGWUserAdminOpState::set_subuser(std::string& _subuser)
+{
+  if (_subuser.empty())
+    return;
+
+  size_t pos = _subuser.find(":");
+  if (pos != string::npos) {
+    rgw_user tmp_id;
+    tmp_id.from_str(_subuser.substr(0, pos));
+    if (tmp_id.tenant.empty()) {
+      user->get_info().user_id.id = tmp_id.id;
+    } else {
+      user->get_info().user_id = tmp_id;
+    }
+    subuser = _subuser.substr(pos+1);
+  } else {
+    subuser = _subuser;
+  }
+
+  subuser_specified = true;
+}
+
// Replace the cached user info wholesale with the supplied record.
void RGWUserAdminOpState::set_user_info(RGWUserInfo& user_info)
{
  user->get_info() = user_info;
}
+
// Copy the supplied object-version tracker into the cached user.
void RGWUserAdminOpState::set_user_version_tracker(RGWObjVersionTracker& objv_tracker)
{
  user->get_version_tracker() = objv_tracker;
}
+
// Accessor: the id of the user this op state targets.
const rgw_user& RGWUserAdminOpState::get_user_id()
{
  return user->get_id();
}
+
// Accessor: mutable reference to the cached user info record.
RGWUserInfo& RGWUserAdminOpState::get_user_info()
{
  return user->get_info();
}
+
// Accessor: mutable pointer to the user's swift key map.
map<std::string, RGWAccessKey>* RGWUserAdminOpState::get_swift_keys()
{
  return &user->get_info().swift_keys;
}
+
// Accessor: mutable pointer to the user's S3 access key map.
map<std::string, RGWAccessKey>* RGWUserAdminOpState::get_access_keys()
{
  return &user->get_info().access_keys;
}
+
// Accessor: mutable pointer to the user's subuser map.
map<std::string, RGWSubUser>* RGWUserAdminOpState::get_subusers()
{
  return &user->get_info().subusers;
}
+
// Accessor: mutable pointer to the user's capability set.
RGWUserCaps *RGWUserAdminOpState::get_caps_obj()
{
  return &user->get_info().caps;
}
+
+std::string RGWUserAdminOpState::build_default_swift_kid()
+{
+  if (user->get_id().empty() || subuser.empty())
+    return "";
+
+  std::string kid;
+  user->get_id().to_str(kid);
+  kid.append(":");
+  kid.append(subuser);
+
+  return kid;
+}
+
+std::string RGWUserAdminOpState::generate_subuser() {
+  if (user->get_id().empty())
+    return "";
+
+  std::string generated_subuser;
+  user->get_id().to_str(generated_subuser);
+  std::string rand_suffix;
+
+  int sub_buf_size = RAND_SUBUSER_LEN + 1;
+  char sub_buf[RAND_SUBUSER_LEN + 1];
+
+  gen_rand_alphanumeric_upper(g_ceph_context, sub_buf, sub_buf_size);
+
+  rand_suffix = sub_buf;
+  if (rand_suffix.empty())
+    return "";
+
+  generated_subuser.append(rand_suffix);
+  subuser = generated_subuser;
+
+  return generated_subuser;
+}
+
+/*
+ * Do a fairly exhaustive search for an existing key matching the parameters
+ * given. Also handles the case where no key type was specified and updates
+ * the operation state if needed.
+ */
+
+bool RGWAccessKeyPool::check_existing_key(RGWUserAdminOpState& op_state)
+{
+  bool existing_key = false;
+
+  int key_type = op_state.get_key_type();
+  std::string kid = op_state.get_access_key();
+  std::map<std::string, RGWAccessKey>::iterator kiter;
+  std::string swift_kid = op_state.build_default_swift_kid();
+
+  RGWUserInfo dup_info;
+
+  if (kid.empty() && swift_kid.empty())
+    return false;
+
+  switch (key_type) {
+  case KEY_TYPE_SWIFT:
+    kiter = swift_keys->find(swift_kid);
+
+    existing_key = (kiter != swift_keys->end());
+    if (existing_key)
+      op_state.set_access_key(swift_kid);
+
+    break;
+  case KEY_TYPE_S3:
+    kiter = access_keys->find(kid);
+    existing_key = (kiter != access_keys->end());
+
+    break;
+  default:
+    kiter = access_keys->find(kid);
+
+    existing_key = (kiter != access_keys->end());
+    if (existing_key) {
+      op_state.set_key_type(KEY_TYPE_S3);
+      break;
+    }
+
+    kiter = swift_keys->find(kid);
+
+    existing_key = (kiter != swift_keys->end());
+    if (existing_key) {
+      op_state.set_key_type(KEY_TYPE_SWIFT);
+      break;
+    }
+
+    // handle the case where the access key was not provided in user:key format
+    if (swift_kid.empty())
+      return false;
+
+    kiter = swift_keys->find(swift_kid);
+
+    existing_key = (kiter != swift_keys->end());
+    if (existing_key) {
+      op_state.set_access_key(swift_kid);
+      op_state.set_key_type(KEY_TYPE_SWIFT);
+    }
+  }
+
+  op_state.set_existing_key(existing_key);
+
+  return existing_key;
+}
+
+int RGWAccessKeyPool::check_op(RGWUserAdminOpState& op_state,
+     std::string *err_msg)
+{
+  RGWUserInfo dup_info;
+
+  if (!op_state.is_populated()) {
+    set_err_msg(err_msg, "user info was not populated");
+    return -EINVAL;
+  }
+
+  if (!keys_allowed) {
+    set_err_msg(err_msg, "keys not allowed for this user");
+    return -EACCES;
+  }
+
+  int32_t key_type = op_state.get_key_type();
+
+  // if a key type wasn't specified
+  if (key_type < 0) {
+      if (op_state.has_subuser()) {
+        key_type = KEY_TYPE_SWIFT;
+      } else {
+        key_type = KEY_TYPE_S3;
+      }
+  }
+
+  op_state.set_key_type(key_type);
+
+  /* see if the access key was specified */
+  if (key_type == KEY_TYPE_S3 && !op_state.will_gen_access() && 
+      op_state.get_access_key().empty()) {
+    set_err_msg(err_msg, "empty access key");
+    return -ERR_INVALID_ACCESS_KEY;
+  }
+
+  // don't check for secret key because we may be doing a removal
+
+  if (check_existing_key(op_state)) {
+    op_state.set_access_key_exist();
+  }
+  return 0;
+}
+
// Generate a new random key
// Creates a new S3 or swift key for the user described by op_state and
// inserts it into the matching in-memory key map (access_keys or
// swift_keys); the caller is responsible for persisting the user record.
// The key id and/or secret are taken from op_state or generated randomly
// depending on will_gen_access()/will_gen_secret().
// Returns 0 on success, negative error code with *err_msg set on failure.
int RGWAccessKeyPool::generate_key(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state,
                                  optional_yield y, std::string *err_msg)
{
  std::string id;
  std::string key;

  std::pair<std::string, RGWAccessKey> key_pair;
  RGWAccessKey new_key;
  std::unique_ptr<rgw::sal::User> duplicate_check;

  int key_type = op_state.get_key_type();
  bool gen_access = op_state.will_gen_access();
  bool gen_secret = op_state.will_gen_secret();

  if (!keys_allowed) {
    set_err_msg(err_msg, "access keys not allowed for this user");
    return -EACCES;
  }

  if (op_state.has_existing_key()) {
    set_err_msg(err_msg, "cannot create existing key");
    return -ERR_KEY_EXIST;
  }

  if (!gen_access) {
    id = op_state.get_access_key();
  }

  // a caller-supplied key id must not collide with any existing user
  // (lookup success, i.e. >= 0, means the id is already taken)
  if (!id.empty()) {
    switch (key_type) {
    case KEY_TYPE_SWIFT:
      if (driver->get_user_by_swift(dpp, id, y, &duplicate_check) >= 0) {
        set_err_msg(err_msg, "existing swift key in RGW system:" + id);
        return -ERR_KEY_EXIST;
      }
      break;
    case KEY_TYPE_S3:
      if (driver->get_user_by_access_key(dpp, id, y, &duplicate_check) >= 0) {
        set_err_msg(err_msg, "existing S3 key in RGW system:" + id);
        return -ERR_KEY_EXIST;
      }
    }
  }

  //key's subuser
  if (op_state.has_subuser()) {
    //create user and subuser at the same time, user's s3 key should not be set this
    if (!op_state.key_type_setbycontext || (key_type == KEY_TYPE_SWIFT)) {
      new_key.subuser = op_state.get_subuser();
    }
  }

  //Secret key
  if (!gen_secret) {
    if (op_state.get_secret_key().empty()) {
      set_err_msg(err_msg, "empty secret key");
      return -ERR_INVALID_SECRET_KEY;
    }

    key = op_state.get_secret_key();
  } else {
    // random alphanumeric secret of SECRET_KEY_LEN characters
    char secret_key_buf[SECRET_KEY_LEN + 1];
    gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, sizeof(secret_key_buf));
    key = secret_key_buf;
  }

  // Generate the access key
  if (key_type == KEY_TYPE_S3 && gen_access) {
    char public_id_buf[PUBLIC_ID_LEN + 1];

    // retry until the generated id is valid and not already in use
    // (get_user_by_access_key returning 0 means the id is taken)
    do {
      int id_buf_size = sizeof(public_id_buf);
      gen_rand_alphanumeric_upper(g_ceph_context, public_id_buf, id_buf_size);
      id = public_id_buf;
      if (!validate_access_key(id))
        continue;

    } while (!driver->get_user_by_access_key(dpp, id, y, &duplicate_check));
  }

  if (key_type == KEY_TYPE_SWIFT) {
    // swift key ids are always derived from "<uid>:<subuser>"
    id = op_state.build_default_swift_kid();
    if (id.empty()) {
      set_err_msg(err_msg, "empty swift access key");
      return -ERR_INVALID_ACCESS_KEY;
    }

    // check that the access key doesn't exist
    if (driver->get_user_by_swift(dpp, id, y, &duplicate_check) >= 0) {
      set_err_msg(err_msg, "cannot create existing swift key");
      return -ERR_KEY_EXIST;
    }
  }

  // finally create the new key
  new_key.id = id;
  new_key.key = key;

  key_pair.first = id;
  key_pair.second = new_key;

  // stage the key in the in-memory map; persisted by a later user update
  if (key_type == KEY_TYPE_S3) {
    access_keys->insert(key_pair);
  } else if (key_type == KEY_TYPE_SWIFT) {
    swift_keys->insert(key_pair);
  }

  return 0;
}
+
+// modify an existing key
+int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err_msg)
+{
+  std::string id;
+  std::string key = op_state.get_secret_key();
+  int key_type = op_state.get_key_type();
+
+  RGWAccessKey modify_key;
+
+  pair<string, RGWAccessKey> key_pair;
+  map<std::string, RGWAccessKey>::iterator kiter;
+
+  switch (key_type) {
+  case KEY_TYPE_S3:
+    id = op_state.get_access_key();
+    if (id.empty()) {
+      set_err_msg(err_msg, "no access key specified");
+      return -ERR_INVALID_ACCESS_KEY;
+    }
+    break;
+  case KEY_TYPE_SWIFT:
+    id = op_state.build_default_swift_kid();
+    if (id.empty()) {
+      set_err_msg(err_msg, "no subuser specified");
+      return -EINVAL;
+    }
+    break;
+  default:
+    set_err_msg(err_msg, "invalid key type");
+    return -ERR_INVALID_KEY_TYPE;
+  }
+
+  if (!op_state.has_existing_key()) {
+    set_err_msg(err_msg, "key does not exist");
+    return -ERR_INVALID_ACCESS_KEY;
+  }
+
+  key_pair.first = id;
+
+  if (key_type == KEY_TYPE_SWIFT) {
+    modify_key.id = id;
+    modify_key.subuser = op_state.get_subuser();
+  } else if (key_type == KEY_TYPE_S3) {
+    kiter = access_keys->find(id);
+    if (kiter != access_keys->end()) {
+      modify_key = kiter->second;
+    }
+  }
+
+  if (op_state.will_gen_secret()) {
+    char secret_key_buf[SECRET_KEY_LEN + 1];
+    int key_buf_size = sizeof(secret_key_buf);
+    gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, key_buf_size);
+    key = secret_key_buf;
+  }
+
+  if (key.empty()) {
+      set_err_msg(err_msg, "empty secret key");
+      return -ERR_INVALID_SECRET_KEY;
+  }
+
+  // update the access key with the new secret key
+  modify_key.key = key;
+
+  key_pair.second = modify_key;
+
+
+  if (key_type == KEY_TYPE_S3) {
+    (*access_keys)[id] = modify_key;
+  } else if (key_type == KEY_TYPE_SWIFT) {
+    (*swift_keys)[id] = modify_key;
+  }
+
+  return 0;
+}
+
+int RGWAccessKeyPool::execute_add(const DoutPrefixProvider *dpp, 
+                                  RGWUserAdminOpState& op_state,
+                                 std::string *err_msg, bool defer_user_update,
+                                 optional_yield y)
+{
+  int ret = 0;
+
+  std::string subprocess_msg;
+  int key_op = GENERATE_KEY;
+
+  // set the op
+  if (op_state.has_existing_key())
+    key_op = MODIFY_KEY;
+
+  switch (key_op) {
+  case GENERATE_KEY:
+    ret = generate_key(dpp, op_state, y, &subprocess_msg);
+    break;
+  case MODIFY_KEY:
+    ret = modify_key(op_state, &subprocess_msg);
+    break;
+  }
+
+  if (ret < 0) {
+    set_err_msg(err_msg, subprocess_msg);
+    return ret;
+  }
+
+  // store the updated info
+  if (!defer_user_update)
+    ret = user->update(dpp, op_state, err_msg, y);
+
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+int RGWAccessKeyPool::add(const DoutPrefixProvider *dpp, 
+                          RGWUserAdminOpState& op_state, optional_yield y,
+                         std::string *err_msg)
+{
+  return add(dpp, op_state, err_msg, false, y);
+}
+
+int RGWAccessKeyPool::add(const DoutPrefixProvider *dpp, 
+                          RGWUserAdminOpState& op_state, std::string *err_msg,
+                         bool defer_user_update, optional_yield y)
+{
+  int ret;
+  std::string subprocess_msg;
+
+  ret = check_op(op_state, &subprocess_msg);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+    return ret;
+  }
+
+  ret = execute_add(dpp, op_state, &subprocess_msg, defer_user_update, y);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to add access key, " + subprocess_msg);
+    return ret;
+  }
+
+  return 0;
+}
+
+int RGWAccessKeyPool::execute_remove(const DoutPrefixProvider *dpp, 
+                                     RGWUserAdminOpState& op_state,
+                                    std::string *err_msg,
+                                    bool defer_user_update,
+                                    optional_yield y)
+{
+  int ret = 0;
+
+  int key_type = op_state.get_key_type();
+  std::string id = op_state.get_access_key();
+  map<std::string, RGWAccessKey>::iterator kiter;
+  map<std::string, RGWAccessKey> *keys_map;
+
+  if (!op_state.has_existing_key()) {
+    set_err_msg(err_msg, "unable to find access key,  with key type: " +
+                             key_type_to_str(key_type));
+    return -ERR_INVALID_ACCESS_KEY;
+  }
+
+  if (key_type == KEY_TYPE_S3) {
+    keys_map = access_keys;
+  } else if (key_type == KEY_TYPE_SWIFT) {
+    keys_map = swift_keys;
+  } else {
+    keys_map = NULL;
+    set_err_msg(err_msg, "invalid access key");
+    return -ERR_INVALID_ACCESS_KEY;
+  }
+
+  kiter = keys_map->find(id);
+  if (kiter == keys_map->end()) {
+    set_err_msg(err_msg, "key not found");
+    return -ERR_INVALID_ACCESS_KEY;
+  }
+
+  keys_map->erase(kiter);
+
+  if (!defer_user_update)
+    ret = user->update(dpp, op_state, err_msg, y);
+
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+int RGWAccessKeyPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+                            std::string *err_msg)
+{
+  return remove(dpp, op_state, err_msg, false, y);
+}
+
+int RGWAccessKeyPool::remove(const DoutPrefixProvider *dpp, 
+                             RGWUserAdminOpState& op_state,
+                            std::string *err_msg, bool defer_user_update,
+                            optional_yield y)
+{
+  int ret;
+
+  std::string subprocess_msg;
+
+  ret = check_op(op_state, &subprocess_msg);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+    return ret;
+  }
+
+  ret = execute_remove(dpp, op_state, &subprocess_msg, defer_user_update, y);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to remove access key, " + subprocess_msg);
+    return ret;
+  }
+
+  return 0;
+}
+
+// remove all keys associated with a subuser
+int RGWAccessKeyPool::remove_subuser_keys(const DoutPrefixProvider *dpp, 
+                                          RGWUserAdminOpState& op_state,
+                                         std::string *err_msg,
+                                         bool defer_user_update,
+                                         optional_yield y)
+{
+  int ret = 0;
+
+  if (!op_state.is_populated()) {
+    set_err_msg(err_msg, "user info was not populated");
+    return -EINVAL;
+  }
+
+  if (!op_state.has_subuser()) {
+    set_err_msg(err_msg, "no subuser specified");
+    return -EINVAL;
+  }
+
+  std::string swift_kid = op_state.build_default_swift_kid();
+  if (swift_kid.empty()) {
+    set_err_msg(err_msg, "empty swift access key");
+    return -EINVAL;
+  }
+
+  map<std::string, RGWAccessKey>::iterator kiter;
+  map<std::string, RGWAccessKey> *keys_map;
+
+  // a subuser can have at most one swift key
+  keys_map = swift_keys;
+  kiter = keys_map->find(swift_kid);
+  if (kiter != keys_map->end()) {
+    keys_map->erase(kiter);
+  }
+
+  // a subuser may have multiple s3 key pairs
+  std::string subuser_str = op_state.get_subuser();
+  keys_map = access_keys;
+  RGWUserInfo user_info = op_state.get_user_info();
+  auto user_kiter = user_info.access_keys.begin();
+  for (; user_kiter != user_info.access_keys.end(); ++user_kiter) {
+    if (user_kiter->second.subuser == subuser_str) {
+      kiter = keys_map->find(user_kiter->first);
+      if (kiter != keys_map->end()) {
+        keys_map->erase(kiter);
+      }
+    }
+  }
+
+  if (!defer_user_update)
+    ret = user->update(dpp, op_state, err_msg, y);
+
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+RGWSubUserPool::RGWSubUserPool(RGWUser *usr)
+{
+  if (!usr) {
+    return;
+  }
+
+  user = usr;
+
+  subusers_allowed = true;
+  driver = user->get_driver();
+}
+
+int RGWSubUserPool::init(RGWUserAdminOpState& op_state)
+{
+  if (!op_state.is_initialized()) {
+    subusers_allowed = false;
+    return -EINVAL;
+  }
+
+  const rgw_user& uid = op_state.get_user_id();
+  if (uid.compare(RGW_USER_ANON_ID) == 0) {
+    subusers_allowed = false;
+    return -EACCES;
+  }
+
+  subuser_map = op_state.get_subusers();
+  if (subuser_map == NULL) {
+    subusers_allowed = false;
+    return -EINVAL;
+  }
+
+  subusers_allowed = true;
+
+  return 0;
+}
+
+bool RGWSubUserPool::exists(std::string subuser)
+{
+  if (subuser.empty())
+    return false;
+
+  if (!subuser_map)
+    return false;
+
+  if (subuser_map->count(subuser))
+    return true;
+
+  return false;
+}
+
// Validate a subuser operation before execution.
// Requires populated user info, a user for whom subusers are allowed, and
// either a subuser name or a request to generate one. Records in op_state
// whether the named subuser already exists.
// Returns 0 on success, negative error code with *err_msg set on failure.
int RGWSubUserPool::check_op(RGWUserAdminOpState& op_state,
        std::string *err_msg)
{
  bool existing = false;
  std::string subuser = op_state.get_subuser();

  if (!op_state.is_populated()) {
    set_err_msg(err_msg, "user info was not populated");
    return -EINVAL;
  }

  if (!subusers_allowed) {
    set_err_msg(err_msg, "subusers not allowed for this user");
    return -EACCES;
  }

  if (subuser.empty() && !op_state.will_gen_subuser()) {
    set_err_msg(err_msg, "empty subuser name");
    return -EINVAL;
  }

  if (op_state.get_subuser_perm() == RGW_PERM_INVALID) {
    set_err_msg(err_msg, "invalid subuser access");
    return -EINVAL;
  }

  //set key type when it not set or set by context
  // (subuser keys default to swift; key_type_setbycontext marks the value
  // as implied by this code path rather than requested by the caller)
  if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) {
    op_state.set_key_type(KEY_TYPE_SWIFT);
    op_state.key_type_setbycontext = true;
  }

  // check if the subuser exists
  if (!subuser.empty())
    existing = exists(subuser);

  op_state.set_existing_subuser(existing);

  return 0;
}
+
+int RGWSubUserPool::execute_add(const DoutPrefixProvider *dpp, 
+                                RGWUserAdminOpState& op_state,
+                               std::string *err_msg, bool defer_user_update,
+                               optional_yield y)
+{
+  int ret = 0;
+  std::string subprocess_msg;
+
+  RGWSubUser subuser;
+  std::pair<std::string, RGWSubUser> subuser_pair;
+  std::string subuser_str = op_state.get_subuser();
+
+  subuser_pair.first = subuser_str;
+
+  // assumes key should be created
+  if (op_state.has_key_op()) {
+    ret = user->keys.add(dpp, op_state, &subprocess_msg, true, y);
+    if (ret < 0) {
+      set_err_msg(err_msg, "unable to create subuser key, " + subprocess_msg);
+      return ret;
+    }
+  }
+
+  // create the subuser
+  subuser.name = subuser_str;
+
+  if (op_state.has_subuser_perm())
+    subuser.perm_mask = op_state.get_subuser_perm();
+
+  // insert the subuser into user info
+  subuser_pair.second = subuser;
+  subuser_map->insert(subuser_pair);
+
+  // attempt to save the subuser
+  if (!defer_user_update)
+    ret = user->update(dpp, op_state, err_msg, y);
+
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+int RGWSubUserPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+                       std::string *err_msg)
+{
+  return add(dpp, op_state, err_msg, false, y);
+}
+
// Add a subuser: validate the request, default key/secret generation
// where the caller supplied none, then create the subuser (and keys).
int RGWSubUserPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update, optional_yield y)
{
  std::string subprocess_msg;
  int ret;
  // NOTE(review): key_type is captured before check_op(), which may itself
  // default the type to swift — so this pre-check value is only S3 when
  // the caller requested S3 explicitly. Confirm that is intentional.
  int32_t key_type = op_state.get_key_type();

  ret = check_op(op_state, &subprocess_msg);
  if (ret < 0) {
    set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
    return ret;
  }

  if (op_state.get_access_key_exist()) {
    set_err_msg(err_msg, "cannot create existing key");
    return -ERR_KEY_EXIST;
  }

  // auto-generate whatever key material the caller did not supply
  if (key_type == KEY_TYPE_S3 && op_state.get_access_key().empty()) {
    op_state.set_gen_access();
  }

  if (op_state.get_secret_key().empty()) {
    op_state.set_gen_secret();
  }

  ret = execute_add(dpp, op_state, &subprocess_msg, defer_user_update, y);
  if (ret < 0) {
    set_err_msg(err_msg, "unable to create subuser, " + subprocess_msg);
    return ret;
  }

  return 0;
}
+
// Remove a subuser: purge all of its keys, erase it from the subuser map
// and optionally persist the user record.
int RGWSubUserPool::execute_remove(const DoutPrefixProvider *dpp,
                                   RGWUserAdminOpState& op_state,
                                  std::string *err_msg, bool defer_user_update,
                                  optional_yield y)
{
  int ret = 0;
  std::string subprocess_msg;

  std::string subuser_str = op_state.get_subuser();

  // NOTE(review): existence is checked twice — via the map lookup here and
  // via the has_existing_subuser() flag set earlier by check_op(); both
  // must agree for removal to proceed.
  map<std::string, RGWSubUser>::iterator siter;
  siter = subuser_map->find(subuser_str);
  if (siter == subuser_map->end()){
    set_err_msg(err_msg, "subuser not found: " + subuser_str);
    return -ERR_NO_SUCH_SUBUSER;
  }
  if (!op_state.has_existing_subuser()) {
    set_err_msg(err_msg, "subuser not found: " + subuser_str);
    return -ERR_NO_SUCH_SUBUSER;
  }

  // always purge all associate keys
  // NOTE(review): the return value is discarded, making the key purge
  // best-effort — confirm failures here should not abort the removal.
  user->keys.remove_subuser_keys(dpp, op_state, &subprocess_msg, true, y);

  // remove the subuser from the user info
  subuser_map->erase(siter);

  // attempt to save the subuser
  if (!defer_user_update)
    ret = user->update(dpp, op_state, err_msg, y);

  if (ret < 0)
    return ret;

  return 0;
}
+
+int RGWSubUserPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+                          std::string *err_msg)
+{
+  return remove(dpp, op_state, err_msg, false, y);
+}
+
+int RGWSubUserPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+                          bool defer_user_update, optional_yield y)
+{
+  std::string subprocess_msg;
+  int ret;
+
+  ret = check_op(op_state, &subprocess_msg);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+    return ret;
+  }
+
+  ret = execute_remove(dpp, op_state, &subprocess_msg, defer_user_update, y);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to remove subuser, " + subprocess_msg);
+    return ret;
+  }
+
+  return 0;
+}
+
// Apply a subuser modification: optionally add/replace its keys, update
// the permission mask, and re-insert the record into the subuser map.
int RGWSubUserPool::execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update, optional_yield y)
{
  int ret = 0;
  std::string subprocess_msg;
  std::map<std::string, RGWSubUser>::iterator siter;
  std::pair<std::string, RGWSubUser> subuser_pair;

  std::string subuser_str = op_state.get_subuser();
  RGWSubUser subuser;

  if (!op_state.has_existing_subuser()) {
    set_err_msg(err_msg, "subuser does not exist");
    return -ERR_NO_SUCH_SUBUSER;
  }

  subuser_pair.first = subuser_str;

  // deref without an end() check: has_existing_subuser() above was set by
  // check_op() via exists(), so the entry is expected to be present —
  // NOTE(review): confirm nothing mutates the map between those points.
  siter = subuser_map->find(subuser_str);
  subuser = siter->second;

  if (op_state.has_key_op()) {
    ret = user->keys.add(dpp, op_state, &subprocess_msg, true, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to create subuser keys, " + subprocess_msg);
      return ret;
    }
  }

  if (op_state.has_subuser_perm())
    subuser.perm_mask = op_state.get_subuser_perm();

  subuser_pair.second = subuser;

  // replace the existing entry with the modified record
  subuser_map->erase(siter);
  subuser_map->insert(subuser_pair);

  // attempt to save the subuser
  if (!defer_user_update)
    ret = user->update(dpp, op_state, err_msg, y);

  if (ret < 0)
    return ret;

  return 0;
}
+
+int RGWSubUserPool::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+  return RGWSubUserPool::modify(dpp, op_state, y, err_msg, false);
+}
+
+int RGWSubUserPool::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg, bool defer_user_update)
+{
+  std::string subprocess_msg;
+  int ret;
+
+  RGWSubUser subuser;
+
+  ret = check_op(op_state, &subprocess_msg);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+    return ret;
+  }
+
+  ret = execute_modify(dpp, op_state, &subprocess_msg, defer_user_update, y);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to modify subuser, " + subprocess_msg);
+    return ret;
+  }
+
+  return 0;
+}
+
+RGWUserCapPool::RGWUserCapPool(RGWUser *usr)
+{
+  if (!usr) {
+    return;
+  }
+  user = usr;
+  caps_allowed = true;
+}
+
+int RGWUserCapPool::init(RGWUserAdminOpState& op_state)
+{
+  if (!op_state.is_initialized()) {
+    caps_allowed = false;
+    return -EINVAL;
+  }
+
+  const rgw_user& uid = op_state.get_user_id();
+  if (uid.compare(RGW_USER_ANON_ID) == 0) {
+    caps_allowed = false;
+    return -EACCES;
+  }
+
+  caps = op_state.get_caps_obj();
+  if (!caps) {
+    caps_allowed = false;
+    return -ERR_INVALID_CAP;
+  }
+
+  caps_allowed = true;
+
+  return 0;
+}
+
+int RGWUserCapPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+                       std::string *err_msg)
+{
+  return add(dpp, op_state, err_msg, false, y);
+}
+
+int RGWUserCapPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+                       bool defer_save, optional_yield y)
+{
+  int ret = 0;
+  std::string caps_str = op_state.get_caps();
+
+  if (!op_state.is_populated()) {
+    set_err_msg(err_msg, "user info was not populated");
+    return -EINVAL;
+  }
+
+  if (!caps_allowed) {
+    set_err_msg(err_msg, "caps not allowed for this user");
+    return -EACCES;
+  }
+
+  if (caps_str.empty()) {
+    set_err_msg(err_msg, "empty user caps");
+    return -ERR_INVALID_CAP;
+  }
+
+  int r = caps->add_from_string(caps_str);
+  if (r < 0) {
+    set_err_msg(err_msg, "unable to add caps: " + caps_str);
+    return r;
+  }
+
+  if (!defer_save)
+    ret = user->update(dpp, op_state, err_msg, y);
+
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+int RGWUserCapPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+                          std::string *err_msg)
+{
+  return remove(dpp, op_state, err_msg, false, y);
+}
+
+int RGWUserCapPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+                          bool defer_save, optional_yield y)
+{
+  int ret = 0;
+
+  std::string caps_str = op_state.get_caps();
+
+  if (!op_state.is_populated()) {
+    set_err_msg(err_msg, "user info was not populated");
+    return -EINVAL;
+  }
+
+  if (!caps_allowed) {
+    set_err_msg(err_msg, "caps not allowed for this user");
+    return -EACCES;
+  }
+
+  if (caps_str.empty()) {
+    set_err_msg(err_msg, "empty user caps");
+    return -ERR_INVALID_CAP;
+  }
+
+  int r = caps->remove_from_string(caps_str);
+  if (r < 0) {
+    set_err_msg(err_msg, "unable to remove caps: " + caps_str);
+    return r;
+  }
+
+  if (!defer_save)
+    ret = user->update(dpp, op_state, err_msg, y);
+
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+RGWUser::RGWUser() : caps(this), keys(this), subusers(this)
+{
+  init_default();
+}
+
+int RGWUser::init(const DoutPrefixProvider *dpp, rgw::sal::Driver* _driver,
+                 RGWUserAdminOpState& op_state, optional_yield y)
+{
+  init_default();
+  int ret = init_storage(_driver);
+  if (ret < 0)
+    return ret;
+
+  ret = init(dpp, op_state, y);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+void RGWUser::init_default()
+{
+  // use anonymous user info as a placeholder
+  rgw_get_anon_user(old_info);
+  user_id = RGW_USER_ANON_ID;
+
+  clear_populated();
+}
+
+int RGWUser::init_storage(rgw::sal::Driver* _driver)
+{
+  if (!_driver) {
+    return -EINVAL;
+  }
+
+  driver = _driver;
+
+  clear_populated();
+
+  /* API wrappers */
+  keys = RGWAccessKeyPool(this);
+  caps = RGWUserCapPool(this);
+  subusers = RGWSubUserPool(this);
+
+  return 0;
+}
+
// Resolve the user referenced by op_state and populate this object.
// Lookup order: explicit uid (possibly parsed from a "uid:subuser" name),
// then e-mail (only when rgw_user_unique_email is enabled), then swift
// key, then S3 access key. On success the loaded info is cached in
// old_info and both this object and op_state are marked populated.
int RGWUser::init(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y)
{
  bool found = false;
  std::string swift_user;
  user_id = op_state.get_user_id();
  std::string user_email = op_state.get_user_email();
  std::string access_key = op_state.get_access_key();
  std::string subuser = op_state.get_subuser();

  int key_type = op_state.get_key_type();
  if (key_type == KEY_TYPE_SWIFT) {
    // for swift requests the "access key" field carries the swift user id
    swift_user = op_state.get_access_key();
    access_key.clear();
  }

  std::unique_ptr<rgw::sal::User> user;

  clear_populated();

  // derive the uid from a fully-qualified "uid:subuser" name if needed
  if (user_id.empty() && !subuser.empty()) {
    size_t pos = subuser.find(':');
    if (pos != string::npos) {
      user_id = subuser.substr(0, pos);
      op_state.set_user_id(user_id);
    }
  }

  if (!user_id.empty() && (user_id.compare(RGW_USER_ANON_ID) != 0)) {
    user = driver->get_user(user_id);
    found = (user->load_user(dpp, y) >= 0);
    op_state.found_by_uid = found;
  }
  if (driver->ctx()->_conf.get_val<bool>("rgw_user_unique_email")) {
    if (!user_email.empty() && !found) {
      found = (driver->get_user_by_email(dpp, user_email, y, &user) >= 0);
      op_state.found_by_email = found;
    }
  }
  if (!swift_user.empty() && !found) {
    found = (driver->get_user_by_swift(dpp, swift_user, y, &user) >= 0);
    op_state.found_by_key = found;
  }
  if (!access_key.empty() && !found) {
    found = (driver->get_user_by_access_key(dpp, access_key, y, &user) >= 0);
    op_state.found_by_key = found;
  }
  
  op_state.set_existing_user(found);
  if (found) {
    // cache the loaded state so later updates can diff against it
    op_state.set_user_info(user->get_info());
    op_state.set_populated();
    op_state.objv = user->get_version_tracker();
    op_state.set_user_version_tracker(user->get_version_tracker());

    old_info = user->get_info();
    set_populated();
  }

  // NOTE(review): if no uid was supplied and every lookup above failed,
  // 'user' is still null here and get_id() would dereference it — confirm
  // callers always provide at least one resolvable identifier.
  if (user_id.empty()) {
    user_id = user->get_id();
  }
  op_state.set_initialized();

  // this may have been called by a helper object
  int ret = init_members(op_state);
  if (ret < 0)
    return ret;

  return 0;
}
+
+int RGWUser::init_members(RGWUserAdminOpState& op_state)
+{
+  int ret = 0;
+
+  ret = keys.init(op_state);
+  if (ret < 0)
+    return ret;
+
+  ret = subusers.init(op_state);
+  if (ret < 0)
+    return ret;
+
+  ret = caps.init(op_state);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+int RGWUser::update(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+                   optional_yield y)
+{
+  int ret;
+  std::string subprocess_msg;
+  rgw::sal::User* user = op_state.get_user();
+
+  if (!driver) {
+    set_err_msg(err_msg, "couldn't initialize storage");
+    return -EINVAL;
+  }
+
+  RGWUserInfo *pold_info = (is_populated() ? &old_info : nullptr);
+
+  ret = user->store_user(dpp, y, false, pold_info);
+  op_state.objv = user->get_version_tracker();
+  op_state.set_user_version_tracker(user->get_version_tracker());
+
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to store user info");
+    return ret;
+  }
+
+  old_info = user->get_info();
+  set_populated();
+
+  return 0;
+}
+
// Validate a user-level admin operation: rejects the anonymous user,
// rejects uid mismatches against an already-populated object, validates
// the tenant name and defaults the key type to S3.
int RGWUser::check_op(RGWUserAdminOpState& op_state, std::string *err_msg)
{
  int ret = 0;
  const rgw_user& uid = op_state.get_user_id();

  if (uid.compare(RGW_USER_ANON_ID) == 0) {
    set_err_msg(err_msg, "unable to perform operations on the anonymous user");
    return -EINVAL;
  }

  // a populated RGWUser is bound to one uid; reject mismatched requests
  if (is_populated() && user_id.compare(uid) != 0) {
    set_err_msg(err_msg, "user id mismatch, operation id: " + uid.to_str()
            + " does not match: " + user_id.to_str());

    return -EINVAL;
  }

  ret = rgw_validate_tenant_name(uid.tenant);
  if (ret) {
    set_err_msg(err_msg,
               "invalid tenant only alphanumeric and _ characters are allowed");
    return ret;
  }

  //set key type when it not set or set by context
  // (user-level operations default to S3 keys; key_type_setbycontext
  // marks the value as implied rather than explicitly requested)
  if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) {
    op_state.set_key_type(KEY_TYPE_S3);
    op_state.key_type_setbycontext = true;
  }

  return 0;
}
+
+// update swift_keys with new user id
+static void rename_swift_keys(const rgw_user& user,
+                              std::map<std::string, RGWAccessKey>& keys)
+{
+  std::string user_id;
+  user.to_str(user_id);
+
+  auto modify_keys = std::move(keys);
+  for ([[maybe_unused]] auto& [k, key] : modify_keys) {
+    std::string id = user_id + ":" + key.subuser;
+    key.id = id;
+    keys[id] = std::move(key);
+  }
+}
+
// Rename a user to op_state's new uid (same tenant only): write a stub
// user under the new uid, re-own every bucket (ACL + chown), then rewrite
// the full user record and its index objects under the new id.
int RGWUser::execute_rename(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
{
  int ret;
  bool populated = op_state.is_populated();

  if (!op_state.has_existing_user() && !populated) {
    set_err_msg(err_msg, "user not found");
    return -ENOENT;
  }

  // lazily load the source user if the caller did not
  if (!populated) {
    ret = init(dpp, op_state, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to retrieve user info");
      return ret;
    }
  }

  // cross-tenant renames are not supported
  std::unique_ptr<rgw::sal::User> old_user = driver->get_user(op_state.get_user_info().user_id);
  std::unique_ptr<rgw::sal::User> new_user = driver->get_user(op_state.get_new_uid());
  if (old_user->get_tenant() != new_user->get_tenant()) {
    set_err_msg(err_msg, "users have to be under the same tenant namespace "
                + old_user->get_tenant() + " != " + new_user->get_tenant());
    return -EINVAL;
  }

  // create a stub user and write only the uid index and buckets object
  std::unique_ptr<rgw::sal::User> user;
  user = driver->get_user(new_user->get_id());

  const bool exclusive = !op_state.get_overwrite_new_user(); // overwrite if requested

  ret = user->store_user(dpp, y, exclusive);
  if (ret == -EEXIST) {
    set_err_msg(err_msg, "user name given by --new-uid already exists");
    return ret;
  }
  if (ret < 0) {
    set_err_msg(err_msg, "unable to store new user info");
    return ret;
  }

  // default ACL owned by the new uid, keeping the old display name
  RGWAccessControlPolicy policy_instance;
  policy_instance.create_default(new_user->get_id(), old_user->get_display_name());

  //unlink and link buckets to new user
  string marker;
  CephContext *cct = driver->ctx();
  size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
  rgw::sal::BucketList buckets;

  // page through all of the old user's buckets, re-owning each one
  do {
    ret = old_user->list_buckets(dpp, marker, "", max_buckets, false, buckets, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to list user buckets");
      return ret;
    }

    auto& m = buckets.get_buckets();

    for (auto it = m.begin(); it != m.end(); ++it) {
      auto& bucket = it->second;
      marker = it->first;

      ret = bucket->load_bucket(dpp, y);
      if (ret < 0) {
        set_err_msg(err_msg, "failed to fetch bucket info for bucket=" + bucket->get_name());
        return ret;
      }

      ret = bucket->set_acl(dpp, policy_instance, y);
      if (ret < 0) {
        set_err_msg(err_msg, "failed to set acl on bucket " + bucket->get_name());
        return ret;
      }

      ret = bucket->chown(dpp, new_user.get(), old_user.get(), y);
      if (ret < 0) {
        set_err_msg(err_msg, "failed to run bucket chown" + cpp_strerror(-ret));
        return ret;
      }
    }

  } while (buckets.is_truncated());

  // update the 'stub user' with all of the other fields and rewrite all of the
  // associated index objects
  RGWUserInfo& user_info = op_state.get_user_info();
  user_info.user_id = new_user->get_id();
  op_state.objv = user->get_version_tracker();
  op_state.set_user_version_tracker(user->get_version_tracker());

  // swift key ids embed the uid, so they must be re-indexed too
  rename_swift_keys(new_user->get_id(), user_info.swift_keys);

  return update(dpp, op_state, err_msg, y);
}
+
// Create a new user from op_state: assemble the RGWUserInfo (applying
// configured defaults for unspecified quota/bucket limits), re-initialize
// the helper pools against it, optionally create keys and caps, and
// persist the record.
int RGWUser::execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
                        optional_yield y)
{
  const rgw_user& uid = op_state.get_user_id();
  std::string user_email = op_state.get_user_email();
  std::string display_name = op_state.get_display_name();

  // set the user info
  RGWUserInfo user_info;
  user_id = uid;
  user_info.user_id = user_id;
  user_info.display_name = display_name;
  user_info.type = TYPE_RGW;

  if (!user_email.empty())
    user_info.user_email = user_email;

  // fall back to the configured default when no bucket limit was given
  CephContext *cct = driver->ctx();
  if (op_state.max_buckets_specified) {
    user_info.max_buckets = op_state.get_max_buckets();
  } else {
    user_info.max_buckets =
      cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
  }

  user_info.suspended = op_state.get_suspension_status();
  user_info.admin = op_state.admin;
  user_info.system = op_state.system;

  if (op_state.op_mask_specified)
    user_info.op_mask = op_state.get_op_mask();

  // explicit bucket quota, or the configured default
  if (op_state.has_bucket_quota()) {
    user_info.quota.bucket_quota = op_state.get_bucket_quota();
  } else {
    rgw_apply_default_bucket_quota(user_info.quota.bucket_quota, cct->_conf);
  }

  // copy over any temp-url keys supplied with the request
  if (op_state.temp_url_key_specified) {
    map<int, string>::iterator iter;
    for (iter = op_state.temp_url_keys.begin();
         iter != op_state.temp_url_keys.end(); ++iter) {
      user_info.temp_url_keys[iter->first] = iter->second;
    }
  }

  // explicit user quota, or the configured default
  if (op_state.has_user_quota()) {
    user_info.quota.user_quota = op_state.get_user_quota();
  } else {
    rgw_apply_default_user_quota(user_info.quota.user_quota, cct->_conf);
  }

  if (op_state.default_placement_specified) {
    user_info.default_placement = op_state.default_placement;
  }

  if (op_state.placement_tags_specified) {
    user_info.placement_tags = op_state.placement_tags;
  }

  // update the request
  op_state.set_user_info(user_info);
  op_state.set_populated();

  // update the helper objects
  int ret = init_members(op_state);
  if (ret < 0) {
    set_err_msg(err_msg, "unable to initialize user");
    return ret;
  }

  // see if we need to add an access key; deferred so a single update()
  // below persists everything at once
  std::string subprocess_msg;
  bool defer_user_update = true;
  if (op_state.has_key_op()) {
    ret = keys.add(dpp, op_state, &subprocess_msg, defer_user_update, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to create access key, " + subprocess_msg);
      return ret;
    }
  }

  // see if we need to add some caps
  if (op_state.has_caps_op()) {
    ret = caps.add(dpp, op_state, &subprocess_msg, defer_user_update, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to add user capabilities, " + subprocess_msg);
      return ret;
    }
  }

  ret = update(dpp, op_state, err_msg, y);
  if (ret < 0)
    return ret;

  return 0;
}
+
+// Admin entry point: create a new user.  Parses/normalizes the request
+// via user_add_helper(), validates it with check_op(), then performs
+// the actual create in execute_add().  Returns 0 or a negative errno;
+// on failure *err_msg (if non-null) receives a human-readable reason.
+int RGWUser::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+  std::string subprocess_msg;
+  int ret = user_add_helper(op_state, &subprocess_msg);
+  if (ret != 0) {
+    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+    return ret;
+  }
+
+  ret = check_op(op_state, &subprocess_msg);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+    return ret;
+  }
+
+  ret = execute_add(dpp, op_state, &subprocess_msg, y);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to create user, " + subprocess_msg);
+    return ret;
+  }
+
+  return 0;
+}
+
+// Admin entry point: rename an existing user.  Validates the request
+// with check_op(), then delegates to execute_rename().
+int RGWUser::rename(RGWUserAdminOpState& op_state, optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+  std::string subprocess_msg;
+  int ret;
+
+  ret = check_op(op_state, &subprocess_msg);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+    return ret;
+  }
+
+  ret = execute_rename(dpp, op_state, &subprocess_msg, y);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to rename user, " + subprocess_msg);
+    return ret;
+  }
+
+  return 0;
+}
+
+// Delete a user and, if requested, all of its buckets and data.
+// Requires op_state to reference an existing, loaded user.  If the
+// user still owns buckets and --purge-data was not given, removal is
+// refused with -EEXIST (intended to map to 409 Conflict).
+int RGWUser::execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
+{
+  int ret;
+
+  bool purge_data = op_state.will_purge_data();
+  rgw::sal::User* user = op_state.get_user();
+
+  if (!op_state.has_existing_user()) {
+    set_err_msg(err_msg, "user does not exist");
+    return -ENOENT;
+  }
+
+  // Page through the user's buckets; 'marker' carries the last bucket
+  // name seen so each iteration resumes where the previous one ended.
+  rgw::sal::BucketList buckets;
+  string marker;
+  CephContext *cct = driver->ctx();
+  size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
+  do {
+    ret = user->list_buckets(dpp, marker, string(), max_buckets, false, buckets, y);
+    if (ret < 0) {
+      set_err_msg(err_msg, "unable to read user bucket info");
+      return ret;
+    }
+
+    auto& m = buckets.get_buckets();
+    if (!m.empty() && !purge_data) {
+      set_err_msg(err_msg, "must specify purge data to remove user with buckets");
+      return -EEXIST; // change to code that maps to 409: conflict
+    }
+
+    // purge path: delete each bucket (delete_children=true, keep_index
+    // consistent=false per the remove_bucket arguments).
+    for (auto it = m.begin(); it != m.end(); ++it) {
+      ret = it->second->remove_bucket(dpp, true, false, nullptr, y);
+      if (ret < 0) {
+        set_err_msg(err_msg, "unable to delete user data");
+        return ret;
+      }
+
+      marker = it->first;
+    }
+
+  } while (buckets.is_truncated());
+
+  // buckets are gone (or there were none): remove the user record itself
+  ret = user->remove_user(dpp, y);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to remove user from RADOS");
+    return ret;
+  }
+
+  // invalidate cached state in both the request and this helper object
+  op_state.clear_populated();
+  clear_populated();
+
+  return 0;
+}
+
+// Admin entry point: remove a user.  Validates with check_op(), then
+// delegates to execute_remove().
+int RGWUser::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+  std::string subprocess_msg;
+  int ret;
+
+  ret = check_op(op_state, &subprocess_msg);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+    return ret;
+  }
+
+  ret = execute_remove(dpp, op_state, &subprocess_msg, y);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to remove user, " + subprocess_msg);
+    return ret;
+  }
+
+  return 0;
+}
+
+// Apply the modifications described by op_state to an existing user.
+// Starts from the cached old_info, overlays only the fields the caller
+// explicitly specified, then persists via update().  Special cases:
+// email changes are checked for duplicates, suspension toggles the
+// enabled flag on every bucket the user owns, and keys are created or
+// modified through the keys helper.
+int RGWUser::execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
+{
+  bool populated = op_state.is_populated();
+  int ret = 0;
+  std::string subprocess_msg;
+  std::string op_email = op_state.get_user_email();
+  std::string display_name = op_state.get_display_name();
+
+  RGWUserInfo user_info;
+  std::unique_ptr<rgw::sal::User> duplicate_check;
+
+  // ensure that the user info has been populated or is populate-able
+  if (!op_state.has_existing_user() && !populated) {
+    set_err_msg(err_msg, "user not found");
+    return -ENOENT;
+  }
+
+  // if the user hasn't already been populated...attempt to
+  if (!populated) {
+    ret = init(dpp, op_state, y);
+    if (ret < 0) {
+      set_err_msg(err_msg, "unable to retrieve user info");
+      return ret;
+    }
+  }
+
+  // ensure that we can modify the user's attributes
+  if (user_id.compare(RGW_USER_ANON_ID) == 0) {
+    set_err_msg(err_msg, "unable to modify anonymous user's info");
+    return -EACCES;
+  }
+
+  // begin from the currently stored info and overlay requested changes
+  user_info = old_info;
+
+  std::string old_email = old_info.user_email;
+  if (!op_email.empty()) {
+    // make sure we are not adding a duplicate email
+    if (old_email != op_email) {
+      ret = driver->get_user_by_email(dpp, op_email, y, &duplicate_check);
+      // a hit is only a conflict if the email belongs to a *different* user
+      if (ret >= 0 && duplicate_check->get_id().compare(user_id) != 0) {
+        set_err_msg(err_msg, "cannot add duplicate email");
+        return -ERR_EMAIL_EXIST;
+      }
+    }
+    user_info.user_email = op_email;
+  } else if (op_email.empty() && op_state.user_email_specified) {
+    // caller explicitly passed an empty email: this clears the address
+    ldpp_dout(dpp, 10) << "removing email index: " << user_info.user_email << dendl;
+    /* will be physically removed later when calling update() */
+    user_info.user_email.clear();
+  }
+
+  // update the remaining user info
+  if (!display_name.empty())
+    user_info.display_name = display_name;
+
+  if (op_state.max_buckets_specified)
+    user_info.max_buckets = op_state.get_max_buckets();
+
+  if (op_state.admin_specified)
+    user_info.admin = op_state.admin;
+
+  if (op_state.system_specified)
+    user_info.system = op_state.system;
+
+  // merge (not replace) any supplied Swift temp-url keys
+  if (op_state.temp_url_key_specified) {
+    map<int, string>::iterator iter;
+    for (iter = op_state.temp_url_keys.begin();
+         iter != op_state.temp_url_keys.end(); ++iter) {
+      user_info.temp_url_keys[iter->first] = iter->second;
+    }
+  }
+
+  if (op_state.op_mask_specified)
+    user_info.op_mask = op_state.get_op_mask();
+
+  if (op_state.has_bucket_quota())
+    user_info.quota.bucket_quota = op_state.get_bucket_quota();
+
+  if (op_state.has_user_quota())
+    user_info.quota.user_quota = op_state.get_user_quota();
+
+  if (op_state.has_suspension_op()) {
+    __u8 suspended = op_state.get_suspension_status();
+    user_info.suspended = suspended;
+
+    rgw::sal::BucketList buckets;
+
+    if (user_id.empty()) {
+      set_err_msg(err_msg, "empty user id passed...aborting");
+      return -EINVAL;
+    }
+
+    // propagate the (un)suspension to every bucket the user owns,
+    // paging through the bucket list in rgw_list_buckets_max_chunk
+    // sized batches.
+    string marker;
+    CephContext *cct = driver->ctx();
+    size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
+    std::unique_ptr<rgw::sal::User> user = driver->get_user(user_id);
+    do {
+      ret = user->list_buckets(dpp, marker, string(), max_buckets, false, buckets, y);
+      if (ret < 0) {
+        set_err_msg(err_msg, "could not get buckets for uid:  " + user_id.to_str());
+        return ret;
+      }
+
+      auto& m = buckets.get_buckets();
+
+      vector<rgw_bucket> bucket_names;
+      for (auto iter = m.begin(); iter != m.end(); ++iter) {
+       auto& bucket = iter->second;
+        bucket_names.push_back(bucket->get_key());
+
+        marker = iter->first;
+      }
+
+      // enabled is the inverse of suspended
+      ret = driver->set_buckets_enabled(dpp, bucket_names, !suspended);
+      if (ret < 0) {
+        set_err_msg(err_msg, "failed to modify bucket");
+        return ret;
+      }
+
+    } while (buckets.is_truncated());
+  }
+
+  if (op_state.mfa_ids_specified) {
+    user_info.mfa_ids = op_state.mfa_ids;
+  }
+
+  if (op_state.default_placement_specified) {
+    user_info.default_placement = op_state.default_placement;
+  }
+
+  if (op_state.placement_tags_specified) {
+    user_info.placement_tags = op_state.placement_tags;
+  }
+
+  op_state.set_user_info(user_info);
+
+  // if we're supposed to modify keys, do so
+  if (op_state.has_key_op()) {
+    // defer_user_update=true: the single update() below persists everything
+    ret = keys.add(dpp, op_state, &subprocess_msg, true, y);
+    if (ret < 0) {
+      set_err_msg(err_msg, "unable to create or modify keys, " + subprocess_msg);
+      return ret;
+    }
+  }
+
+  ret = update(dpp, op_state, err_msg, y);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+// Admin entry point: modify a user.  Validates with check_op(), then
+// delegates to execute_modify().
+int RGWUser::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+  std::string subprocess_msg;
+  int ret;
+
+  ret = check_op(op_state, &subprocess_msg);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+    return ret;
+  }
+
+  ret = execute_modify(dpp, op_state, &subprocess_msg, y);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to modify user, " + subprocess_msg);
+    return ret;
+  }
+
+  return 0;
+}
+
+// Fetch the user identified by op_state from the backend (via init())
+// and copy the resulting RGWUserInfo into fetched_info.
+int RGWUser::info(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info,
+                 optional_yield y, std::string *err_msg)
+{
+  int ret = init(dpp, op_state, y);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to fetch user info");
+    return ret;
+  }
+
+  fetched_info = op_state.get_user_info();
+
+  return 0;
+}
+
+// Return the already-loaded (cached) user info without touching the
+// backend.  Fails with -EINVAL if this RGWUser was never populated.
+int RGWUser::info(RGWUserInfo& fetched_info, std::string *err_msg)
+{
+  if (!is_populated()) {
+    set_err_msg(err_msg, "no user info saved");
+    return -EINVAL;
+  }
+
+  fetched_info = old_info;
+
+  return 0;
+}
+
+// Enumerate user metadata keys and stream them to the formatter as
+// { "keys": [...], "truncated": ..., "count": ..., ["marker": ...] }.
+// max_entries is clamped to 1000 per request.
+// NOTE(review): the early 'return ret' paths below exit without calling
+// meta_list_keys_complete(handle), leaking the listing handle — worth
+// fixing upstream.
+int RGWUser::list(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher)
+{
+  Formatter *formatter = flusher.get_formatter();
+  void *handle = nullptr;
+  std::string metadata_key = "user";
+  if (op_state.max_entries > 1000) {
+    op_state.max_entries = 1000;
+  }
+
+  // start listing from op_state.marker (supports continuation)
+  int ret = driver->meta_list_keys_init(dpp, metadata_key, op_state.marker, &handle);
+  if (ret < 0) {
+    return ret;
+  }
+
+  bool truncated = false;
+  uint64_t count = 0;
+  uint64_t left = 0;
+  flusher.start(0);
+
+  // open the result object section
+  formatter->open_object_section("result");
+
+  // open the user id list array section
+  formatter->open_array_section("keys");
+  do {
+    std::list<std::string> keys;
+    left = op_state.max_entries - count;
+    ret = driver->meta_list_keys_next(dpp, handle, left, keys, &truncated);
+    if (ret < 0 && ret != -ENOENT) {
+      return ret;
+    // NOTE(review): '} if' works only because the branch above returns;
+    // an explicit 'else if' would read less like a missing keyword.
+    } if (ret != -ENOENT) {
+      for (std::list<std::string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+      formatter->dump_string("key", *iter);
+        ++count;
+      }
+    }
+  } while (truncated && left > 0);
+  // close user id list section
+  formatter->close_section();
+
+  formatter->dump_bool("truncated", truncated);
+  formatter->dump_int("count", count);
+  if (truncated) {
+    // hand the client a marker so it can resume the listing
+    formatter->dump_string("marker", driver->meta_get_marker(handle));
+  }
+
+  // close result object section
+  formatter->close_section();
+
+  driver->meta_list_keys_complete(handle);
+
+  flusher.flush();
+  return 0;
+}
+
+// Static admin-op wrapper: list user metadata keys.  Uses init_storage()
+// (not init()) because no particular user needs to be loaded first.
+int RGWUserAdminOp_User::list(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+                  RGWFormatterFlusher& flusher)
+{
+  RGWUser user;
+
+  int ret = user.init_storage(driver);
+  if (ret < 0)
+    return ret;
+
+  ret = user.list(dpp, op_state, flusher);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+// Static admin-op wrapper: fetch a user's info and dump it through the
+// formatter.  Optionally syncs (sync_stats) and/or fetches (fetch_stats)
+// the user's storage statistics first.
+int RGWUserAdminOp_User::info(const DoutPrefixProvider *dpp,
+                             rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+                             RGWFormatterFlusher& flusher,
+                             optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  std::unique_ptr<rgw::sal::User> ruser;
+
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+
+  if (!op_state.has_existing_user())
+    return -ERR_NO_SUCH_USER;
+
+  Formatter *formatter = flusher.get_formatter();
+
+  // init() above populated the cached info; read it back
+  ret = user.info(info, NULL);
+  if (ret < 0)
+    return ret;
+
+  ruser = driver->get_user(info.user_id);
+
+  if (op_state.sync_stats) {
+    ret = rgw_user_sync_all_stats(dpp, driver, ruser.get(), y);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
+  RGWStorageStats stats;
+  RGWStorageStats *arg_stats = NULL;
+  if (op_state.fetch_stats) {
+    int ret = ruser->read_stats(dpp, y, &stats);
+    // -ENOENT just means no stats recorded yet; not an error here
+    if (ret < 0 && ret != -ENOENT) {
+      return ret;
+    }
+
+    arg_stats = &stats;
+  }
+
+  if (formatter) {
+    flusher.start(0);
+
+    dump_user_info(formatter, info, arg_stats);
+    flusher.flush();
+  }
+
+  return 0;
+}
+
+// Static admin-op wrapper: create a user and dump the resulting info.
+// Maps -EEXIST to the RGW-specific -ERR_USER_EXIST.
+int RGWUserAdminOp_User::create(const DoutPrefixProvider *dpp,
+                               rgw::sal::Driver* driver,
+                               RGWUserAdminOpState& op_state,
+                               RGWFormatterFlusher& flusher, optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+
+  Formatter *formatter = flusher.get_formatter();
+
+  ret = user.add(dpp, op_state, y, NULL);
+  if (ret < 0) {
+    if (ret == -EEXIST)
+      ret = -ERR_USER_EXIST;
+    return ret;
+  }
+
+  ret = user.info(info, NULL);
+  if (ret < 0)
+    return ret;
+
+  if (formatter) {
+    flusher.start(0);
+
+    dump_user_info(formatter, info);
+    flusher.flush();
+  }
+
+  return 0;
+}
+
+// Static admin-op wrapper: modify a user and dump the updated info.
+// Maps -ENOENT to the RGW-specific -ERR_NO_SUCH_USER.
+int RGWUserAdminOp_User::modify(const DoutPrefixProvider *dpp,
+                               rgw::sal::Driver* driver,
+                               RGWUserAdminOpState& op_state,
+                               RGWFormatterFlusher& flusher, optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+  Formatter *formatter = flusher.get_formatter();
+
+  ret = user.modify(dpp, op_state, y, NULL);
+  if (ret < 0) {
+    if (ret == -ENOENT)
+      ret = -ERR_NO_SUCH_USER;
+    return ret;
+  }
+
+  ret = user.info(info, NULL);
+  if (ret < 0)
+    return ret;
+
+  if (formatter) {
+    flusher.start(0);
+
+    dump_user_info(formatter, info);
+    flusher.flush();
+  }
+
+  return 0;
+}
+
+// Static admin-op wrapper: remove a user.  Produces no formatter output;
+// maps -ENOENT to -ERR_NO_SUCH_USER.
+int RGWUserAdminOp_User::remove(const DoutPrefixProvider *dpp,
+                               rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+                               RGWFormatterFlusher& flusher, optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+
+
+  ret = user.remove(dpp, op_state, y, NULL);
+
+  if (ret == -ENOENT)
+    ret = -ERR_NO_SUCH_USER;
+  return ret;
+}
+
+// Static admin-op wrapper: add a subuser to an existing user and dump
+// the user's subuser list.
+int RGWUserAdminOp_Subuser::create(const DoutPrefixProvider *dpp,
+                                  rgw::sal::Driver* driver,
+                                  RGWUserAdminOpState& op_state,
+                                  RGWFormatterFlusher& flusher,
+                                  optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+
+  if (!op_state.has_existing_user())
+    return -ERR_NO_SUCH_USER;
+
+  Formatter *formatter = flusher.get_formatter();
+
+  ret = user.subusers.add(dpp, op_state, y, NULL);
+  if (ret < 0)
+    return ret;
+
+  ret = user.info(info, NULL);
+  if (ret < 0)
+    return ret;
+
+  if (formatter) {
+    flusher.start(0);
+
+    dump_subusers_info(formatter, info);
+    flusher.flush();
+  }
+
+  return 0;
+}
+
+// Static admin-op wrapper: modify a subuser of an existing user and
+// dump the user's subuser list.
+int RGWUserAdminOp_Subuser::modify(const DoutPrefixProvider *dpp,
+                                  rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+                                  RGWFormatterFlusher& flusher, optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+
+  if (!op_state.has_existing_user())
+    return -ERR_NO_SUCH_USER;
+
+  Formatter *formatter = flusher.get_formatter();
+
+  ret = user.subusers.modify(dpp, op_state, y, NULL);
+  if (ret < 0)
+    return ret;
+
+  ret = user.info(info, NULL);
+  if (ret < 0)
+    return ret;
+  if (formatter) {
+    flusher.start(0);
+
+    dump_subusers_info(formatter, info);
+    flusher.flush();
+  }
+
+  return 0;
+}
+
+// Static admin-op wrapper: remove a subuser.  No formatter output.
+int RGWUserAdminOp_Subuser::remove(const DoutPrefixProvider *dpp,
+                                  rgw::sal::Driver* driver,
+                                  RGWUserAdminOpState& op_state,
+                                  RGWFormatterFlusher& flusher,
+                                  optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+
+
+  if (!op_state.has_existing_user())
+    return -ERR_NO_SUCH_USER;
+
+  ret = user.subusers.remove(dpp, op_state, y, NULL);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+// Static admin-op wrapper: add an access key to an existing user and
+// dump either the Swift or S3 key set depending on the requested type.
+int RGWUserAdminOp_Key::create(const DoutPrefixProvider *dpp,
+                              rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+                              RGWFormatterFlusher& flusher,
+                              optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+
+  if (!op_state.has_existing_user())
+    return -ERR_NO_SUCH_USER;
+
+  Formatter *formatter = flusher.get_formatter();
+
+  ret = user.keys.add(dpp, op_state, y, NULL);
+  if (ret < 0)
+    return ret;
+
+  ret = user.info(info, NULL);
+  if (ret < 0)
+    return ret;
+
+  if (formatter) {
+    flusher.start(0);
+
+    int key_type = op_state.get_key_type();
+
+    if (key_type == KEY_TYPE_SWIFT)
+      dump_swift_keys_info(formatter, info);
+
+    else if (key_type == KEY_TYPE_S3)
+      dump_access_keys_info(formatter, info);
+
+    flusher.flush();
+  }
+
+  return 0;
+}
+
+// Static admin-op wrapper: remove an access key.  No formatter output.
+int RGWUserAdminOp_Key::remove(const DoutPrefixProvider *dpp,
+                              rgw::sal::Driver* driver,
+                              RGWUserAdminOpState& op_state,
+                              RGWFormatterFlusher& flusher,
+                              optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+
+  if (!op_state.has_existing_user())
+    return -ERR_NO_SUCH_USER;
+
+
+  ret = user.keys.remove(dpp, op_state, y, NULL);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+// Static admin-op wrapper: add admin capabilities to an existing user
+// and dump the resulting caps set.
+int RGWUserAdminOp_Caps::add(const DoutPrefixProvider *dpp,
+                            rgw::sal::Driver* driver,
+                            RGWUserAdminOpState& op_state,
+                            RGWFormatterFlusher& flusher, optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+
+  if (!op_state.has_existing_user())
+    return -ERR_NO_SUCH_USER;
+
+  Formatter *formatter = flusher.get_formatter();
+
+  ret = user.caps.add(dpp, op_state, y, NULL);
+  if (ret < 0)
+    return ret;
+
+  ret = user.info(info, NULL);
+  if (ret < 0)
+    return ret;
+
+  if (formatter) {
+    flusher.start(0);
+
+    info.caps.dump(formatter);
+    flusher.flush();
+  }
+
+  return 0;
+}
+
+
+// Static admin-op wrapper: remove admin capabilities from an existing
+// user and dump the remaining caps set.
+int RGWUserAdminOp_Caps::remove(const DoutPrefixProvider *dpp,
+                               rgw::sal::Driver* driver,
+                               RGWUserAdminOpState& op_state,
+                               RGWFormatterFlusher& flusher, optional_yield y)
+{
+  RGWUserInfo info;
+  RGWUser user;
+  int ret = user.init(dpp, driver, op_state, y);
+  if (ret < 0)
+    return ret;
+
+  if (!op_state.has_existing_user())
+    return -ERR_NO_SUCH_USER;
+
+  Formatter *formatter = flusher.get_formatter();
+
+  ret = user.caps.remove(dpp, op_state, y, NULL);
+  if (ret < 0)
+    return ret;
+
+  ret = user.info(info, NULL);
+  if (ret < 0)
+    return ret;
+
+  if (formatter) {
+    flusher.start(0);
+
+    info.caps.dump(formatter);
+    flusher.flush();
+  }
+
+  return 0;
+}
+
+// Metadata handler for the "user" metadata section.  Bridges the
+// generic metadata backend (get/put/remove of serialized objects) to
+// the user service (RGWSI_User), which maintains the actual user
+// records and their secondary indexes.
+class RGWUserMetadataHandler : public RGWMetadataHandler_GenericMetaBE {
+public:
+  struct Svc {
+    RGWSI_User *user{nullptr};
+  } svc;
+
+  RGWUserMetadataHandler(RGWSI_User *user_svc) {
+    base_init(user_svc->ctx(), user_svc->get_be_handler());
+    svc.user = user_svc;
+  }
+
+  ~RGWUserMetadataHandler() {}
+
+  string get_type() override { return "user"; }
+
+  // Read a user's full info + attrs and wrap it in a metadata object
+  // carrying the object version and mtime.
+  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
+    RGWUserCompleteInfo uci;
+    RGWObjVersionTracker objv_tracker;
+    real_time mtime;
+
+    rgw_user user = RGWSI_User::user_from_meta_key(entry);
+
+    int ret = svc.user->read_user_info(op->ctx(), user, &uci.info, &objv_tracker,
+                                       &mtime, nullptr, &uci.attrs,
+                                       y, dpp);
+    if (ret < 0) {
+      return ret;
+    }
+
+    RGWUserMetadataObject *mdo = new RGWUserMetadataObject(uci, objv_tracker.read_version, mtime);
+    *obj = mdo;
+
+    return 0;
+  }
+
+  // Decode a metadata object from its JSON form (e.g. metadata import);
+  // returns nullptr on decode failure.
+  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
+    RGWUserCompleteInfo uci;
+
+    try {
+      decode_json_obj(uci, jo);
+    } catch (JSONDecoder::err& e) {
+      return nullptr;
+    }
+
+    return new RGWUserMetadataObject(uci, objv, mtime);
+  }
+
+  // Implemented below via RGWMetadataHandlerPut_User.
+  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+             RGWMetadataObject *obj,
+             RGWObjVersionTracker& objv_tracker,
+             optional_yield y, const DoutPrefixProvider *dpp,
+             RGWMDLogSyncType type, bool from_remote_zone) override;
+
+  // Remove a user record; reads the current info first since
+  // remove_user_info needs it to clean up the associated indexes.
+  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+                optional_yield y, const DoutPrefixProvider *dpp) override {
+    RGWUserInfo info;
+
+    rgw_user user = RGWSI_User::user_from_meta_key(entry);
+
+    int ret = svc.user->read_user_info(op->ctx(), user, &info, nullptr,
+                                       nullptr, nullptr, nullptr,
+                                       y, dpp);
+    if (ret < 0) {
+      return ret;
+    }
+
+    return svc.user->remove_user_info(op->ctx(), info, &objv_tracker,
+                                      y, dpp);
+  }
+};
+
+// Put operation for user metadata: specializes the generic SObj put
+// with the user-specific store step (put_checked, defined below).
+class RGWMetadataHandlerPut_User : public RGWMetadataHandlerPut_SObj
+{
+  RGWUserMetadataHandler *uhandler;
+  RGWUserMetadataObject *uobj;
+public:
+  RGWMetadataHandlerPut_User(RGWUserMetadataHandler *_handler,
+                             RGWSI_MetaBackend_Handler::Op *op, string& entry,
+                             RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
+                             optional_yield y,
+                             RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, op, entry, obj, objv_tracker, y, type, from_remote_zone),
+                                                                uhandler(_handler) {
+    uobj = static_cast<RGWUserMetadataObject *>(obj);
+  }
+
+  int put_checked(const DoutPrefixProvider *dpp) override;
+};
+
+// Store a user metadata object by running the generic put state machine
+// with the user-specific put operation defined above.
+int RGWUserMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+                                   RGWMetadataObject *obj,
+                                   RGWObjVersionTracker& objv_tracker,
+                                   optional_yield y, const DoutPrefixProvider *dpp,
+                                   RGWMDLogSyncType type, bool from_remote_zone)
+{
+  RGWMetadataHandlerPut_User put_op(this, op, entry, obj, objv_tracker, y, type, from_remote_zone);
+  return do_put_operate(&put_op, dpp);
+}
+
+// Persist the (already version-checked) user info.  Passes the previous
+// info, when available, so store_user_info can update secondary indexes
+// (email, access keys, swift) relative to the old state.
+int RGWMetadataHandlerPut_User::put_checked(const DoutPrefixProvider *dpp)
+{
+  RGWUserMetadataObject *orig_obj = static_cast<RGWUserMetadataObject *>(old_obj);
+  RGWUserCompleteInfo& uci = uobj->get_uci();
+
+  // only pass attrs along if the incoming object actually carried them
+  map<string, bufferlist> *pattrs{nullptr};
+  if (uci.has_attrs) {
+    pattrs = &uci.attrs;
+  }
+
+  RGWUserInfo *pold_info = (orig_obj ? &orig_obj->get_uci().info : nullptr);
+
+  auto mtime = obj->get_mtime();
+
+  int ret = uhandler->svc.user->store_user_info(op->ctx(), uci.info, pold_info,
+                                               &objv_tracker, mtime,
+                                               false, pattrs, y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return STATUS_APPLIED;
+}
+
+
+// Wire the user controller to its zone/user services and cache the
+// metadata backend handler used by the call() helpers below.
+RGWUserCtl::RGWUserCtl(RGWSI_Zone *zone_svc,
+                       RGWSI_User *user_svc,
+                       RGWUserMetadataHandler *_umhandler) : umhandler(_umhandler) {
+  svc.zone = zone_svc;
+  svc.user = user_svc;
+  be_handler = umhandler->get_be_handler();
+}
+
+// Small helper: presents an std::optional<T> as an always-valid value,
+// substituting a default-constructed T when the optional is empty.  The
+// default lives in this wrapper (def), so the exposed pointer stays
+// valid for the wrapper's lifetime; the referenced optional must also
+// outlive the wrapper.
+template <class T>
+class optional_default
+{
+  const std::optional<T>& opt;
+  std::optional<T> def;
+  const T *p;
+public:
+  optional_default(const std::optional<T>& _o) : opt(_o) {
+    if (opt) {
+      p = &(*opt);
+    } else {
+      def = T();
+      p = &(*def);
+    }
+  }
+
+  const T *operator->() {
+    return p;
+  }
+
+  const T& operator*() {
+    return *p;
+  }
+};
+
+// Read a user's info by uid through the metadata backend handler.
+// Optional outputs (objv_tracker, mtime, cache_info, attrs) come from
+// the GetParams bundle and may be null.
+int RGWUserCtl::get_info_by_uid(const DoutPrefixProvider *dpp, 
+                                const rgw_user& uid,
+                                RGWUserInfo *info,
+                                optional_yield y,
+                                const GetParams& params)
+
+{
+  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+    return svc.user->read_user_info(op->ctx(),
+                                    uid,
+                                    info,
+                                    params.objv_tracker,
+                                    params.mtime,
+                                    params.cache_info,
+                                    params.attrs,
+                                    y,
+                                    dpp);
+  });
+}
+
+// Look up a user's info via the email secondary index.
+int RGWUserCtl::get_info_by_email(const DoutPrefixProvider *dpp, 
+                                  const string& email,
+                                  RGWUserInfo *info,
+                                  optional_yield y,
+                                  const GetParams& params)
+{
+  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+    return svc.user->get_user_info_by_email(op->ctx(), email,
+                                            info,
+                                            params.objv_tracker,
+                                            params.mtime,
+                                            y,
+                                            dpp);
+  });
+}
+
+// Look up a user's info via the Swift-name secondary index.
+int RGWUserCtl::get_info_by_swift(const DoutPrefixProvider *dpp, 
+                                  const string& swift_name,
+                                  RGWUserInfo *info,
+                                  optional_yield y,
+                                  const GetParams& params)
+{
+  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+    return svc.user->get_user_info_by_swift(op->ctx(), swift_name,
+                                            info,
+                                            params.objv_tracker,
+                                            params.mtime,
+                                            y,
+                                            dpp);
+  });
+}
+
+// Look up a user's info via the access-key secondary index.
+int RGWUserCtl::get_info_by_access_key(const DoutPrefixProvider *dpp, 
+                                       const string& access_key,
+                                       RGWUserInfo *info,
+                                       optional_yield y,
+                                       const GetParams& params)
+{
+  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+    return svc.user->get_user_info_by_access_key(op->ctx(), access_key,
+                                                 info,
+                                                 params.objv_tracker,
+                                                 params.mtime,
+                                                 y,
+                                                 dpp);
+  });
+}
+
+// Fetch only a user's xattrs (and optional version) by reading the full
+// record and discarding the RGWUserInfo body.
+int RGWUserCtl::get_attrs_by_uid(const DoutPrefixProvider *dpp, 
+                                 const rgw_user& user_id,
+                                 map<string, bufferlist> *pattrs,
+                                 optional_yield y,
+                                 RGWObjVersionTracker *objv_tracker)
+{
+  RGWUserInfo user_info;
+
+  return get_info_by_uid(dpp, user_id, &user_info, y, RGWUserCtl::GetParams()
+                         .set_attrs(pattrs)
+                         .set_objv_tracker(objv_tracker));
+}
+
+// Persist a user record (and its secondary indexes) via the user
+// service, under the metadata backend handler.
+int RGWUserCtl::store_info(const DoutPrefixProvider *dpp, 
+                           const RGWUserInfo& info, optional_yield y,
+                           const PutParams& params)
+{
+  string key = RGWSI_User::get_meta_key(info.user_id);
+
+  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+    return svc.user->store_user_info(op->ctx(), info,
+                                     params.old_info,
+                                     params.objv_tracker,
+                                     params.mtime,
+                                     params.exclusive,
+                                     params.attrs,
+                                     y,
+                                     dpp);
+  });
+}
+
+// Remove a user record via the user service, under the metadata
+// backend handler.
+int RGWUserCtl::remove_info(const DoutPrefixProvider *dpp, 
+                            const RGWUserInfo& info, optional_yield y,
+                            const RemoveParams& params)
+
+{
+  string key = RGWSI_User::get_meta_key(info.user_id);
+
+  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+    return svc.user->remove_user_info(op->ctx(), info,
+                                      params.objv_tracker,
+                                      y, dpp);
+  });
+}
+
+// List a user's buckets between marker and end_marker (max==0 falls
+// back to default_max).  When need_stats is set, also fills per-bucket
+// statistics; a stats read of -ENOENT is tolerated (bucket with no
+// stats yet).
+int RGWUserCtl::list_buckets(const DoutPrefixProvider *dpp, 
+                             const rgw_user& user,
+                             const string& marker,
+                             const string& end_marker,
+                             uint64_t max,
+                             bool need_stats,
+                             RGWUserBuckets *buckets,
+                             bool *is_truncated,
+                            optional_yield y,
+                             uint64_t default_max)
+{
+  if (!max) {
+    max = default_max;
+  }
+
+  int ret = svc.user->list_buckets(dpp, user, marker, end_marker,
+                                   max, buckets, is_truncated, y);
+  if (ret < 0) {
+    return ret;
+  }
+  if (need_stats) {
+    map<string, RGWBucketEnt>& m = buckets->get_buckets();
+    ret = ctl.bucket->read_buckets_stats(m, y, dpp);
+    if (ret < 0 && ret != -ENOENT) {
+      ldpp_dout(dpp, 0) << "ERROR: could not get stats for buckets" << dendl;
+      return ret;
+    }
+  }
+  return 0;
+}
+
+// Read a user's aggregate storage stats (and optionally the timestamps
+// of the last stats sync/update) via the user service.
+int RGWUserCtl::read_stats(const DoutPrefixProvider *dpp, 
+                           const rgw_user& user, RGWStorageStats *stats,
+                          optional_yield y,
+                          ceph::real_time *last_stats_sync,
+                          ceph::real_time *last_stats_update)
+{
+  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+    return svc.user->read_stats(dpp, op->ctx(), user, stats,
+                               last_stats_sync, last_stats_update, y);
+  });
+}
+
+// Factory for the "user" metadata handler; caller owns the result.
+RGWMetadataHandler *RGWUserMetaHandlerAllocator::alloc(RGWSI_User *user_svc) {
+  return new RGWUserMetadataHandler(user_svc);
+}
+
+// Dump this user id as a JSON field named "user".
+void rgw_user::dump(Formatter *f) const
+{
+  ::encode_json("user", *this, f);
+}
+
diff --git a/src/rgw/driver/rados/rgw_user.h b/src/rgw/driver/rados/rgw_user.h
new file mode 100644 (file)
index 0000000..110124c
--- /dev/null
@@ -0,0 +1,887 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_USER_H
+#define CEPH_RGW_USER_H
+
+#include <string>
+#include <boost/algorithm/string.hpp>
+#include "include/ceph_assert.h"
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_tools.h"
+
+#include "rgw_string.h"
+
+#include "common/Formatter.h"
+#include "rgw_formats.h"
+#include "rgw_metadata.h"
+#include "rgw_sal_fwd.h"
+
+#define RGW_USER_ANON_ID "anonymous"
+
+#define SECRET_KEY_LEN 40
+#define PUBLIC_ID_LEN 20
+#define RAND_SUBUSER_LEN 5
+
+#define XMLNS_AWS_S3 "http://s3.amazonaws.com/doc/2006-03-01/"
+
+class RGWUserCtl;
+class RGWBucketCtl;
+class RGWUserBuckets;
+
+class RGWGetUserStats_CB;
+
+/**
+ * A string wrapper that includes encode/decode functions
+ * for easily accessing a UID in all forms
+ */
+struct RGWUID
+{
+  rgw_user user_id;
+  // Encode as the uid's string form (rgw_user::to_str()).
+  void encode(bufferlist& bl) const {
+    std::string s;
+    user_id.to_str(s);
+    using ceph::encode;
+    encode(s, bl);
+  }
+  // Decode from the string form produced by encode().
+  void decode(bufferlist::const_iterator& bl) {
+    std::string s;
+    using ceph::decode;
+    decode(s, bl);
+    user_id.from_str(s);
+  }
+};
+WRITE_CLASS_ENCODER(RGWUID)
+/** Entry for bucket metadata collection */
+struct bucket_meta_entry {
+  size_t size;           // bytes used
+  size_t size_rounded;   // rounded size (presumably allocation-unit granularity — TODO confirm)
+  ceph::real_time creation_time;
+  uint64_t count;        // object count
+};
+
+extern int rgw_user_sync_all_stats(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::User* user, optional_yield y);
+extern int rgw_user_get_all_buckets_stats(const DoutPrefixProvider *dpp,
+  rgw::sal::Driver* driver, rgw::sal::User* user,
+  std::map<std::string, bucket_meta_entry>& buckets_usage_map, optional_yield y);
+
+/**
+ * Get the anonymous (ie, unauthenticated) user info.
+ */
+extern void rgw_get_anon_user(RGWUserInfo& info);
+
+extern void rgw_perm_to_str(uint32_t mask, char *buf, int len);
+extern uint32_t rgw_str_to_perm(const char *str);
+
+extern int rgw_validate_tenant_name(const std::string& t);
+
+// Which flavor of access key an operation refers to.
+enum ObjectKeyType {
+  KEY_TYPE_SWIFT,
+  KEY_TYPE_S3,
+  KEY_TYPE_UNDEFINED
+};
+
+// Operations the key pool can perform on a key.
+enum RGWKeyPoolOp {
+  GENERATE_KEY,
+  MODIFY_KEY
+};
+
+// Identifier kinds by which a user may be looked up.
+enum RGWUserId {
+  RGW_USER_ID,
+  RGW_SWIFT_USERNAME,
+  RGW_USER_EMAIL,
+  RGW_ACCESS_KEY,
+};
+
+/*
+ * An RGWUser class along with supporting classes created
+ * to support the creation of a RESTful administrative API
+ */
+
+/*
+ * Accumulated parameters/state of one administrative user operation
+ * (create/modify/remove of users, keys, subusers, caps).  Setters record
+ * both the value and a matching "*_specified" flag so the executing code
+ * can tell which fields the caller actually supplied.
+ */
+struct RGWUserAdminOpState {
+  // user attributes
+  std::unique_ptr<rgw::sal::User> user;
+  std::string user_email;
+  std::string display_name;
+  rgw_user new_user_id;
+  bool overwrite_new_user = false;
+  int32_t max_buckets{RGW_DEFAULT_MAX_BUCKETS};
+  __u8 suspended{0};
+  __u8 admin{0};
+  __u8 system{0};
+  __u8 exclusive{0};
+  __u8 fetch_stats{0};
+  __u8 sync_stats{0};
+  std::string caps;
+  RGWObjVersionTracker objv;
+  uint32_t op_mask{0};
+  std::map<int, std::string> temp_url_keys;
+
+  // subuser attributes
+  std::string subuser;
+  uint32_t perm_mask{RGW_PERM_NONE};
+
+  // key_attributes
+  std::string id; // access key
+  std::string key; // secret key
+  int32_t key_type{-1};
+  bool access_key_exist = false;
+
+  std::set<std::string> mfa_ids;
+
+  // operation attributes
+  bool existing_user{false};
+  bool existing_key{false};
+  bool existing_subuser{false};
+  bool existing_email{false};
+  bool subuser_specified{false};
+  bool gen_secret{false};
+  bool gen_access{false};
+  bool gen_subuser{false};
+  bool id_specified{false};
+  bool key_specified{false};
+  bool type_specified{false};
+  bool key_type_setbycontext{false};   // key type set by user or subuser context
+  bool purge_data{false};
+  bool purge_keys{false};
+  bool display_name_specified{false};
+  bool user_email_specified{false};
+  bool max_buckets_specified{false};
+  bool perm_specified{false};
+  bool op_mask_specified{false};
+  bool caps_specified{false};
+  bool suspension_op{false};
+  bool admin_specified{false};
+  bool system_specified{false};
+  bool key_op{false};
+  bool temp_url_key_specified{false};
+  bool found_by_uid{false};
+  bool found_by_email{false};
+  bool found_by_key{false};
+  bool mfa_ids_specified{false};
+  // req parameters
+  bool populated{false};
+  bool initialized{false};
+  bool key_params_checked{false};
+  bool subuser_params_checked{false};
+  bool user_params_checked{false};
+
+  bool bucket_quota_specified{false};
+  bool user_quota_specified{false};
+  bool bucket_ratelimit_specified{false};
+  bool user_ratelimit_specified{false};
+
+  RGWQuota quota;
+  RGWRateLimitInfo user_ratelimit;
+  RGWRateLimitInfo bucket_ratelimit;
+
+  // req parameters for listing user
+  std::string marker{""};
+  uint32_t max_entries{1000};
+  rgw_placement_rule default_placement; // user default placement
+  bool default_placement_specified{false};
+
+  std::list<std::string> placement_tags;  // user default placement_tags
+  bool placement_tags_specified{false};
+
+  // Record an explicit access key; an empty key is silently ignored.
+  void set_access_key(const std::string& access_key) {
+    if (access_key.empty())
+      return;
+
+    id = access_key;
+    id_specified = true;
+    gen_access = false;
+    key_op = true;
+  }
+
+  // Record an explicit secret key; an empty key is silently ignored.
+  void set_secret_key(const std::string& secret_key) {
+    if (secret_key.empty())
+      return;
+
+    key = secret_key;
+    key_specified = true;
+    gen_secret = false;
+    key_op = true;
+  }
+
+  void set_user_id(const rgw_user& id);
+
+  void set_new_user_id(const rgw_user& id) {
+    if (id.empty())
+      return;
+
+    new_user_id = id;
+  }
+  void set_overwrite_new_user(bool b) {
+    overwrite_new_user = b;
+  }
+
+  void set_user_email(std::string& email) {
+   /* always lowercase email address; note this mutates the caller's string */
+    boost::algorithm::to_lower(email);
+    user_email = email;
+    user_email_specified = true;
+  }
+
+  void set_display_name(const std::string& name) {
+    if (name.empty())
+      return;
+
+    display_name = name;
+    display_name_specified = true;
+  }
+
+  void set_subuser(std::string& _subuser);
+
+  void set_caps(const std::string& _caps) {
+    if (_caps.empty())
+      return;
+
+    caps = _caps;
+    caps_specified = true;
+  }
+
+  void set_perm(uint32_t perm) {
+    perm_mask = perm;
+    perm_specified = true;
+  }
+
+  void set_op_mask(uint32_t mask) {
+    op_mask = mask;
+    op_mask_specified = true;
+  }
+
+  void set_temp_url_key(const std::string& key, int index) {
+    temp_url_keys[index] = key;
+    temp_url_key_specified = true;
+  }
+
+  void set_key_type(int32_t type) {
+    key_type = type;
+    type_specified = true;
+  }
+
+  void set_access_key_exist() {
+    access_key_exist = true;
+  }
+
+  void set_suspension(__u8 is_suspended) {
+    suspended = is_suspended;
+    suspension_op = true;
+  }
+
+  void set_admin(__u8 is_admin) {
+    admin = is_admin;
+    admin_specified = true;
+  }
+
+  void set_system(__u8 is_system) {
+    system = is_system;
+    system_specified = true;
+  }
+
+  void set_exclusive(__u8 is_exclusive) {
+    exclusive = is_exclusive;
+  }
+
+  void set_fetch_stats(__u8 is_fetch_stats) {
+    fetch_stats = is_fetch_stats;
+  }
+
+  void set_sync_stats(__u8 is_sync_stats) {
+    sync_stats = is_sync_stats;
+  }
+
+  void set_user_info(RGWUserInfo& user_info);
+
+  void set_user_version_tracker(RGWObjVersionTracker& objv_tracker);
+
+  void set_max_buckets(int32_t mb) {
+    max_buckets = mb;
+    max_buckets_specified = true;
+  }
+
+  void set_gen_access() {
+    gen_access = true;
+    key_op = true;
+  }
+
+  void set_gen_secret() {
+    gen_secret = true;
+    key_op = true;
+  }
+
+  // Request generation of whichever key parts were not supplied explicitly.
+  void set_generate_key() {
+    if (id.empty())
+      gen_access = true;
+    if (key.empty())
+      gen_secret = true;
+    key_op = true;
+  }
+
+  void clear_generate_key() {
+    gen_access = false;
+    gen_secret = false;
+  }
+
+  void set_purge_keys() {
+    purge_keys = true;
+    key_op = true;
+  }
+
+  void set_bucket_quota(RGWQuotaInfo& quotas) {
+    quota.bucket_quota = quotas;
+    bucket_quota_specified = true;
+  }
+
+  void set_user_quota(RGWQuotaInfo& quotas) {
+    quota.user_quota = quotas;
+    user_quota_specified = true;
+  }
+
+  void set_bucket_ratelimit(RGWRateLimitInfo& ratelimit) {
+    bucket_ratelimit = ratelimit;
+    bucket_ratelimit_specified = true;
+  }
+
+  void set_user_ratelimit(RGWRateLimitInfo& ratelimit) {
+    user_ratelimit = ratelimit;
+    user_ratelimit_specified = true;
+  }
+
+  void set_mfa_ids(const std::set<std::string>& ids) {
+    mfa_ids = ids;
+    mfa_ids_specified = true;
+  }
+
+  void set_default_placement(const rgw_placement_rule& _placement) {
+    default_placement = _placement;
+    default_placement_specified = true;
+  }
+
+  void set_placement_tags(const std::list<std::string>& _tags) {
+    placement_tags = _tags;
+    placement_tags_specified = true;
+  }
+
+  // Flag/value accessors; these simply expose the state recorded above.
+  bool is_populated() { return populated; }
+  bool is_initialized() { return initialized; }
+  bool has_existing_user() { return existing_user; }
+  bool has_existing_key() { return existing_key; }
+  bool has_existing_subuser() { return existing_subuser; }
+  bool has_existing_email() { return existing_email; }
+  bool has_subuser() { return subuser_specified; }
+  bool has_key_op() { return key_op; }
+  bool has_caps_op() { return caps_specified; }
+  bool has_suspension_op() { return suspension_op; }
+  bool has_subuser_perm() { return perm_specified; }
+  bool has_op_mask() { return op_mask_specified; }
+  bool will_gen_access() { return gen_access; }
+  bool will_gen_secret() { return gen_secret; }
+  bool will_gen_subuser() { return gen_subuser; }
+  bool will_purge_keys() { return purge_keys; }
+  bool will_purge_data() { return purge_data; }
+  bool will_generate_subuser() { return gen_subuser; }
+  bool has_bucket_quota() { return bucket_quota_specified; }
+  bool has_user_quota() { return user_quota_specified; }
+  void set_populated() { populated = true; }
+  void clear_populated() { populated = false; }
+  void set_initialized() { initialized = true; }
+  void set_existing_user(bool flag) { existing_user = flag; }
+  void set_existing_key(bool flag) { existing_key = flag; }
+  void set_existing_subuser(bool flag) { existing_subuser = flag; }
+  void set_existing_email(bool flag) { existing_email = flag; }
+  void set_purge_data(bool flag) { purge_data = flag; }
+  void set_generate_subuser(bool flag) { gen_subuser = flag; }
+  __u8 get_suspension_status() { return suspended; }
+  int32_t get_key_type() {return key_type; }
+  bool get_access_key_exist() {return access_key_exist; }
+  uint32_t get_subuser_perm() { return perm_mask; }
+  int32_t get_max_buckets() { return max_buckets; }
+  uint32_t get_op_mask() { return op_mask; }
+  RGWQuotaInfo& get_bucket_quota() { return quota.bucket_quota; }
+  RGWQuotaInfo& get_user_quota() { return quota.user_quota; }
+  std::set<std::string>& get_mfa_ids() { return mfa_ids; }
+
+  rgw::sal::User* get_user() { return user.get(); }
+  const rgw_user& get_user_id();
+  std::string get_subuser() { return subuser; }
+  std::string get_access_key() { return id; }
+  std::string get_secret_key() { return key; }
+  std::string get_caps() { return caps; }
+  std::string get_user_email() { return user_email; }
+  std::string get_display_name() { return display_name; }
+  rgw_user& get_new_uid() { return new_user_id; }
+  bool get_overwrite_new_user() const { return overwrite_new_user; }
+  std::map<int, std::string>& get_temp_url_keys() { return temp_url_keys; }
+
+  RGWUserInfo&  get_user_info();
+
+  std::map<std::string, RGWAccessKey>* get_swift_keys();
+  std::map<std::string, RGWAccessKey>* get_access_keys();
+  std::map<std::string, RGWSubUser>* get_subusers();
+
+  RGWUserCaps* get_caps_obj();
+
+  std::string build_default_swift_kid();
+
+  std::string generate_subuser();
+
+  RGWUserAdminOpState(rgw::sal::Driver* driver);
+};
+
+class RGWUser;
+
+/*
+ * Manages the S3/Swift access keys of a single RGWUser: creation,
+ * generation, modification and removal.  Keys are disallowed for the
+ * anonymous/null user (keys_allowed stays false).
+ */
+class RGWAccessKeyPool
+{
+  RGWUser *user{nullptr};
+
+  std::map<std::string, int, ltstr_nocase> key_type_map;
+  rgw_user user_id;
+  rgw::sal::Driver* driver{nullptr};
+
+  std::map<std::string, RGWAccessKey> *swift_keys{nullptr};
+  std::map<std::string, RGWAccessKey> *access_keys{nullptr};
+
+  // we don't want to allow keys for the anonymous user or a null user
+  bool keys_allowed{false};
+
+private:
+  int create_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
+  int generate_key(const DoutPrefixProvider *dpp, 
+                   RGWUserAdminOpState& op_state, optional_yield y,
+                  std::string *err_msg = NULL);
+  int modify_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
+
+  int check_key_owner(RGWUserAdminOpState& op_state);
+  bool check_existing_key(RGWUserAdminOpState& op_state);
+  int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
+
+  /* API Contract Fulfillment */
+  int execute_add(const DoutPrefixProvider *dpp, 
+                  RGWUserAdminOpState& op_state, std::string *err_msg,
+                 bool defer_save, optional_yield y);
+  int execute_remove(const DoutPrefixProvider *dpp, 
+                     RGWUserAdminOpState& op_state, std::string *err_msg,
+                    bool defer_save, optional_yield y);
+  int remove_subuser_keys(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+                         bool defer_save, optional_yield y);
+
+  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
+         optional_yield y);
+  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+            bool defer_save, optional_yield y);
+public:
+  explicit RGWAccessKeyPool(RGWUser* usr);
+
+  int init(RGWUserAdminOpState& op_state);
+
+  /* API Contracted Methods */
+  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+         std::string *err_msg = NULL);
+  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+            std::string *err_msg = NULL);
+
+  friend class RGWUser;
+  friend class RGWSubUserPool;
+};
+
+/*
+ * Manages the Swift subusers of a single RGWUser (add/remove/modify).
+ * subusers_allowed gates whether subuser operations are permitted.
+ */
+class RGWSubUserPool
+{
+  RGWUser *user{nullptr};
+
+  rgw_user user_id;
+  rgw::sal::Driver* driver{nullptr};
+  bool subusers_allowed{false};
+
+  std::map<std::string, RGWSubUser> *subuser_map{nullptr};
+
+private:
+  int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
+
+  /* API Contract Fulfillment */
+  int execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
+  int execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
+  int execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
+
+  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
+         optional_yield y);
+  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
+  int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg, bool defer_save);
+public:
+  explicit RGWSubUserPool(RGWUser *user);
+
+  bool exists(std::string subuser);
+  int init(RGWUserAdminOpState& op_state);
+
+  /* API contracted methods */
+  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+         std::string *err_msg = NULL);
+  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+  int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+
+  friend class RGWUser;
+};
+
+/*
+ * Manages the admin capabilities ("caps") attached to a single RGWUser.
+ */
+class RGWUserCapPool
+{
+  RGWUserCaps *caps{nullptr};
+  bool caps_allowed{false};
+  RGWUser *user{nullptr};
+
+private:
+  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
+         optional_yield y);
+  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
+            optional_yield y);
+
+public:
+  explicit RGWUserCapPool(RGWUser *user);
+
+  int init(RGWUserAdminOpState& op_state);
+
+  /* API contracted methods */
+  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+         std::string *err_msg = NULL);
+  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+
+  friend class RGWUser;
+};
+
+/*
+ * High-level handle for administrative operations on one RGW user.
+ * Owns the cap/key/subuser pools and implements add/remove/modify/
+ * rename/info/list against the SAL driver.
+ */
+class RGWUser
+{
+
+private:
+  RGWUserInfo old_info;
+  rgw::sal::Driver* driver{nullptr};
+
+  rgw_user user_id;
+  bool info_stored{false};
+
+  void set_populated() { info_stored = true; }
+  void clear_populated() { info_stored = false; }
+  bool is_populated() { return info_stored; }
+
+  int check_op(RGWUserAdminOpState&  req, std::string *err_msg);
+  int update(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);
+
+  void clear_members();
+  void init_default();
+
+  /* API Contract Fulfillment */
+  int execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+                 optional_yield y);
+  int execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state,
+                    std::string *err_msg, optional_yield y);
+  int execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);
+  int execute_rename(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);
+
+public:
+  RGWUser();
+
+  int init(const DoutPrefixProvider *dpp, rgw::sal::Driver* storage, RGWUserAdminOpState& op_state,
+          optional_yield y);
+
+  int init_storage(rgw::sal::Driver* storage);
+  int init(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y);
+  int init_members(RGWUserAdminOpState& op_state);
+
+  rgw::sal::Driver* get_driver() { return driver; }
+
+  /* API Contracted Members */
+  RGWUserCapPool caps;
+  RGWAccessKeyPool keys;
+  RGWSubUserPool subusers;
+
+  /* API Contracted Methods */
+  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+
+  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+
+  int rename(RGWUserAdminOpState& op_state, optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  /* remove an already populated RGWUser */
+  int remove(std::string *err_msg = NULL);
+
+  int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
+
+  /* retrieve info from an existing user in the RGW system */
+  int info(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info, optional_yield y,
+          std::string *err_msg = NULL);
+
+  /* info from an already populated RGWUser */
+  int info (RGWUserInfo& fetched_info, std::string *err_msg = NULL);
+
+  /* list the existing users */
+  int list(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher);
+
+  friend class RGWAccessKeyPool;
+  friend class RGWSubUserPool;
+  friend class RGWUserCapPool;
+};
+
+/* Wrappers for admin API functionality */
+
+// Admin-API entry points for whole-user operations (list/info/create/
+// modify/remove); results are emitted through the flusher.
+class RGWUserAdminOp_User
+{
+public:
+  static int list(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+                  RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher);
+
+  static int info(const DoutPrefixProvider *dpp,
+                 rgw::sal::Driver* driver,
+                  RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+                 optional_yield y);
+
+  static int create(const DoutPrefixProvider *dpp,
+                   rgw::sal::Driver* driver,
+                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+                   optional_yield y);
+
+  static int modify(const DoutPrefixProvider *dpp,
+                   rgw::sal::Driver* driver,
+                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y);
+
+  static int remove(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+                  RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y);
+};
+
+// Admin-API entry points for subuser operations.
+class RGWUserAdminOp_Subuser
+{
+public:
+  static int create(const DoutPrefixProvider *dpp,
+                   rgw::sal::Driver* driver,
+                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+                   optional_yield y);
+
+  static int modify(const DoutPrefixProvider *dpp,
+                   rgw::sal::Driver* driver,
+                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+                   optional_yield y);
+
+  static int remove(const DoutPrefixProvider *dpp,
+                   rgw::sal::Driver* driver,
+                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+                   optional_yield y);
+};
+
+// Admin-API entry points for access-key operations.
+class RGWUserAdminOp_Key
+{
+public:
+  static int create(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+                   optional_yield y);
+
+  static int remove(const DoutPrefixProvider *dpp,
+                   rgw::sal::Driver* driver,
+                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+                   optional_yield y);
+};
+
+// Admin-API entry points for capability operations.
+class RGWUserAdminOp_Caps
+{
+public:
+  static int add(const DoutPrefixProvider *dpp,
+                rgw::sal::Driver* driver,
+                RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+                optional_yield y);
+
+  static int remove(const DoutPrefixProvider *dpp,
+                   rgw::sal::Driver* driver,
+                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
+                   optional_yield y);
+};
+
+// User info together with its raw attributes; has_attrs records whether
+// an "attrs" member was present when decoding from JSON.
+struct RGWUserCompleteInfo {
+  RGWUserInfo info;
+  std::map<std::string, bufferlist> attrs;
+  bool has_attrs{false};
+
+  void dump(Formatter * const f) const {
+    info.dump(f);
+    encode_json("attrs", attrs, f);
+  }
+
+  void decode_json(JSONObj *obj) {
+    decode_json_obj(info, obj);
+    has_attrs = JSONDecoder::decode_json("attrs", attrs, obj);
+  }
+};
+
+// RGWMetadataObject wrapper carrying an RGWUserCompleteInfo along with
+// the object version and mtime inherited from the base class.
+class RGWUserMetadataObject : public RGWMetadataObject {
+  RGWUserCompleteInfo uci;
+public:
+  RGWUserMetadataObject() {}
+  RGWUserMetadataObject(const RGWUserCompleteInfo& _uci, const obj_version& v, real_time m)
+      : uci(_uci) {
+    objv = v;
+    mtime = m;
+  }
+
+  void dump(Formatter *f) const override {
+    uci.dump(f);
+  }
+
+  RGWUserCompleteInfo& get_uci() {
+    return uci;
+  }
+};
+
+class RGWUserMetadataHandler;
+
+/*
+ * Controller tying the user service (svc.user) and zone service to the
+ * bucket controller; user metadata reads/writes are routed through the
+ * metadata backend handler (be_handler).
+ */
+class RGWUserCtl
+{
+  struct Svc {
+    RGWSI_Zone *zone{nullptr};
+    RGWSI_User *user{nullptr};
+  } svc;
+
+  struct Ctl {
+    RGWBucketCtl *bucket{nullptr};
+  } ctl;
+
+  RGWUserMetadataHandler *umhandler;
+  RGWSI_MetaBackend_Handler *be_handler{nullptr};
+  
+public:
+  RGWUserCtl(RGWSI_Zone *zone_svc,
+             RGWSI_User *user_svc,
+             RGWUserMetadataHandler *_umhandler);
+
+  // Late binding of the bucket controller (set after construction).
+  void init(RGWBucketCtl *bucket_ctl) {
+    ctl.bucket = bucket_ctl;
+  }
+
+  RGWBucketCtl *get_bucket_ctl() {
+    return ctl.bucket;
+  }
+
+  // Optional inputs/outputs for the get_info_* calls (builder style).
+  struct GetParams {
+    RGWObjVersionTracker *objv_tracker{nullptr};
+    ceph::real_time *mtime{nullptr};
+    rgw_cache_entry_info *cache_info{nullptr};
+    std::map<std::string, bufferlist> *attrs{nullptr};
+
+    GetParams() {}
+
+    GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+      objv_tracker = _objv_tracker;
+      return *this;
+    }
+
+    GetParams& set_mtime(ceph::real_time *_mtime) {
+      mtime = _mtime;
+      return *this;
+    }
+
+    GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
+      cache_info = _cache_info;
+      return *this;
+    }
+
+    GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+      attrs = _attrs;
+      return *this;
+    }
+  };
+
+  // Options for store_info() (builder style).
+  struct PutParams {
+    RGWUserInfo *old_info{nullptr};
+    RGWObjVersionTracker *objv_tracker{nullptr};
+    ceph::real_time mtime;
+    bool exclusive{false};
+    std::map<std::string, bufferlist> *attrs{nullptr};
+
+    PutParams() {}
+
+    PutParams& set_old_info(RGWUserInfo *_info) {
+      old_info = _info;
+      return *this;
+    }
+
+    PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+      objv_tracker = _objv_tracker;
+      return *this;
+    }
+
+    PutParams& set_mtime(const ceph::real_time& _mtime) {
+      mtime = _mtime;
+      return *this;
+    }
+
+    PutParams& set_exclusive(bool _exclusive) {
+      exclusive = _exclusive;
+      return *this;
+    }
+
+    PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+      attrs = _attrs;
+      return *this;
+    }
+  };
+
+  // Options for remove_info() (builder style).
+  struct RemoveParams {
+    RGWObjVersionTracker *objv_tracker{nullptr};
+
+    RemoveParams() {}
+
+    RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+      objv_tracker = _objv_tracker;
+      return *this;
+    }
+  };
+
+  // Look up user info by uid / email / swift name / access key.
+  int get_info_by_uid(const DoutPrefixProvider *dpp, 
+                      const rgw_user& uid, RGWUserInfo *info,
+                      optional_yield y, const GetParams& params = {});
+  int get_info_by_email(const DoutPrefixProvider *dpp, 
+                        const std::string& email, RGWUserInfo *info,
+                        optional_yield y, const GetParams& params = {});
+  int get_info_by_swift(const DoutPrefixProvider *dpp, 
+                        const std::string& swift_name, RGWUserInfo *info,
+                        optional_yield y, const GetParams& params = {});
+  int get_info_by_access_key(const DoutPrefixProvider *dpp, 
+                             const std::string& access_key, RGWUserInfo *info,
+                             optional_yield y, const GetParams& params = {});
+
+  int get_attrs_by_uid(const DoutPrefixProvider *dpp, 
+                       const rgw_user& user_id,
+                       std::map<std::string, bufferlist> *attrs,
+                       optional_yield y,
+                       RGWObjVersionTracker *objv_tracker = nullptr);
+
+  int store_info(const DoutPrefixProvider *dpp, 
+                 const RGWUserInfo& info, optional_yield y,
+                 const PutParams& params = {});
+  int remove_info(const DoutPrefixProvider *dpp, 
+                  const RGWUserInfo& info, optional_yield y,
+                  const RemoveParams& params = {});
+
+  int list_buckets(const DoutPrefixProvider *dpp, 
+                   const rgw_user& user,
+                   const std::string& marker,
+                   const std::string& end_marker,
+                   uint64_t max,
+                   bool need_stats,
+                   RGWUserBuckets *buckets,
+                   bool *is_truncated,
+                  optional_yield y,
+                   uint64_t default_max = 1000);
+
+  int read_stats(const DoutPrefixProvider *dpp, 
+                 const rgw_user& user, RGWStorageStats *stats,
+                optional_yield y,
+                ceph::real_time *last_stats_sync = nullptr,     /* last time a full stats sync completed */
+                ceph::real_time *last_stats_update = nullptr);   /* last time a stats update was done */
+};
+
+// Factory for the user metadata handler; caller owns the result.
+class RGWUserMetaHandlerAllocator {
+public:
+  static RGWMetadataHandler *alloc(RGWSI_User *user_svc);
+};
+
+
+#endif
diff --git a/src/rgw/driver/rados/rgw_zone.cc b/src/rgw/driver/rados/rgw_zone.cc
new file mode 100644 (file)
index 0000000..d9e7505
--- /dev/null
@@ -0,0 +1,1287 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_zone.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_sal_config.h"
+#include "rgw_sync.h"
+
+#include "services/svc_zone.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw_zone_defaults;
+
+// Out-of-line empty destructor; presumably defined here so the class can be
+// declared with incomplete member types in the header — TODO confirm.
+RGWMetaSyncStatusManager::~RGWMetaSyncStatusManager(){}
+
+#define FIRST_EPOCH 1
+
+struct RGWAccessKey;
+
+/// Produce a fresh random uuid string; used as the id for newly created
+/// realms, periods, zonegroups and zones.
+static std::string gen_random_uuid()
+{
+  uuid_d u;
+  u.generate_random();
+  return u.to_string();
+}
+
+// JSON-encode: emits the single "default_zonegroup" field.
+void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
+  encode_json("default_zonegroup", default_zonegroup, f);
+}
+
+// JSON-decode: reads "default_zonegroup", falling back to the legacy
+// "default_region" key from the pre-zonegroup ("region") era.
+void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
+
+  JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
+  /* backward compatibility with region */
+  if (default_zonegroup.empty()) {
+    JSONDecoder::decode_json("default_region", default_zonegroup, obj);
+  }
+}
+
+// Build and persist the hard-coded "default" zonegroup: a master zonegroup
+// with a "default-placement" target and one zone created from the default
+// zone params.  Creation races are resolved by re-reading whichever object
+// won (-EEXIST paths below).  With old_format, the name is reset to the id.
+int RGWZoneGroup::create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format)
+{
+  name = default_zonegroup_name;
+  api_name = default_zonegroup_name;
+  is_master = true;
+
+  RGWZoneGroupPlacementTarget placement_target;
+  placement_target.name = "default-placement";
+  placement_targets[placement_target.name] = placement_target;
+  default_placement.name = "default-placement";
+
+  RGWZoneParams zone_params(default_zone_name);
+
+  // init without reading from the backing store (setup_obj=false)
+  int r = zone_params.init(dpp, cct, sysobj_svc, y, false);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  r = zone_params.create_default(dpp, y);
+  if (r < 0 && r != -EEXIST) {
+    ldpp_dout(dpp, 0) << "create_default: error in create_default  zone params: " << cpp_strerror(-r) << dendl;
+    return r;
+  } else if (r == -EEXIST) {
+    // lost the race: drop our generated id and re-read the winner's object
+    ldpp_dout(dpp, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
+    zone_params.clear_id();
+    r = zone_params.init(dpp, cct, sysobj_svc, y);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    ldpp_dout(dpp, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
+                  << dendl;
+  }
+  
+  RGWZone& default_zone = zones[zone_params.get_id()];
+  default_zone.name = zone_params.get_name();
+  default_zone.id = zone_params.get_id();
+  master_zone = default_zone.id;
+
+  // enable all supported features
+  enabled_features.insert(rgw::zone_features::supported.begin(),
+                          rgw::zone_features::supported.end());
+  default_zone.supported_features = enabled_features;
+  
+  r = create(dpp, y);
+  if (r < 0 && r != -EEXIST) {
+    ldpp_dout(dpp, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  if (r == -EEXIST) {
+    // same race-recovery pattern for the zonegroup object itself
+    ldpp_dout(dpp, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
+    id.clear();
+    r = init(dpp, cct, sysobj_svc, y);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  if (old_format) {
+    name = id;
+  }
+
+  post_process_params(dpp, y);
+
+  return 0;
+}
+
+// Return nonzero when this zonegroup matches 'other_zonegroup' by id.  An
+// empty argument matches the master zonegroup (legacy shorthand).
+// NOTE(review): declared int but used as a boolean.
+int RGWZoneGroup::equals(const string& other_zonegroup) const
+{
+  if (is_master && other_zonegroup.empty())
+    return true;
+
+  return (id  == other_zonegroup);
+}
+
+// Add a zone to this zonegroup, or modify an existing entry in place
+// (zones[zone_id] inserts on first use).  Optional out/in params are only
+// applied when non-null.  Validates name uniqueness on insert and that any
+// requested tier type names a registered sync module.  Persists via update().
+int RGWZoneGroup::add_zone(const DoutPrefixProvider *dpp, 
+                           const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
+                           const list<string>& endpoints, const string *ptier_type,
+                           bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm,
+                           string *predirect_zone, std::optional<int> bucket_index_max_shards,
+                           RGWSyncModulesManager *sync_mgr,
+                           const rgw::zone_features::set& enable_features,
+                           const rgw::zone_features::set& disable_features,
+                          optional_yield y)
+{
+  auto& zone_id = zone_params.get_id();
+  auto& zone_name = zone_params.get_name();
+
+  // check for duplicate zone name on insert
+  if (!zones.count(zone_id)) {
+    for (const auto& zone : zones) {
+      if (zone.second.name == zone_name) {
+        ldpp_dout(dpp, 0) << "ERROR: found existing zone name " << zone_name
+            << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
+        return -EEXIST;
+      }
+    }
+  }
+
+  // set or clear the master designation; only one master zone per zonegroup
+  if (is_master) {
+    if (*is_master) {
+      if (!master_zone.empty() && master_zone != zone_id) {
+        ldpp_dout(dpp, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
+      }
+      master_zone = zone_id;
+    } else if (master_zone == zone_id) {
+      master_zone.clear();
+    }
+  }
+
+  RGWZone& zone = zones[zone_id];
+  zone.name = zone_name;
+  zone.id = zone_id;
+  if (!endpoints.empty()) {
+    zone.endpoints = endpoints;
+  }
+  if (read_only) {
+    zone.read_only = *read_only;
+  }
+  if (ptier_type) {
+    // the tier type must correspond to a registered sync module
+    zone.tier_type = *ptier_type;
+    if (!sync_mgr->get_module(*ptier_type, nullptr)) {
+      ldpp_dout(dpp, 0) << "ERROR: could not found sync module: " << *ptier_type 
+                    << ",  valid sync modules: " 
+                    << sync_mgr->get_registered_module_names()
+                    << dendl;
+      return -ENOENT;
+    }
+  }
+
+  if (psync_from_all) {
+    zone.sync_from_all = *psync_from_all;
+  }
+
+  if (predirect_zone) {
+    zone.redirect_zone = *predirect_zone;
+  }
+
+  if (bucket_index_max_shards) {
+    zone.bucket_index_max_shards = *bucket_index_max_shards;
+  }
+
+  // apply additions before removals to the sync-source set
+  for (auto add : sync_from) {
+    zone.sync_from.insert(add);
+  }
+
+  for (auto rm : sync_from_rm) {
+    zone.sync_from.erase(rm);
+  }
+
+  zone.supported_features.insert(enable_features.begin(),
+                                 enable_features.end());
+
+  // a feature can only be disabled per-zone after it has been disabled at
+  // the zonegroup level; disabling a feature that was never enabled in the
+  // zone is tolerated with a warning
+  for (const auto& feature : disable_features) {
+    if (enabled_features.contains(feature)) {
+      lderr(cct) << "ERROR: Cannot disable zone feature \"" << feature
+          << "\" until it's been disabled in zonegroup " << name << dendl;
+      return -EINVAL;
+    }
+    auto i = zone.supported_features.find(feature);
+    if (i == zone.supported_features.end()) {
+      ldout(cct, 1) << "WARNING: zone feature \"" << feature
+          << "\" was not enabled in zone " << zone.name << dendl;
+      continue;
+    }
+    zone.supported_features.erase(i);
+  }
+
+  post_process_params(dpp, y);
+
+  return update(dpp,y);
+}
+
+
+// Refresh the cached name of a member zone from its zone params, then
+// persist the zonegroup.
+int RGWZoneGroup::rename_zone(const DoutPrefixProvider *dpp, 
+                              const RGWZoneParams& zone_params,
+                             optional_yield y)
+{
+  auto& entry = zones[zone_params.get_id()];
+  entry.name = zone_params.get_name();
+  return update(dpp, y);
+}
+
+// Recompute derived zonegroup state after membership changes: data-log
+// recording is only needed with more than one zone, a master zone is
+// elected if unset, placement targets are unioned from every readable
+// zone's placement pools, and a default placement is chosen if missing.
+void RGWZoneGroup::post_process_params(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  bool log_data = zones.size() > 1;
+
+  if (master_zone.empty()) {
+    // no master designated: arbitrarily pick the first zone, if any
+    auto iter = zones.begin();
+    if (iter != zones.end()) {
+      master_zone = iter->first;
+    }
+  }
+  
+  for (auto& item : zones) {
+    RGWZone& zone = item.second;
+    zone.log_data = log_data;
+
+    RGWZoneParams zone_params(zone.id, zone.name);
+    int ret = zone_params.init(dpp, cct, sysobj_svc, y);
+    if (ret < 0) {
+      // best-effort: skip zones whose params can't be read
+      ldpp_dout(dpp, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
+      continue;
+    }
+
+    for (auto& pitem : zone_params.placement_pools) {
+      const string& placement_name = pitem.first;
+      if (placement_targets.find(placement_name) == placement_targets.end()) {
+        RGWZoneGroupPlacementTarget placement_target;
+        placement_target.name = placement_name;
+        placement_targets[placement_name] = placement_target;
+      }
+    }
+  }
+
+  if (default_placement.empty() && !placement_targets.empty()) {
+    default_placement.init(placement_targets.begin()->first, RGW_STORAGE_CLASS_STANDARD);
+  }
+}
+
+// Remove a zone (by id) from this zonegroup, recompute derived params and
+// persist the result.  Returns -ENOENT when the zone isn't a member.
+int RGWZoneGroup::remove_zone(const DoutPrefixProvider *dpp, const std::string& zone_id, optional_yield y)
+{
+  auto it = zones.find(zone_id);
+  if (it == zones.end()) {
+    ldpp_dout(dpp, 0) << "zone id " << zone_id << " is not a part of zonegroup "
+        << name << dendl;
+    return -ENOENT;
+  }
+  zones.erase(it);
+
+  post_process_params(dpp, y);
+  return update(dpp, y);
+}
+
+// JSON-encode: emits the single "default_id" field.
+void RGWDefaultSystemMetaObjInfo::dump(Formatter *f) const {
+  encode_json("default_id", default_id, f);
+}
+
+// JSON-decode counterpart of dump(): reads "default_id".
+void RGWDefaultSystemMetaObjInfo::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("default_id", default_id, obj);
+}
+
+// Rename this object: refuse if the new name already resolves to an id,
+// then (in order) rewrite the info object, store the new name->id mapping,
+// and finally delete the old name object.  The ordering keeps a resolvable
+// mapping at every step; a crash mid-way can leave a stale old-name entry.
+int RGWSystemMetaObj::rename(const DoutPrefixProvider *dpp, const string& new_name, optional_yield y)
+{
+  string new_id;
+  int ret = read_id(dpp, new_name, new_id, y);
+  if (!ret) {
+    // new name is already taken
+    return -EEXIST;
+  }
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  string old_name = name;
+  name = new_name;
+  ret = update(dpp, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  ret = store_name(dpp, true, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  /* delete old name */
+  rgw_pool pool(get_pool(cct));
+  string oid = get_names_oid_prefix() + old_name;
+  rgw_raw_obj old_name_obj(pool, oid);
+  auto sysobj = sysobj_svc->get_obj(old_name_obj);
+  ret = sysobj.wop().remove(dpp, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "Error delete old obj name  " << old_name << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  return ret;
+}
+
+// Resolve our id from the stored name mapping, then load the full object.
+int RGWSystemMetaObj::read(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  int r = read_id(dpp, name, id, y);
+  if (r < 0) {
+    return r;
+  }
+  return read_info(dpp, id, y);
+}
+
+// Create the hard-coded "default" zone params object.  With old_format,
+// the name is reset to the generated id (legacy layout).
+int RGWZoneParams::create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format)
+{
+  name = default_zone_name;
+
+  const int r = create(dpp, y);
+  if (r < 0)
+    return r;
+
+  if (old_format)
+    name = id;
+
+  return r;
+}
+
+// Compression type configured for the given placement rule's storage
+// class; returns "none" when the rule or the class has no compression.
+const string& RGWZoneParams::get_compression_type(const rgw_placement_rule& placement_rule) const
+{
+  static const std::string NONE{"none"};
+  auto it = placement_pools.find(placement_rule.name);
+  if (it == placement_pools.end()) {
+    return NONE;
+  }
+  const auto& type = it->second.get_compression_type(placement_rule.get_storage_class());
+  if (type.empty()) {
+    return NONE;
+  }
+  return type;
+}
+
+// Derive a stable 32-bit "short id" for a zone: MD5 the zone uuid
+// (non-cryptographic use) and take the first 4 bytes, clamped to >= 1 so
+// that 0 can be used as a "no such zone" sentinel (see get_zone_short_id).
+// Takes the id by const reference — the previous by-value parameter copied
+// the string on every call for no benefit.
+static uint32_t gen_short_zone_id(const std::string& zone_id)
+{
+  unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  MD5 hash;
+  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  hash.Update((const unsigned char *)zone_id.c_str(), zone_id.size());
+  hash.Final(md5);
+
+  uint32_t short_id;
+  memcpy((char *)&short_id, md5, sizeof(short_id));
+  return std::max(short_id, 1u);
+}
+
+// Merge a zonegroup into this period map: replace its entry (and its
+// by-api index), maintain the single-master invariant, and assign a
+// 32-bit short id to every new zone, failing on short-id collisions.
+int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
+{
+  // only one master zonegroup may exist in a period
+  if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
+    ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
+    ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and  " << zonegroup.get_id() <<dendl;
+    return -EINVAL;
+  }
+  map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
+  if (iter != zonegroups.end()) {
+    // drop the stale by-api index entry before overwriting
+    RGWZoneGroup& old_zonegroup = iter->second;
+    if (!old_zonegroup.api_name.empty()) {
+      zonegroups_by_api.erase(old_zonegroup.api_name);
+    }
+  }
+  zonegroups[zonegroup.get_id()] = zonegroup;
+
+  if (!zonegroup.api_name.empty()) {
+    zonegroups_by_api[zonegroup.api_name] = zonegroup;
+  }
+
+  if (zonegroup.is_master_zonegroup()) {
+    master_zonegroup = zonegroup.get_id();
+  } else if (master_zonegroup == zonegroup.get_id()) {
+    // this zonegroup was demoted; clear the stale master reference
+    master_zonegroup = "";
+  }
+
+  for (auto& i : zonegroup.zones) {
+    auto& zone = i.second;
+    if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
+      continue;
+    }
+    // calculate the zone's short id
+    uint32_t short_id = gen_short_zone_id(zone.id);
+
+    // search for an existing zone with the same short id
+    for (auto& s : short_zone_ids) {
+      if (s.second == short_id) {
+        ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
+            << ") generates the same short_zone_id " << short_id
+            << " as existing zone id " << s.first << dendl;
+        return -EEXIST;
+      }
+    }
+
+    short_zone_ids[zone.id] = short_id;
+  }
+
+  return 0;
+}
+
+// Precomputed short id for 'zone_id'; 0 means the zone is unknown.
+uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
+{
+  auto it = short_zone_ids.find(zone_id);
+  return (it == short_zone_ids.end()) ? 0 : it->second;
+}
+
+bool RGWPeriodMap::find_zone_by_name(const string& zone_name,
+                                     RGWZoneGroup *zonegroup,
+                                     RGWZone *zone) const
+{
+  for (auto& iter : zonegroups) {
+    auto& zg = iter.second;
+    for (auto& ziter : zg.zones) {
+      auto& z = ziter.second;
+
+      if (z.name == zone_name) {
+        *zonegroup = zg;
+        *zone = z;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+namespace rgw {
+
+int read_realm(const DoutPrefixProvider* dpp, optional_yield y,
+               sal::ConfigStore* cfgstore,
+               std::string_view realm_id,
+               std::string_view realm_name,
+               RGWRealm& info,
+               std::unique_ptr<sal::RealmWriter>* writer)
+{
+  if (!realm_id.empty()) {
+    return cfgstore->read_realm_by_id(dpp, y, realm_id, info, writer);
+  }
+  if (!realm_name.empty()) {
+    return cfgstore->read_realm_by_name(dpp, y, realm_name, info, writer);
+  }
+  return cfgstore->read_default_realm(dpp, y, info, writer);
+}
+
+// Create a realm: generate an id if absent, validate (or exclusive-create)
+// its current period, persist the realm, point it at the period, and try
+// to register it as the cluster default (best-effort; -EEXIST is benign).
+// On success, 'writer_out' (optional) receives the realm's writer handle.
+int create_realm(const DoutPrefixProvider* dpp, optional_yield y,
+                 sal::ConfigStore* cfgstore, bool exclusive,
+                 RGWRealm& info,
+                 std::unique_ptr<sal::RealmWriter>* writer_out)
+{
+  if (info.name.empty()) {
+    ldpp_dout(dpp, -1) << __func__ << " requires a realm name" << dendl;
+    return -EINVAL;
+  }
+  if (info.id.empty()) {
+    info.id = gen_random_uuid();
+  }
+
+  // if the realm already has a current_period, just make sure it exists
+  std::optional<RGWPeriod> period;
+  if (!info.current_period.empty()) {
+    period.emplace();
+    int r = cfgstore->read_period(dpp, y, info.current_period,
+                                  std::nullopt, *period);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __func__ << " failed to read realm's current_period="
+          << info.current_period << " with " << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+
+  // create the realm
+  std::unique_ptr<sal::RealmWriter> writer;
+  int r = cfgstore->create_realm(dpp, y, exclusive, info, &writer);
+  if (r < 0) {
+    return r;
+  }
+
+  if (!period) {
+    // initialize and exclusive-create the initial period
+    period.emplace();
+    period->id = gen_random_uuid();
+    period->period_map.id = period->id;
+    period->epoch = FIRST_EPOCH;
+    period->realm_id = info.id;
+    period->realm_name = info.name;
+
+    r = cfgstore->create_period(dpp, y, true, *period);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __func__ << " failed to create the initial period id="
+          << period->id << " for realm " << info.name
+          << " with " << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+
+  // update the realm's current_period
+  r = realm_set_current_period(dpp, y, cfgstore, *writer, info, *period);
+  if (r < 0) {
+    return r;
+  }
+
+  // try to set as default. may race with another create, so pass exclusive=true
+  // so we don't override an existing default
+  r = set_default_realm(dpp, y, cfgstore, info, true);
+  if (r < 0 && r != -EEXIST) {
+    ldpp_dout(dpp, 0) << "WARNING: failed to set realm as default: "
+        << cpp_strerror(r) << dendl;
+  }
+
+  if (writer_out) {
+    *writer_out = std::move(writer);
+  }
+  return 0;
+}
+
+// Record 'info' as the cluster's default realm.  With exclusive=true the
+// write fails with -EEXIST if a default is already set (used to avoid
+// clobbering a concurrent creator's default).
+int set_default_realm(const DoutPrefixProvider* dpp, optional_yield y,
+                      sal::ConfigStore* cfgstore, const RGWRealm& info,
+                      bool exclusive)
+{
+  return cfgstore->write_default_realm_id(dpp, y, exclusive, info.id);
+}
+
+// Advance the realm's current_period/epoch to match 'period', after
+// sanity-checking that the period doesn't move the realm epoch backwards
+// or swap period ids within the same epoch.  Persists the realm, then
+// best-effort reflects the period's zonegroups/config locally.
+int realm_set_current_period(const DoutPrefixProvider* dpp, optional_yield y,
+                             sal::ConfigStore* cfgstore,
+                             sal::RealmWriter& writer, RGWRealm& realm,
+                             const RGWPeriod& period)
+{
+  // update realm epoch to match the period's
+  if (realm.epoch > period.realm_epoch) {
+    ldpp_dout(dpp, -1) << __func__ << " with old realm epoch "
+        << period.realm_epoch << ", current epoch=" << realm.epoch << dendl;
+    return -EINVAL;
+  }
+  if (realm.epoch == period.realm_epoch && realm.current_period != period.id) {
+    ldpp_dout(dpp, -1) << __func__ << " with same realm epoch "
+        << period.realm_epoch << ", but different period id "
+        << period.id << " != " << realm.current_period << dendl;
+    return -EINVAL;
+  }
+
+  realm.epoch = period.realm_epoch;
+  realm.current_period = period.id;
+
+  // update the realm object
+  int r = writer.write(dpp, y, realm);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __func__ << " failed to overwrite realm "
+        << realm.name << " with " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // reflect the zonegroup and period config; failure is deliberately ignored
+  (void) reflect_period(dpp, y, cfgstore, period);
+  return 0;
+}
+
+// Apply a period's contents to local configuration: overwrite the realm's
+// period config and every zonegroup in the period map, and set the master
+// zonegroup as default when no default exists yet.
+int reflect_period(const DoutPrefixProvider* dpp, optional_yield y,
+                   sal::ConfigStore* cfgstore, const RGWPeriod& info)
+{
+  // overwrite the local period config and zonegroup objects
+  constexpr bool exclusive = false;
+
+  int r = cfgstore->write_period_config(dpp, y, exclusive, info.realm_id,
+                                        info.period_config);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __func__ << " failed to store period config for realm id="
+        << info.realm_id << " with " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  for (auto& [zonegroup_id, zonegroup] : info.period_map.zonegroups) {
+    r = cfgstore->create_zonegroup(dpp, y, exclusive, zonegroup, nullptr);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __func__ << " failed to store zonegroup id="
+          << zonegroup_id << " with " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    if (zonegroup.is_master) {
+      // set master as default if no default exists
+      constexpr bool exclusive = true;
+      r = set_default_zonegroup(dpp, y, cfgstore, zonegroup, exclusive);
+      if (r == 0) {
+        ldpp_dout(dpp, 1) << "Set the period's master zonegroup "
+            << zonegroup.name << " as the default" << dendl;
+      }
+    }
+  }
+  return 0;
+}
+
+// Id of a realm's mutable "staging" period: "<realm_id>:staging".
+std::string get_staging_period_id(std::string_view realm_id)
+{
+  return string_cat_reserve(realm_id, ":staging");
+}
+
+// Turn 'info' into the realm's staging period: remember the current id as
+// the predecessor, switch to the ":staging" id, reset the period map and
+// bump the realm epoch.  NOTE: predecessor_uuid must be assigned before id
+// is overwritten — statement order matters here.
+void fork_period(const DoutPrefixProvider* dpp, RGWPeriod& info)
+{
+  ldpp_dout(dpp, 20) << __func__ << " realm id=" << info.realm_id
+      << " period id=" << info.id << dendl;
+
+  info.predecessor_uuid = std::move(info.id);
+  info.id = get_staging_period_id(info.realm_id);
+  info.period_map.reset();
+  info.realm_epoch++;
+}
+
+int update_period(const DoutPrefixProvider* dpp, optional_yield y,
+                  sal::ConfigStore* cfgstore, RGWPeriod& info)
+{
+  // clear zone short ids of removed zones. period_map.update() will add the
+  // remaining zones back
+  info.period_map.short_zone_ids.clear();
+
+  // list all zonegroups in the realm
+  rgw::sal::ListResult<std::string> listing;
+  std::array<std::string, 1000> zonegroup_names; // list in pages of 1000
+  do {
+    int ret = cfgstore->list_zonegroup_names(dpp, y, listing.next,
+                                             zonegroup_names, listing);
+    if (ret < 0) {
+      std::cerr << "failed to list zonegroups: " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+    for (const auto& name : listing.entries) {
+      RGWZoneGroup zg;
+      ret = cfgstore->read_zonegroup_by_name(dpp, y, name, zg, nullptr);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "WARNING: failed to read zonegroup "
+            << name << ": " << cpp_strerror(-ret) << dendl;
+        continue;
+      }
+
+      if (zg.realm_id != info.realm_id) {
+        ldpp_dout(dpp, 20) << "skipping zonegroup " << zg.get_name()
+            << " with realm id " << zg.realm_id
+            << ", not on our realm " << info.realm_id << dendl;
+        continue;
+      }
+
+      if (zg.master_zone.empty()) {
+        ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
+        return -EINVAL;
+      }
+
+      if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
+        ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name()
+                     << " has a non existent master zone "<< dendl;
+        return -EINVAL;
+      }
+
+      if (zg.is_master_zonegroup()) {
+        info.master_zonegroup = zg.get_id();
+        info.master_zone = zg.master_zone;
+      }
+
+      ret = info.period_map.update(zg, dpp->get_cct());
+      if (ret < 0) {
+        return ret;
+      }
+    } // foreach name in listing.entries
+  } while (!listing.next.empty());
+
+  // read the realm's current period config
+  int ret = cfgstore->read_period_config(dpp, y, info.realm_id,
+                                         info.period_config);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to read period config: "
+        << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Commit a staging period.  Must run on the period's master zone, with a
+// predecessor matching the current period and realm epoch exactly one
+// ahead.  If the master zone changed, a brand-new period (new id, epoch 1)
+// is created and made the realm's current period; otherwise the period is
+// stored as the next epoch of the current period and reflected locally.
+// Peers are notified best-effort in both cases.
+int commit_period(const DoutPrefixProvider* dpp, optional_yield y,
+                  sal::ConfigStore* cfgstore, sal::Driver* driver,
+                  RGWRealm& realm, sal::RealmWriter& realm_writer,
+                  const RGWPeriod& current_period,
+                  RGWPeriod& info, std::ostream& error_stream,
+                  bool force_if_stale)
+{
+  auto zone_svc = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone; // XXX
+
+  ldpp_dout(dpp, 20) << __func__ << " realm " << realm.id
+      << " period " << current_period.id << dendl;
+  // gateway must be in the master zone to commit
+  if (info.master_zone != zone_svc->get_zone_params().id) {
+    error_stream << "Cannot commit period on zone "
+        << zone_svc->get_zone_params().id << ", it must be sent to "
+        "the period's master zone " << info.master_zone << '.' << std::endl;
+    return -EINVAL;
+  }
+  // period predecessor must match current period
+  if (info.predecessor_uuid != current_period.id) {
+    error_stream << "Period predecessor " << info.predecessor_uuid
+        << " does not match current period " << current_period.id
+        << ". Use 'period pull' to get the latest period from the master, "
+        "reapply your changes, and try again." << std::endl;
+    return -EINVAL;
+  }
+  // realm epoch must be 1 greater than current period
+  if (info.realm_epoch != current_period.realm_epoch + 1) {
+    error_stream << "Period's realm epoch " << info.realm_epoch
+        << " does not come directly after current realm epoch "
+        << current_period.realm_epoch << ". Use 'realm pull' to get the "
+        "latest realm and period from the master zone, reapply your changes, "
+        "and try again." << std::endl;
+    return -EINVAL;
+  }
+  // did the master zone change?
+  if (info.master_zone != current_period.master_zone) {
+    // store the current metadata sync status in the period
+    int r = info.update_sync_status(dpp, driver, current_period,
+                                    error_stream, force_if_stale);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "failed to update metadata sync status: "
+          << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    // create an object with a new period id
+    info.period_map.id = info.id = gen_random_uuid();
+    info.epoch = FIRST_EPOCH;
+
+    constexpr bool exclusive = true;
+    r = cfgstore->create_period(dpp, y, exclusive, info);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    // set as current period
+    r = realm_set_current_period(dpp, y, cfgstore, realm_writer, realm, info);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "failed to update realm's current period: "
+          << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    ldpp_dout(dpp, 4) << "Promoted to master zone and committed new period "
+        << info.id << dendl;
+    // best-effort notification; failures are ignored
+    (void) cfgstore->realm_notify_new_period(dpp, y, info);
+    return 0;
+  }
+  // period must be based on current epoch
+  if (info.epoch != current_period.epoch) {
+    error_stream << "Period epoch " << info.epoch << " does not match "
+        "predecessor epoch " << current_period.epoch << ". Use "
+        "'period pull' to get the latest epoch from the master zone, "
+        "reapply your changes, and try again." << std::endl;
+    return -EINVAL;
+  }
+  // set period as next epoch
+  info.id = current_period.id;
+  info.epoch = current_period.epoch + 1;
+  info.predecessor_uuid = current_period.predecessor_uuid;
+  info.realm_epoch = current_period.realm_epoch;
+  // write the period
+  constexpr bool exclusive = true;
+  int r = cfgstore->create_period(dpp, y, exclusive, info);
+  if (r == -EEXIST) {
+    // already have this epoch (or a more recent one)
+    return 0;
+  }
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to store period: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  r = reflect_period(dpp, y, cfgstore, info);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to update local objects: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  ldpp_dout(dpp, 4) << "Committed new epoch " << info.epoch
+      << " for period " << info.id << dendl;
+  (void) cfgstore->realm_notify_new_period(dpp, y, info);
+  return 0;
+}
+
+
+int read_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+                   sal::ConfigStore* cfgstore,
+                   std::string_view zonegroup_id,
+                   std::string_view zonegroup_name,
+                   RGWZoneGroup& info,
+                   std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  if (!zonegroup_id.empty()) {
+    return cfgstore->read_zonegroup_by_id(dpp, y, zonegroup_id, info, writer);
+  }
+  if (!zonegroup_name.empty()) {
+    return cfgstore->read_zonegroup_by_name(dpp, y, zonegroup_name, info, writer);
+  }
+
+  std::string realm_id;
+  int r = cfgstore->read_default_realm_id(dpp, y, realm_id);
+  if (r == -ENOENT) {
+    return cfgstore->read_zonegroup_by_name(dpp, y, default_zonegroup_name,
+                                            info, writer);
+  }
+  if (r < 0) {
+    return r;
+  }
+  return cfgstore->read_default_zonegroup(dpp, y, realm_id, info, writer);
+}
+
+// Create a zonegroup: generate an id if absent, ensure a "default-placement"
+// target exists (and becomes the default placement when none is set),
+// persist it, and try to register it as the default zonegroup (best-effort;
+// -EEXIST from the default write is benign).
+int create_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+                     sal::ConfigStore* cfgstore, bool exclusive,
+                     RGWZoneGroup& info)
+{
+  if (info.name.empty()) {
+    ldpp_dout(dpp, -1) << __func__ << " requires a zonegroup name" << dendl;
+    return -EINVAL;
+  }
+  if (info.id.empty()) {
+    info.id = gen_random_uuid();
+  }
+
+  // insert the default placement target if it doesn't exist
+  constexpr std::string_view default_placement_name = "default-placement";
+
+  RGWZoneGroupPlacementTarget placement_target;
+  placement_target.name = default_placement_name;
+
+  // emplace is a no-op if the key already exists
+  info.placement_targets.emplace(default_placement_name, placement_target);
+  if (info.default_placement.name.empty()) {
+    info.default_placement.name = default_placement_name;
+  }
+
+  int r = cfgstore->create_zonegroup(dpp, y, exclusive, info, nullptr);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to create zonegroup with "
+        << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // try to set as default. may race with another create, so pass exclusive=true
+  // so we don't override an existing default
+  r = set_default_zonegroup(dpp, y, cfgstore, info, true);
+  if (r < 0 && r != -EEXIST) {
+    ldpp_dout(dpp, 0) << "WARNING: failed to set zonegroup as default: "
+        << cpp_strerror(r) << dendl;
+  }
+
+  return 0;
+}
+
+// Record 'info' as the default zonegroup within its realm.  With
+// exclusive=true the write fails with -EEXIST if a default is already set.
+int set_default_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+                          sal::ConfigStore* cfgstore, const RGWZoneGroup& info,
+                          bool exclusive)
+{
+  return cfgstore->write_default_zonegroup_id(
+      dpp, y, exclusive, info.realm_id, info.id);
+}
+
+int remove_zone_from_group(const DoutPrefixProvider* dpp,
+                           RGWZoneGroup& zonegroup,
+                           const rgw_zone_id& zone_id)
+{
+  auto z = zonegroup.zones.find(zone_id);
+  if (z == zonegroup.zones.end()) {
+    return -ENOENT;
+  }
+  zonegroup.zones.erase(z);
+
+  if (zonegroup.master_zone == zone_id) {
+    // choose a new master zone
+    auto m = zonegroup.zones.begin();
+    if (m != zonegroup.zones.end()) {
+      zonegroup.master_zone = m->first;
+      ldpp_dout(dpp, 0) << "NOTICE: promoted " << m->second.name
+         << " as new master_zone of zonegroup " << zonegroup.name << dendl;
+    } else {
+      zonegroup.master_zone.clear();
+      ldpp_dout(dpp, 0) << "NOTICE: cleared master_zone of zonegroup "
+          << zonegroup.name << dendl;
+    }
+  }
+
+  const bool log_data = zonegroup.zones.size() > 1;
+  for (auto& [id, zone] : zonegroup.zones) {
+    zone.log_data = log_data;
+  }
+
+  return 0;
+}
+
+// try to remove the given zone id from every zonegroup in the cluster
+// Best-effort sweep: zonegroups that fail to load, don't contain the zone,
+// or fail to write are logged and skipped; the sweep itself returns 0.
+static int remove_zone_from_groups(const DoutPrefixProvider* dpp,
+                                   optional_yield y,
+                                   sal::ConfigStore* cfgstore,
+                                   const rgw_zone_id& zone_id)
+{
+  // page through zonegroup names, 128 at a time
+  std::array<std::string, 128> zonegroup_names;
+  sal::ListResult<std::string> listing;
+  do {
+    int r = cfgstore->list_zonegroup_names(dpp, y, listing.next,
+                                           zonegroup_names, listing);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "failed to list zonegroups with "
+          << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    for (const auto& name : listing.entries) {
+      RGWZoneGroup zonegroup;
+      std::unique_ptr<sal::ZoneGroupWriter> writer;
+      r = cfgstore->read_zonegroup_by_name(dpp, y, name, zonegroup, &writer);
+      if (r < 0) {
+        ldpp_dout(dpp, 0) << "WARNING: failed to load zonegroup " << name
+            << " with " << cpp_strerror(r) << dendl;
+        continue;
+      }
+
+      // -ENOENT here just means the zone wasn't a member of this group
+      r = remove_zone_from_group(dpp, zonegroup, zone_id);
+      if (r < 0) {
+        continue;
+      }
+
+      // write the updated zonegroup
+      r = writer->write(dpp, y, zonegroup);
+      if (r < 0) {
+        ldpp_dout(dpp, 0) << "WARNING: failed to write zonegroup " << name
+            << " with " << cpp_strerror(r) << dendl;
+        continue;
+      }
+      ldpp_dout(dpp, 0) << "Removed zone from zonegroup " << name << dendl;
+    }
+  } while (!listing.next.empty());
+
+  return 0;
+}
+
+
+// Look up zone params with the following precedence: by id when zone_id is
+// non-empty, else by name when zone_name is non-empty, else the default zone
+// of the default realm; when no default realm exists, fall back to the
+// compiled-in default zone name.
+int read_zone(const DoutPrefixProvider* dpp, optional_yield y,
+              sal::ConfigStore* cfgstore,
+              std::string_view zone_id,
+              std::string_view zone_name,
+              RGWZoneParams& info,
+              std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  if (!zone_id.empty()) {
+    return cfgstore->read_zone_by_id(dpp, y, zone_id, info, writer);
+  }
+  if (!zone_name.empty()) {
+    return cfgstore->read_zone_by_name(dpp, y, zone_name, info, writer);
+  }
+
+  std::string realm_id;
+  int r = cfgstore->read_default_realm_id(dpp, y, realm_id);
+  if (r == -ENOENT) {
+    // no default realm configured: use the default zone name instead
+    return cfgstore->read_zone_by_name(dpp, y, default_zone_name, info, writer);
+  }
+  if (r < 0) {
+    return r;
+  }
+  return cfgstore->read_default_zone(dpp, y, realm_id, info, writer);
+}
+
+// Collect the pool names used by every zone other than my_zone_id; defined
+// out-of-line. Used below to avoid pool-name collisions when creating a zone.
+extern int get_zones_pool_set(const DoutPrefixProvider *dpp, optional_yield y,
+                              rgw::sal::ConfigStore* cfgstore,
+                              std::string_view my_zone_id,
+                              std::set<rgw_pool>& pools);
+
+// Create a new zone. Requires info.name; generates a random uuid id when
+// none is given, seeds a "default-placement" target with the STANDARD
+// storage class, assigns zone-name-prefixed pool names that do not collide
+// with other zones' pools, then tries to mark the new zone as default
+// (without overriding an existing default).
+int create_zone(const DoutPrefixProvider* dpp, optional_yield y,
+                sal::ConfigStore* cfgstore, bool exclusive,
+                RGWZoneParams& info, std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  if (info.name.empty()) {
+    ldpp_dout(dpp, -1) << __func__ << " requires a zone name" << dendl;
+    return -EINVAL;
+  }
+  if (info.id.empty()) {
+    info.id = gen_random_uuid();
+  }
+
+  // add default placement with empty pool name
+  rgw_pool pool;
+  auto& placement = info.placement_pools["default-placement"];
+  placement.storage_classes.set_storage_class(
+      RGW_STORAGE_CLASS_STANDARD, &pool, nullptr);
+
+  // build a set of all pool names used by other zones
+  std::set<rgw_pool> pools;
+  int r = get_zones_pool_set(dpp, y, cfgstore, info.id, pools);
+  if (r < 0) {
+    return r;
+  }
+
+  // initialize pool names with the zone name prefix
+  r = init_zone_pool_names(dpp, y, pools, info);
+  if (r < 0) {
+    return r;
+  }
+
+  // NOTE(review): the 'writer' out-param is not forwarded here (nullptr is
+  // passed), so callers never receive a writer from create_zone — confirm
+  // this is intentional.
+  r = cfgstore->create_zone(dpp, y, exclusive, info, nullptr);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to create zone with "
+        << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // try to set as default. may race with another create, so pass exclusive=true
+  // so we don't override an existing default
+  r = set_default_zone(dpp, y, cfgstore, info, true);
+  if (r < 0 && r != -EEXIST) {
+    ldpp_dout(dpp, 0) << "WARNING: failed to set zone as default: "
+        << cpp_strerror(r) << dendl;
+  }
+
+  return 0;
+
+}
+
+// Record info.id as the default zone of its realm. With exclusive=true the
+// write does not replace an existing default (callers see -EEXIST, as
+// handled in create_zone above).
+int set_default_zone(const DoutPrefixProvider* dpp, optional_yield y,
+                     sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+                     bool exclusive)
+{
+  return cfgstore->write_default_zone_id(
+      dpp, y, exclusive, info.realm_id, info.id);
+}
+
+// Delete a zone: first drop its membership from every zonegroup that
+// contains it, then remove the zone object itself through its writer.
+int delete_zone(const DoutPrefixProvider* dpp, optional_yield y,
+                sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+                sal::ZoneWriter& writer)
+{
+  // remove this zone from any zonegroups that contain it
+  int r = remove_zone_from_groups(dpp, y, cfgstore, info.id);
+  if (r < 0) {
+    return r;
+  }
+
+  return writer.remove(dpp, y);
+}
+
+} // namespace rgw
+
+// Parse config[key] as a base-10 integer into *pval. A missing key leaves
+// *pval untouched and returns 0; an unparseable value returns -EINVAL.
+// NOTE(review): strict_strtoll() parses a signed value, so a negative input
+// would wrap when stored into uint64_t — confirm inputs are validated
+// upstream.
+static inline int conf_to_uint64(const JSONFormattable& config, const string& key, uint64_t *pval)
+{
+  string sval;
+  if (config.find(key, &sval)) {
+    string err;
+    uint64_t val = strict_strtoll(sval.c_str(), 10, &err);
+    if (!err.empty()) {
+      return -EINVAL;
+    }
+    *pval = val;
+  }
+  return 0;
+}
+
+// Apply tier settings from 'config': "retain_head_object" (the string
+// "true" enables it, anything else disables it) plus, for cloud-s3 tiers,
+// the S3 tier parameters.
+// NOTE(review): r starts at -1 and is only assigned in the cloud-s3 branch,
+// so any other tier_type returns -1 even when retain_head_object was
+// updated — presumably intended to reject tier params on unsupported tier
+// types; verify against callers.
+int RGWZoneGroupPlacementTier::update_params(const JSONFormattable& config)
+{
+  int r = -1;
+
+  if (config.exists("retain_head_object")) {
+    string s = config["retain_head_object"];
+    if (s == "true") {
+      retain_head_object = true;
+    } else {
+      retain_head_object = false;
+    }
+  }
+
+  if (tier_type == "cloud-s3") {
+    r = t.s3.update_params(config);
+  }
+
+  return r;
+}
+
+// Reset only the tier settings named in 'config' back to their defaults;
+// always returns 0.
+int RGWZoneGroupPlacementTier::clear_params(const JSONFormattable& config)
+{
+  if (config.exists("retain_head_object")) {
+    retain_head_object = false;
+  }
+
+  if (tier_type == "cloud-s3") {
+    t.s3.clear_params(config);
+  }
+
+  return 0;
+}
+
+// Apply cloud-s3 tier settings from 'config'. Only keys present in 'config'
+// are touched; always returns 0.
+int RGWZoneGroupPlacementTierS3::update_params(const JSONFormattable& config)
+{
+  int r = -1;
+
+  if (config.exists("endpoint")) {
+    endpoint = config["endpoint"];
+  }
+  if (config.exists("target_path")) {
+    target_path = config["target_path"];
+  }
+  if (config.exists("region")) {
+    region = config["region"];
+  }
+  if (config.exists("host_style")) {
+    // any value other than "virtual" selects path-style addressing
+    string s;
+    s = config["host_style"];
+    if (s != "virtual") {
+      host_style = PathStyle;
+    } else {
+      host_style = VirtualStyle;
+    }
+  }
+  if (config.exists("target_storage_class")) {
+    target_storage_class = config["target_storage_class"];
+  }
+  if (config.exists("access_key")) {
+    key.id = config["access_key"];
+  }
+  if (config.exists("secret")) {
+    key.key = config["secret"];
+  }
+  if (config.exists("multipart_sync_threshold")) {
+    // on a parse failure, fall back to the default part size
+    r = conf_to_uint64(config, "multipart_sync_threshold", &multipart_sync_threshold);
+    if (r < 0) {
+      multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+    }
+  }
+
+  if (config.exists("multipart_min_part_size")) {
+    r = conf_to_uint64(config, "multipart_min_part_size", &multipart_min_part_size);
+    if (r < 0) {
+      multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+    }
+  }
+
+  if (config.exists("acls")) {
+    // "acls" may be a single mapping object or an array of them;
+    // entries with an empty source_id are ignored
+    const JSONFormattable& cc = config["acls"];
+    if (cc.is_array()) {
+      for (auto& c : cc.array()) {
+        RGWTierACLMapping m;
+        m.init(c);
+        if (!m.source_id.empty()) {
+          acl_mappings[m.source_id] = m;
+        }
+      }
+    } else {
+      RGWTierACLMapping m;
+      m.init(cc);
+      if (!m.source_id.empty()) {
+        acl_mappings[m.source_id] = m;
+      }
+    }
+  }
+  return 0;
+}
+
+// Clear (reset to defaults) only the cloud-s3 tier settings named in
+// 'config'; always returns 0. Mirrors update_params() above.
+int RGWZoneGroupPlacementTierS3::clear_params(const JSONFormattable& config)
+{
+  if (config.exists("endpoint")) {
+    endpoint.clear();
+  }
+  if (config.exists("target_path")) {
+    target_path.clear();
+  }
+  if (config.exists("region")) {
+    region.clear();
+  }
+  if (config.exists("host_style")) {
+    /* default */
+    host_style = PathStyle;
+  }
+  if (config.exists("target_storage_class")) {
+    target_storage_class.clear();
+  }
+  if (config.exists("access_key")) {
+    key.id.clear();
+  }
+  if (config.exists("secret")) {
+    key.key.clear();
+  }
+  if (config.exists("multipart_sync_threshold")) {
+    multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+  }
+  if (config.exists("multipart_min_part_size")) {
+    multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+  }
+  if (config.exists("acls")) {
+    // mappings are erased by their source_id key, whether given as an
+    // array or as a single object
+    const JSONFormattable& cc = config["acls"];
+    if (cc.is_array()) {
+      for (auto& c : cc.array()) {
+        RGWTierACLMapping m;
+        m.init(c);
+        acl_mappings.erase(m.source_id);
+      }
+    } else {
+      RGWTierACLMapping m;
+      m.init(cc);
+      acl_mappings.erase(m.source_id);
+    }
+  }
+  return 0;
+}
+
+// ceph-dencoder test instances: one populated, one default-constructed
+void rgw_meta_sync_info::generate_test_instances(list<rgw_meta_sync_info*>& o)
+{
+  auto info = new rgw_meta_sync_info;
+  info->state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
+  info->period = "periodid";
+  info->realm_epoch = 5;
+  o.push_back(info);
+  o.push_back(new rgw_meta_sync_info);
+}
+
+// ceph-dencoder test instances: one populated, one default-constructed
+void rgw_meta_sync_marker::generate_test_instances(list<rgw_meta_sync_marker*>& o)
+{
+  auto marker = new rgw_meta_sync_marker;
+  marker->state = rgw_meta_sync_marker::IncrementalSync;
+  marker->marker = "01234";
+  marker->realm_epoch = 5;
+  o.push_back(marker);
+  o.push_back(new rgw_meta_sync_marker);
+}
+
+// ceph-dencoder test instance: default-constructed only
+void rgw_meta_sync_status::generate_test_instances(list<rgw_meta_sync_status*>& o)
+{
+  o.push_back(new rgw_meta_sync_status);
+}
+
+// ceph-dencoder test instances: two default-constructed objects
+void RGWZoneParams::generate_test_instances(list<RGWZoneParams*> &o)
+{
+  o.push_back(new RGWZoneParams);
+  o.push_back(new RGWZoneParams);
+}
+
+// ceph-dencoder test instances: two default-constructed objects
+void RGWPeriodLatestEpochInfo::generate_test_instances(list<RGWPeriodLatestEpochInfo*> &o)
+{
+  RGWPeriodLatestEpochInfo *z = new RGWPeriodLatestEpochInfo;
+  o.push_back(z);
+  o.push_back(new RGWPeriodLatestEpochInfo);
+}
+
+// ceph-dencoder test instances: two default-constructed objects
+void RGWZoneGroup::generate_test_instances(list<RGWZoneGroup*>& o)
+{
+  RGWZoneGroup *r = new RGWZoneGroup;
+  o.push_back(r);
+  o.push_back(new RGWZoneGroup);
+}
+
+// JSON output: the single latest_epoch field
+void RGWPeriodLatestEpochInfo::dump(Formatter *f) const {
+  encode_json("latest_epoch", epoch, f);
+}
+
+// JSON input: the single latest_epoch field
+void RGWPeriodLatestEpochInfo::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("latest_epoch", epoch, obj);
+}
+
+// JSON output: the single obj_id field
+void RGWNameToId::dump(Formatter *f) const {
+  encode_json("obj_id", obj_id, f);
+}
+
+// JSON input: the single obj_id field
+void RGWNameToId::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("obj_id", obj_id, obj);
+}
+
diff --git a/src/rgw/driver/rados/rgw_zone.h b/src/rgw/driver/rados/rgw_zone.h
new file mode 100644 (file)
index 0000000..e1792a4
--- /dev/null
@@ -0,0 +1,1525 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_ZONE_H
+#define CEPH_RGW_ZONE_H
+
+#include <ostream>
+#include "rgw_common.h"
+#include "rgw_sal_fwd.h"
+#include "rgw_sync_policy.h"
+#include "rgw_zone_features.h"
+
+// Default object-name prefixes and pool names used to locate zone,
+// zonegroup, realm and period metadata in rados; values are defined
+// out-of-line.
+namespace rgw_zone_defaults {
+
+extern std::string zone_names_oid_prefix;
+extern std::string region_info_oid_prefix;
+extern std::string realm_names_oid_prefix;
+extern std::string zone_group_info_oid_prefix;
+extern std::string realm_info_oid_prefix;
+extern std::string default_region_info_oid;
+extern std::string default_zone_group_info_oid;
+extern std::string region_map_oid;
+extern std::string default_realm_info_oid;
+extern std::string default_zonegroup_name;
+extern std::string default_zone_name;
+extern std::string zonegroup_names_oid_prefix;
+extern std::string RGW_DEFAULT_ZONE_ROOT_POOL;
+extern std::string RGW_DEFAULT_ZONEGROUP_ROOT_POOL;
+extern std::string RGW_DEFAULT_REALM_ROOT_POOL;
+extern std::string RGW_DEFAULT_PERIOD_ROOT_POOL;
+extern std::string avail_pools;
+extern std::string default_storage_pool_suffix;
+
+}
+
+class JSONObj;
+class RGWSyncModulesManager;
+
+
+// Persistent name -> object-id mapping record (versioned encoding, v1).
+struct RGWNameToId {
+  std::string obj_id;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(obj_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(obj_id, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWNameToId)
+
+// Record holding the id of the current "default" system metadata object;
+// read by RGWSystemMetaObj::read_default() below.
+struct RGWDefaultSystemMetaObjInfo {
+  std::string default_id;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(default_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(default_id, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWDefaultSystemMetaObjInfo)
+
+class RGWSI_SysObj;
+class RGWSI_Zone;
+
+// Common base for (id, name)-keyed system metadata objects stored in rados
+// (e.g. RGWZoneParams below). Provides storage, default-object tracking,
+// rename, and versioned encode/decode of the id/name pair; subclasses
+// supply pool/oid naming via the pure virtuals.
+class RGWSystemMetaObj {
+public:
+  // NOTE(review): this first section exposes internals (service pointers and
+  // low-level read/store helpers) publicly even though a second 'public:'
+  // section follows — possibly intended to be protected; confirm before
+  // narrowing access.
+  std::string id;
+  std::string name;
+
+  CephContext *cct{nullptr};
+  RGWSI_SysObj *sysobj_svc{nullptr};
+  RGWSI_Zone *zone_svc{nullptr};
+
+  int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+  int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+  int read_info(const DoutPrefixProvider *dpp, const std::string& obj_id, optional_yield y, bool old_format = false);
+  int read_id(const DoutPrefixProvider *dpp, const std::string& obj_name, std::string& obj_id, optional_yield y);
+  int read_default(const DoutPrefixProvider *dpp, 
+                   RGWDefaultSystemMetaObjInfo& default_info,
+                  const std::string& oid,
+                  optional_yield y);
+  /* read and use default id */
+  int use_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+
+public:
+  RGWSystemMetaObj() {}
+  RGWSystemMetaObj(const std::string& _name): name(_name) {}
+  RGWSystemMetaObj(const std::string& _id, const std::string& _name) : id(_id), name(_name) {}
+  RGWSystemMetaObj(CephContext *_cct, RGWSI_SysObj *_sysobj_svc) {
+    reinit_instance(_cct, _sysobj_svc);
+  }
+  RGWSystemMetaObj(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): name(_name) {
+    reinit_instance(_cct, _sysobj_svc);
+  }
+
+  const std::string& get_name() const { return name; }
+  const std::string& get_id() const { return id; }
+
+  void set_name(const std::string& _name) { name = _name;}
+  void set_id(const std::string& _id) { id = _id;}
+  void clear_id() { id.clear(); }
+
+  virtual ~RGWSystemMetaObj() {}
+
+  // wire format v1: just the (id, name) pair; subclasses embed this inside
+  // their own versioned encoding
+  virtual void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(id, bl);
+    encode(name, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  virtual void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(id, bl);
+    decode(name, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void reinit_instance(CephContext *_cct, RGWSI_SysObj *_sysobj_svc);
+  int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc,
+          optional_yield y,
+          bool setup_obj = true, bool old_format = false);
+  virtual int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y,
+                             bool old_format = false);
+  virtual int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false);
+  int delete_default();
+  virtual int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true);
+  int delete_obj(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+  int rename(const DoutPrefixProvider *dpp, const std::string& new_name, optional_yield y);
+  int update(const DoutPrefixProvider *dpp, optional_yield y) { return store_info(dpp, false, y);}
+  int update_name(const DoutPrefixProvider *dpp, optional_yield y) { return store_name(dpp, false, y);}
+  int read(const DoutPrefixProvider *dpp, optional_yield y);
+  int write(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+
+  // naming hooks implemented by each concrete metadata type
+  virtual rgw_pool get_pool(CephContext *cct) const = 0;
+  virtual const std::string get_default_oid(bool old_format = false) const = 0;
+  virtual const std::string& get_names_oid_prefix() const = 0;
+  virtual const std::string& get_info_oid_prefix(bool old_format = false) const = 0;
+  virtual std::string get_predefined_id(CephContext *cct) const = 0;
+  virtual const std::string& get_predefined_name(CephContext *cct) const = 0;
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWSystemMetaObj)
+
+// Per-storage-class settings; both fields are optional (unset means
+// "inherit/none"). Versioned encoding, v1.
+struct RGWZoneStorageClass {
+  boost::optional<rgw_pool> data_pool;
+  boost::optional<std::string> compression_type;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(data_pool, bl);
+    encode(compression_type, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(data_pool, bl);
+    decode(compression_type, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneStorageClass)
+
+
+// Map of storage-class name -> settings. The STANDARD entry always exists:
+// every constructor (and decode) creates it via operator[] and caches a raw
+// pointer to it in standard_class. std::map node stability keeps that
+// pointer valid across inserts/erases of other entries; copy/assign re-bind
+// it into the new map.
+class RGWZoneStorageClasses {
+  std::map<std::string, RGWZoneStorageClass> m;
+
+  /* in memory only */
+  RGWZoneStorageClass *standard_class;
+
+public:
+  RGWZoneStorageClasses() {
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+  }
+  RGWZoneStorageClasses(const RGWZoneStorageClasses& rhs) {
+    m = rhs.m;
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+  }
+  RGWZoneStorageClasses& operator=(const RGWZoneStorageClasses& rhs) {
+    m = rhs.m;
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+    return *this;
+  }
+
+  const RGWZoneStorageClass& get_standard() const {
+    return *standard_class;
+  }
+
+  bool find(const std::string& sc, const RGWZoneStorageClass **pstorage_class) const {
+    auto iter = m.find(sc);
+    if (iter == m.end()) {
+      return false;
+    }
+    *pstorage_class = &iter->second;
+    return true;
+  }
+
+  // an empty name is treated as STANDARD, which always exists
+  bool exists(const std::string& sc) const {
+    if (sc.empty()) {
+      return true;
+    }
+    auto iter = m.find(sc);
+    return (iter != m.end());
+  }
+
+  const std::map<std::string, RGWZoneStorageClass>& get_all() const {
+    return m;
+  }
+
+  std::map<std::string, RGWZoneStorageClass>& get_all() {
+    return m;
+  }
+
+  // empty name maps to STANDARD; only the fields passed non-null are set
+  void set_storage_class(const std::string& sc, const rgw_pool *data_pool, const std::string *compression_type) {
+    const std::string *psc = &sc;
+    if (sc.empty()) {
+      psc = &RGW_STORAGE_CLASS_STANDARD;
+    }
+    RGWZoneStorageClass& storage_class = m[*psc];
+    if (data_pool) {
+      storage_class.data_pool = *data_pool;
+    }
+    if (compression_type) {
+      storage_class.compression_type = *compression_type;
+    }
+  }
+
+  // NOTE(review): only guards against the empty name — erasing the literal
+  // STANDARD name would leave standard_class dangling; presumably callers
+  // never do that, but a guard may be worth adding.
+  void remove_storage_class(const std::string& sc) {
+    if (!sc.empty()) {
+      m.erase(sc);
+    }
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(m, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(m, bl);
+    // re-bind the cached pointer after the map was replaced
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneStorageClasses)
+
+// Per-zone placement target: index/extra pools plus the storage-class map.
+// Encoding is at v7 but still writes the legacy standard data-pool and
+// compression fields so that older decoders (pre-v7) keep working.
+struct RGWZonePlacementInfo {
+  rgw_pool index_pool;
+  rgw_pool data_extra_pool; /* if not set we should use data_pool */
+  RGWZoneStorageClasses storage_classes;
+  rgw::BucketIndexType index_type;
+
+  RGWZonePlacementInfo() : index_type(rgw::BucketIndexType::Normal) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(7, 1, bl);
+    encode(index_pool.to_str(), bl);
+    // legacy fields kept for pre-v7 decoders
+    rgw_pool standard_data_pool = get_data_pool(RGW_STORAGE_CLASS_STANDARD);
+    encode(standard_data_pool.to_str(), bl);
+    encode(data_extra_pool.to_str(), bl);
+    encode((uint32_t)index_type, bl);
+    std::string standard_compression_type = get_compression_type(RGW_STORAGE_CLASS_STANDARD);
+    encode(standard_compression_type, bl);
+    encode(storage_classes, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(7, bl);
+    std::string index_pool_str;
+    std::string data_pool_str;
+    decode(index_pool_str, bl);
+    index_pool = rgw_pool(index_pool_str);
+    decode(data_pool_str, bl);
+    rgw_pool standard_data_pool(data_pool_str);
+    if (struct_v >= 4) {
+      std::string data_extra_pool_str;
+      decode(data_extra_pool_str, bl);
+      data_extra_pool = rgw_pool(data_extra_pool_str);
+    }
+    if (struct_v >= 5) {
+      uint32_t it;
+      decode(it, bl);
+      index_type = (rgw::BucketIndexType)it;
+    }
+    std::string standard_compression_type;
+    if (struct_v >= 6) {
+      decode(standard_compression_type, bl);
+    }
+    if (struct_v >= 7) {
+      decode(storage_classes, bl);
+    } else {
+      // pre-v7: rebuild the storage-class map from the legacy fields
+      storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &standard_data_pool,
+                                        (!standard_compression_type.empty() ? &standard_compression_type : nullptr));
+    }
+    DECODE_FINISH(bl);
+  }
+  // falls back to the STANDARD class's data pool when unset
+  const rgw_pool& get_data_extra_pool() const {
+    static rgw_pool no_pool;
+    if (data_extra_pool.empty()) {
+      return storage_classes.get_standard().data_pool.get_value_or(no_pool);
+    }
+    return data_extra_pool;
+  }
+  // unknown storage classes fall back to STANDARD's data pool
+  const rgw_pool& get_data_pool(const std::string& sc) const {
+    const RGWZoneStorageClass *storage_class;
+    static rgw_pool no_pool;
+
+    if (!storage_classes.find(sc, &storage_class)) {
+      return storage_classes.get_standard().data_pool.get_value_or(no_pool);
+    }
+
+    return storage_class->data_pool.get_value_or(no_pool);
+  }
+  const rgw_pool& get_standard_data_pool() const {
+    return get_data_pool(RGW_STORAGE_CLASS_STANDARD);
+  }
+
+  // empty string when the class is unknown or has no compression set
+  const std::string& get_compression_type(const std::string& sc) const {
+    const RGWZoneStorageClass *storage_class;
+    static std::string no_compression;
+
+    if (!storage_classes.find(sc, &storage_class)) {
+      return no_compression;
+    }
+    return storage_class->compression_type.get_value_or(no_compression);
+  }
+
+  bool storage_class_exists(const std::string& sc) const {
+    return storage_classes.exists(sc);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+
+};
+WRITE_CLASS_ENCODER(RGWZonePlacementInfo)
+
+// Per-zone configuration: the rados pools a zone uses, its system key,
+// placement targets, realm binding and tier config. Versioned encoding at
+// v14; decode derives defaults for fields added after the object was
+// originally written.
+struct RGWZoneParams : RGWSystemMetaObj {
+  rgw_pool domain_root;
+  rgw_pool control_pool;
+  rgw_pool gc_pool;
+  rgw_pool lc_pool;
+  rgw_pool log_pool;
+  rgw_pool intent_log_pool;
+  rgw_pool usage_log_pool;
+  rgw_pool user_keys_pool;
+  rgw_pool user_email_pool;
+  rgw_pool user_swift_pool;
+  rgw_pool user_uid_pool;
+  rgw_pool roles_pool;
+  rgw_pool reshard_pool;
+  rgw_pool otp_pool;
+  rgw_pool oidc_pool;
+  rgw_pool notif_pool;
+
+  RGWAccessKey system_key;
+
+  std::map<std::string, RGWZonePlacementInfo> placement_pools;
+
+  std::string realm_id;
+
+  JSONFormattable tier_config;
+
+  RGWZoneParams() : RGWSystemMetaObj() {}
+  explicit RGWZoneParams(const std::string& name) : RGWSystemMetaObj(name){}
+  RGWZoneParams(const rgw_zone_id& id, const std::string& name) : RGWSystemMetaObj(id.id, name) {}
+  RGWZoneParams(const rgw_zone_id& id, const std::string& name, const std::string& _realm_id)
+    : RGWSystemMetaObj(id.id, name), realm_id(_realm_id) {}
+  virtual ~RGWZoneParams();
+
+  rgw_pool get_pool(CephContext *cct) const override;
+  const std::string get_default_oid(bool old_format = false) const override;
+  const std::string& get_names_oid_prefix() const override;
+  const std::string& get_info_oid_prefix(bool old_format = false) const override;
+  std::string get_predefined_id(CephContext *cct) const override;
+  const std::string& get_predefined_name(CephContext *cct) const override;
+
+  int init(const DoutPrefixProvider *dpp, 
+           CephContext *_cct, RGWSI_SysObj *_sysobj_svc, optional_yield y,
+          bool setup_obj = true, bool old_format = false);
+  using RGWSystemMetaObj::init;
+  int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y, bool old_format = false) override;
+  int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false) override;
+  int create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+  int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true) override;
+  int fix_pool_names(const DoutPrefixProvider *dpp, optional_yield y);
+
+  const std::string& get_compression_type(const rgw_placement_rule& placement_rule) const;
+  
+  // field order must stay in lock-step with decode() below; retired fields
+  // (metadata heap, old tier config map) are still written as placeholders
+  void encode(bufferlist& bl) const override {
+    ENCODE_START(14, 1, bl);
+    encode(domain_root, bl);
+    encode(control_pool, bl);
+    encode(gc_pool, bl);
+    encode(log_pool, bl);
+    encode(intent_log_pool, bl);
+    encode(usage_log_pool, bl);
+    encode(user_keys_pool, bl);
+    encode(user_email_pool, bl);
+    encode(user_swift_pool, bl);
+    encode(user_uid_pool, bl);
+    RGWSystemMetaObj::encode(bl);
+    encode(system_key, bl);
+    encode(placement_pools, bl);
+    rgw_pool unused_metadata_heap;
+    encode(unused_metadata_heap, bl);
+    encode(realm_id, bl);
+    encode(lc_pool, bl);
+    std::map<std::string, std::string, ltstr_nocase> old_tier_config;
+    encode(old_tier_config, bl);
+    encode(roles_pool, bl);
+    encode(reshard_pool, bl);
+    encode(otp_pool, bl);
+    encode(tier_config, bl);
+    encode(oidc_pool, bl);
+    encode(notif_pool, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // fields missing from older encodings get defaults derived from the
+  // zone name or log pool
+  void decode(bufferlist::const_iterator& bl) override {
+    DECODE_START(14, bl);
+    decode(domain_root, bl);
+    decode(control_pool, bl);
+    decode(gc_pool, bl);
+    decode(log_pool, bl);
+    decode(intent_log_pool, bl);
+    decode(usage_log_pool, bl);
+    decode(user_keys_pool, bl);
+    decode(user_email_pool, bl);
+    decode(user_swift_pool, bl);
+    decode(user_uid_pool, bl);
+    if (struct_v >= 6) {
+      RGWSystemMetaObj::decode(bl);
+    } else if (struct_v >= 2) {
+      // pre-v6 objects only carried a name; reuse it as the id
+      decode(name, bl);
+      id = name;
+    }
+    if (struct_v >= 3)
+      decode(system_key, bl);
+    if (struct_v >= 4)
+      decode(placement_pools, bl);
+    if (struct_v >= 5) {
+      rgw_pool unused_metadata_heap;
+      decode(unused_metadata_heap, bl);
+    }
+    if (struct_v >= 6) {
+      decode(realm_id, bl);
+    }
+    if (struct_v >= 7) {
+      decode(lc_pool, bl);
+    } else {
+      lc_pool = log_pool.name + ":lc";
+    }
+    std::map<std::string, std::string, ltstr_nocase> old_tier_config;
+    if (struct_v >= 8) {
+      decode(old_tier_config, bl);
+    }
+    if (struct_v >= 9) {
+      decode(roles_pool, bl);
+    } else {
+      roles_pool = name + ".rgw.meta:roles";
+    }
+    if (struct_v >= 10) {
+      decode(reshard_pool, bl);
+    } else {
+      reshard_pool = log_pool.name + ":reshard";
+    }
+    if (struct_v >= 11) {
+      ::decode(otp_pool, bl);
+    } else {
+      otp_pool = name + ".rgw.otp";
+    }
+    if (struct_v >= 12) {
+      ::decode(tier_config, bl);
+    } else {
+      // migrate the legacy flat tier-config map into the JSON form
+      for (auto& kv : old_tier_config) {
+        tier_config.set(kv.first, kv.second);
+      }
+    }
+    if (struct_v >= 13) {
+      ::decode(oidc_pool, bl);
+    } else {
+      oidc_pool = name + ".rgw.meta:oidc";
+    }
+    if (struct_v >= 14) {
+      decode(notif_pool, bl);
+    } else {
+      notif_pool = log_pool.name + ":notif";
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<RGWZoneParams*>& o);
+
+  bool get_placement(const std::string& placement_id, RGWZonePlacementInfo *placement) const {
+    auto iter = placement_pools.find(placement_id);
+    if (iter == placement_pools.end()) {
+      return false;
+    }
+    *placement = iter->second;
+    return true;
+  }
+
+  /*
+   * return data pool of the head object
+   */
+  bool get_head_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool) const {
+    // a bucket's explicit placement (if any) wins over the placement rule
+    const rgw_data_placement_target& explicit_placement = obj.bucket.explicit_placement;
+    if (!explicit_placement.data_pool.empty()) {
+      if (!obj.in_extra_data) {
+        *pool = explicit_placement.data_pool;
+      } else {
+        *pool = explicit_placement.get_data_extra_pool();
+      }
+      return true;
+    }
+    if (placement_rule.empty()) {
+      return false;
+    }
+    auto iter = placement_pools.find(placement_rule.name);
+    if (iter == placement_pools.end()) {
+      return false;
+    }
+    if (!obj.in_extra_data) {
+      *pool = iter->second.get_data_pool(placement_rule.storage_class);
+    } else {
+      *pool = iter->second.get_data_extra_pool();
+    }
+    return true;
+  }
+
+  bool valid_placement(const rgw_placement_rule& rule) const {
+    auto iter = placement_pools.find(rule.name);
+    if (iter == placement_pools.end()) {
+      return false;
+    }
+    return iter->second.storage_class_exists(rule.storage_class);
+  }
+};
+WRITE_CLASS_ENCODER(RGWZoneParams)
+
+
+// Per-zone entry as stored inside a zonegroup's zones map (see
+// remove_zone_from_group usage): identity, endpoints, log/sync flags and
+// supported feature set. Versioned encoding at v8.
+struct RGWZone {
+  std::string id;
+  std::string name;
+  std::list<std::string> endpoints;
+  bool log_meta;
+  bool log_data;
+  bool read_only;
+  std::string tier_type;
+
+  std::string redirect_zone;
+
+/**
+ * Represents the number of shards for the bucket index object, a value of zero
+ * indicates there is no sharding. By default (no sharding, the name of the object
+ * is '.dir.{marker}', with sharding, the name is '.dir.{marker}.{sharding_id}',
+ * sharding_id is zero-based value. It is not recommended to set a too large value
+ * (e.g. thousand) as it increases the cost for bucket listing.
+ */
+  uint32_t bucket_index_max_shards;
+
+  // pre-shard buckets on creation to enable some write-parallism by default,
+  // delay the need to reshard as the bucket grows, and (in multisite) get some
+  // bucket index sharding where dynamic resharding is not supported
+  static constexpr uint32_t default_bucket_index_max_shards = 11;
+
+  bool sync_from_all;
+  std::set<std::string> sync_from; /* list of zones to sync from */
+
+  rgw::zone_features::set supported_features;
+
+  RGWZone()
+    : log_meta(false), log_data(false), read_only(false),
+      bucket_index_max_shards(default_bucket_index_max_shards),
+      sync_from_all(true) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(8, 1, bl);
+    encode(name, bl);
+    encode(endpoints, bl);
+    encode(log_meta, bl);
+    encode(log_data, bl);
+    encode(bucket_index_max_shards, bl);
+    encode(id, bl);
+    encode(read_only, bl);
+    encode(tier_type, bl);
+    encode(sync_from_all, bl);
+    encode(sync_from, bl);
+    encode(redirect_zone, bl);
+    encode(supported_features, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(8, bl);
+    decode(name, bl);
+    if (struct_v < 4) {
+      // pre-v4 zones had no separate id; the name doubled as the id
+      id = name;
+    }
+    decode(endpoints, bl);
+    if (struct_v >= 2) {
+      decode(log_meta, bl);
+      decode(log_data, bl);
+    }
+    if (struct_v >= 3) {
+      decode(bucket_index_max_shards, bl);
+    }
+    if (struct_v >= 4) {
+      decode(id, bl);
+      decode(read_only, bl);
+    }
+    if (struct_v >= 5) {
+      decode(tier_type, bl);
+    }
+    if (struct_v >= 6) {
+      decode(sync_from_all, bl);
+      decode(sync_from, bl);
+    }
+    if (struct_v >= 7) {
+      decode(redirect_zone, bl);
+    }
+    if (struct_v >= 8) {
+      decode(supported_features, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<RGWZone*>& o);
+
+  bool is_read_only() const { return read_only; }
+
+  // true when syncing from all zones, or zone_name is explicitly listed
+  bool syncs_from(const std::string& zone_name) const {
+    return (sync_from_all || sync_from.find(zone_name) != sync_from.end());
+  }
+
+  bool supports(std::string_view feature) const {
+    return supported_features.contains(feature);
+  }
+};
+WRITE_CLASS_ENCODER(RGWZone)
+
+// Record holding the name of the default zonegroup. Versioned encoding, v1.
+struct RGWDefaultZoneGroupInfo {
+  std::string default_zonegroup;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(default_zonegroup, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(default_zonegroup, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  //todo: implement ceph-dencoder
+};
+WRITE_CLASS_ENCODER(RGWDefaultZoneGroupInfo)
+
+// Maps a source grantee id to a destination id for cloud tiering ACL
+// translation. The grantee type is parsed from config["type"]: "email" ->
+// email user, "uri" -> group, anything else -> canonical user.
+struct RGWTierACLMapping {
+  ACLGranteeTypeEnum type{ACL_TYPE_CANON_USER};
+  std::string source_id;
+  std::string dest_id;
+
+  RGWTierACLMapping() = default;
+
+  RGWTierACLMapping(ACLGranteeTypeEnum t,
+             const std::string& s,
+             const std::string& d) : type(t),
+  source_id(s),
+  dest_id(d) {}
+
+  void init(const JSONFormattable& config) {
+    const std::string& t = config["type"];
+
+    if (t == "email") {
+      type = ACL_TYPE_EMAIL_USER;
+    } else if (t == "uri") {
+      type = ACL_TYPE_GROUP;
+    } else {
+      type = ACL_TYPE_CANON_USER;
+    }
+
+    source_id = config["source_id"];
+    dest_id = config["dest_id"];
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)type, bl);
+    encode(source_id, bl);
+    encode(dest_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    uint32_t it;
+    decode(it, bl);
+    type = (ACLGranteeTypeEnum)it;
+    decode(source_id, bl);
+    decode(dest_id, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWTierACLMapping)
+
+struct RGWZoneGroupPlacementTierS3 {
+#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
+  std::string endpoint;
+  RGWAccessKey key;
+  std::string region;
+  HostStyle host_style{PathStyle};
+  std::string target_storage_class;
+
+  /* Should below be bucket/zone specific?? */
+  std::string target_path;
+  std::map<std::string, RGWTierACLMapping> acl_mappings;
+
+  uint64_t multipart_sync_threshold{DEFAULT_MULTIPART_SYNC_PART_SIZE};
+  uint64_t multipart_min_part_size{DEFAULT_MULTIPART_SYNC_PART_SIZE};
+
+  int update_params(const JSONFormattable& config);
+  int clear_params(const JSONFormattable& config);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(endpoint, bl);
+    encode(key, bl);
+    encode(region, bl);
+    encode((uint32_t)host_style, bl);
+    encode(target_storage_class, bl);
+    encode(target_path, bl);
+    encode(acl_mappings, bl);
+    encode(multipart_sync_threshold, bl);
+    encode(multipart_min_part_size, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(endpoint, bl);
+    decode(key, bl);
+    decode(region, bl);
+
+    uint32_t it;
+    decode(it, bl);
+    host_style = (HostStyle)it;
+
+    decode(target_storage_class, bl);
+    decode(target_path, bl);
+    decode(acl_mappings, bl);
+    decode(multipart_sync_threshold, bl);
+    decode(multipart_min_part_size, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTierS3)
+
+struct RGWZoneGroupPlacementTier {
+  std::string tier_type;
+  std::string storage_class;
+  bool retain_head_object = false;
+
+  struct _tier {
+    RGWZoneGroupPlacementTierS3 s3;
+  } t;
+
+  int update_params(const JSONFormattable& config);
+  int clear_params(const JSONFormattable& config);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(tier_type, bl);
+    encode(storage_class, bl);
+    encode(retain_head_object, bl);
+    if (tier_type == "cloud-s3") {
+      encode(t.s3, bl);
+    }
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(tier_type, bl);
+    decode(storage_class, bl);
+    decode(retain_head_object, bl);
+    if (tier_type == "cloud-s3") {
+      decode(t.s3, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTier)
+
+struct RGWZoneGroupPlacementTarget {
+  std::string name;
+  std::set<std::string> tags;
+  std::set<std::string> storage_classes;
+  std::map<std::string, RGWZoneGroupPlacementTier> tier_targets;
+
+  bool user_permitted(const std::list<std::string>& user_tags) const {
+    if (tags.empty()) {
+      return true;
+    }
+    for (auto& rule : user_tags) {
+      if (tags.find(rule) != tags.end()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(3, 1, bl);
+    encode(name, bl);
+    encode(tags, bl);
+    encode(storage_classes, bl);
+    encode(tier_targets, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(3, bl);
+    decode(name, bl);
+    decode(tags, bl);
+    if (struct_v >= 2) {
+      decode(storage_classes, bl);
+    }
+    if (storage_classes.empty()) {
+      storage_classes.insert(RGW_STORAGE_CLASS_STANDARD);
+    }
+    if (struct_v >= 3) {
+      decode(tier_targets, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTarget)
+
+struct RGWZoneGroup : public RGWSystemMetaObj {
+  std::string api_name;
+  std::list<std::string> endpoints;
+  bool is_master = false;
+
+  rgw_zone_id master_zone;
+  std::map<rgw_zone_id, RGWZone> zones;
+
+  std::map<std::string, RGWZoneGroupPlacementTarget> placement_targets;
+  rgw_placement_rule default_placement;
+
+  std::list<std::string> hostnames;
+  std::list<std::string> hostnames_s3website;
+  // TODO: Maybe convert hostnames to a map<std::string,std::list<std::string>> for
+  // endpoint_type->hostnames
+/*
+20:05 < _robbat21irssi> maybe I do someting like: if (hostname_map.empty()) { populate all map keys from hostnames; };
+20:05 < _robbat21irssi> but that's a later compatability migration planning bit
+20:06 < yehudasa> more like if (!hostnames.empty()) {
+20:06 < yehudasa> for (std::list<std::string>::iterator iter = hostnames.begin(); iter != hostnames.end(); ++iter) {
+20:06 < yehudasa> hostname_map["s3"].append(iter->second);
+20:07 < yehudasa> hostname_map["s3website"].append(iter->second);
+20:07 < yehudasa> s/append/push_back/g
+20:08 < _robbat21irssi> inner loop over APIs
+20:08 < yehudasa> yeah, probably
+20:08 < _robbat21irssi> s3, s3website, swift, swith_auth, swift_website
+*/
+  std::map<std::string, std::list<std::string> > api_hostname_map;
+  std::map<std::string, std::list<std::string> > api_endpoints_map;
+
+  std::string realm_id;
+
+  rgw_sync_policy_info sync_policy;
+  rgw::zone_features::set enabled_features;
+
+  RGWZoneGroup(): is_master(false){}
+  RGWZoneGroup(const std::string &id, const std::string &name):RGWSystemMetaObj(id, name) {}
+  explicit RGWZoneGroup(const std::string &_name):RGWSystemMetaObj(_name) {}
+  RGWZoneGroup(const std::string &_name, bool _is_master, CephContext *cct, RGWSI_SysObj* sysobj_svc,
+              const std::string& _realm_id, const std::list<std::string>& _endpoints)
+    : RGWSystemMetaObj(_name, cct , sysobj_svc), endpoints(_endpoints), is_master(_is_master),
+      realm_id(_realm_id) {}
+  virtual ~RGWZoneGroup();
+
+  bool is_master_zonegroup() const { return is_master;}
+  void update_master(const DoutPrefixProvider *dpp, bool _is_master, optional_yield y) {
+    is_master = _is_master;
+    post_process_params(dpp, y);
+  }
+  void post_process_params(const DoutPrefixProvider *dpp, optional_yield y);
+
+  void encode(bufferlist& bl) const override {
+    ENCODE_START(6, 1, bl);
+    encode(name, bl);
+    encode(api_name, bl);
+    encode(is_master, bl);
+    encode(endpoints, bl);
+    encode(master_zone, bl);
+    encode(zones, bl);
+    encode(placement_targets, bl);
+    encode(default_placement, bl);
+    encode(hostnames, bl);
+    encode(hostnames_s3website, bl);
+    RGWSystemMetaObj::encode(bl);
+    encode(realm_id, bl);
+    encode(sync_policy, bl);
+    encode(enabled_features, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) override {
+    DECODE_START(6, bl);
+    decode(name, bl);
+    decode(api_name, bl);
+    decode(is_master, bl);
+    decode(endpoints, bl);
+    decode(master_zone, bl);
+    decode(zones, bl);
+    decode(placement_targets, bl);
+    decode(default_placement, bl);
+    if (struct_v >= 2) {
+      decode(hostnames, bl);
+    }
+    if (struct_v >= 3) {
+      decode(hostnames_s3website, bl);
+    }
+    if (struct_v >= 4) {
+      RGWSystemMetaObj::decode(bl);
+      decode(realm_id, bl);
+    } else {
+      id = name;
+    }
+    if (struct_v >= 5) {
+      decode(sync_policy, bl);
+    }
+    if (struct_v >= 6) {
+      decode(enabled_features, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y, bool old_format = false) override;
+  int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false) override;
+  int create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+  int equals(const std::string& other_zonegroup) const;
+  int add_zone(const DoutPrefixProvider *dpp, 
+               const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
+               const std::list<std::string>& endpoints, const std::string *ptier_type,
+               bool *psync_from_all, std::list<std::string>& sync_from,
+               std::list<std::string>& sync_from_rm, std::string *predirect_zone,
+               std::optional<int> bucket_index_max_shards, RGWSyncModulesManager *sync_mgr,
+               const rgw::zone_features::set& enable_features,
+               const rgw::zone_features::set& disable_features,
+              optional_yield y);
+  int remove_zone(const DoutPrefixProvider *dpp, const std::string& zone_id, optional_yield y);
+  int rename_zone(const DoutPrefixProvider *dpp, const RGWZoneParams& zone_params, optional_yield y);
+  rgw_pool get_pool(CephContext *cct) const override;
+  const std::string get_default_oid(bool old_region_format = false) const override;
+  const std::string& get_info_oid_prefix(bool old_region_format = false) const override;
+  const std::string& get_names_oid_prefix() const override;
+  std::string get_predefined_id(CephContext *cct) const override;
+  const std::string& get_predefined_name(CephContext *cct) const override;
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<RGWZoneGroup*>& o);
+
+  bool supports(std::string_view feature) const {
+    return enabled_features.contains(feature);
+  }
+};
+WRITE_CLASS_ENCODER(RGWZoneGroup)
+
+struct RGWPeriodMap
+{
+  std::string id;
+  std::map<std::string, RGWZoneGroup> zonegroups;
+  std::map<std::string, RGWZoneGroup> zonegroups_by_api;
+  std::map<std::string, uint32_t> short_zone_ids;
+
+  std::string master_zonegroup;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+
+  int update(const RGWZoneGroup& zonegroup, CephContext *cct);
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+
+  void reset() {
+    zonegroups.clear();
+    zonegroups_by_api.clear();
+    master_zonegroup.clear();
+  }
+
+  uint32_t get_zone_short_id(const std::string& zone_id) const;
+
+  bool find_zone_by_id(const rgw_zone_id& zone_id,
+                       RGWZoneGroup *zonegroup,
+                       RGWZone *zone) const;
+  bool find_zone_by_name(const std::string& zone_id,
+                       RGWZoneGroup *zonegroup,
+                       RGWZone *zone) const;
+};
+WRITE_CLASS_ENCODER(RGWPeriodMap)
+
+struct RGWPeriodConfig
+{
+  RGWQuota quota;
+  RGWRateLimitInfo user_ratelimit;
+  RGWRateLimitInfo bucket_ratelimit;
+  // rate limit unauthenticated user
+  RGWRateLimitInfo anon_ratelimit;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(quota.bucket_quota, bl);
+    encode(quota.user_quota, bl);
+    encode(bucket_ratelimit, bl);
+    encode(user_ratelimit, bl);
+    encode(anon_ratelimit, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(quota.bucket_quota, bl);
+    decode(quota.user_quota, bl);
+    if (struct_v >= 2) {
+      decode(bucket_ratelimit, bl);
+      decode(user_ratelimit, bl);
+      decode(anon_ratelimit, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+
+  // the period config must be stored in a local object outside of the period,
+  // so that it can be used in a default configuration where no realm/period
+  // exists
+  int read(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, optional_yield y);
+  int write(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, optional_yield y);
+
+  static std::string get_oid(const std::string& realm_id);
+  static rgw_pool get_pool(CephContext *cct);
+};
+WRITE_CLASS_ENCODER(RGWPeriodConfig)
+
+class RGWRealm;
+class RGWPeriod;
+
+class RGWRealm : public RGWSystemMetaObj
+{
+public:
+  std::string current_period;
+  epoch_t epoch{0}; ///< realm epoch, incremented for each new period
+
+  int create_control(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+  int delete_control(const DoutPrefixProvider *dpp, optional_yield y);
+public:
+  RGWRealm() {}
+  RGWRealm(const std::string& _id, const std::string& _name = "") : RGWSystemMetaObj(_id, _name) {}
+  RGWRealm(CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_cct, _sysobj_svc) {}
+  RGWRealm(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_name, _cct, _sysobj_svc){}
+  virtual ~RGWRealm() override;
+
+  void encode(bufferlist& bl) const override {
+    ENCODE_START(1, 1, bl);
+    RGWSystemMetaObj::encode(bl);
+    encode(current_period, bl);
+    encode(epoch, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) override {
+    DECODE_START(1, bl);
+    RGWSystemMetaObj::decode(bl);
+    decode(current_period, bl);
+    decode(epoch, bl);
+    DECODE_FINISH(bl);
+  }
+
+  int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true) override;
+  int delete_obj(const DoutPrefixProvider *dpp, optional_yield y);
+  rgw_pool get_pool(CephContext *cct) const override;
+  const std::string get_default_oid(bool old_format = false) const override;
+  const std::string& get_names_oid_prefix() const override;
+  const std::string& get_info_oid_prefix(bool old_format = false) const override;
+  std::string get_predefined_id(CephContext *cct) const override;
+  const std::string& get_predefined_name(CephContext *cct) const override;
+
+  using RGWSystemMetaObj::read_id; // expose as public for radosgw-admin
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<RGWRealm*>& o);
+
+  const std::string& get_current_period() const {
+    return current_period;
+  }
+  int set_current_period(const DoutPrefixProvider *dpp, RGWPeriod& period, optional_yield y);
+  void clear_current_period_and_epoch() {
+    current_period.clear();
+    epoch = 0;
+  }
+  epoch_t get_epoch() const { return epoch; }
+
+  std::string get_control_oid() const;
+  /// send a notify on the realm control object
+  int notify_zone(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y);
+  /// notify the zone of a new period
+  int notify_new_period(const DoutPrefixProvider *dpp, const RGWPeriod& period, optional_yield y);
+
+  int find_zone(const DoutPrefixProvider *dpp,
+                const rgw_zone_id& zid,
+                RGWPeriod *pperiod,
+                RGWZoneGroup *pzonegroup,
+                bool *pfound,
+                optional_yield y) const;
+};
+WRITE_CLASS_ENCODER(RGWRealm)
+
+struct RGWPeriodLatestEpochInfo {
+  epoch_t epoch = 0;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(epoch, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(epoch, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<RGWPeriodLatestEpochInfo*>& o);
+};
+WRITE_CLASS_ENCODER(RGWPeriodLatestEpochInfo)
+
+
+/*
+ * The RGWPeriod object contains the entire configuration of a
+ * RGWRealm, including its RGWZoneGroups and RGWZones. Consistency of
+ * this configuration is maintained across all zones by passing around
+ * the RGWPeriod object in its JSON representation.
+ *
+ * If a new configuration changes which zone is the metadata master
+ * zone (i.e., master zone of the master zonegroup), then a new
+ * RGWPeriod::id (a uuid) is generated, its RGWPeriod::realm_epoch is
+ * incremented, and the RGWRealm object is updated to reflect that new
+ * current_period id and epoch. If the configuration changes BUT which
+ * zone is the metadata master does NOT change, then only the
+ * RGWPeriod::epoch is incremented (and the RGWPeriod::id remains the
+ * same).
+ *
+ * When a new RGWPeriod is created with a new RGWPeriod::id (uuid), it
+ * is linked back to its predecessor RGWPeriod through the
+ * RGWPeriod::predecessor_uuid field, thus creating a "linked
+ * list"-like structure of RGWPeriods back to the cluster's creation.
+ */
+class RGWPeriod
+{
+public:
+  std::string id; ///< a uuid
+  epoch_t epoch{0};
+  std::string predecessor_uuid;
+  std::vector<std::string> sync_status;
+  RGWPeriodMap period_map;
+  RGWPeriodConfig period_config;
+  std::string master_zonegroup;
+  rgw_zone_id master_zone;
+
+  std::string realm_id;
+  std::string realm_name;
+  epoch_t realm_epoch{1}; ///< realm epoch when period was made current
+
+  CephContext *cct{nullptr};
+  RGWSI_SysObj *sysobj_svc{nullptr};
+
+  int read_info(const DoutPrefixProvider *dpp, optional_yield y);
+  int read_latest_epoch(const DoutPrefixProvider *dpp,
+                        RGWPeriodLatestEpochInfo& epoch_info,
+                       optional_yield y,
+                        RGWObjVersionTracker *objv = nullptr);
+  int use_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y);
+  int use_current_period();
+
+  const std::string get_period_oid() const;
+  const std::string get_period_oid_prefix() const;
+
+  // gather the metadata sync status for each shard; only for use on master zone
+  int update_sync_status(const DoutPrefixProvider *dpp, 
+                         rgw::sal::Driver* driver,
+                         const RGWPeriod &current_period,
+                         std::ostream& error_stream, bool force_if_stale);
+
+public:
+  RGWPeriod() {}
+
+  explicit RGWPeriod(const std::string& period_id, epoch_t _epoch = 0)
+    : id(period_id), epoch(_epoch) {}
+
+  const std::string& get_id() const { return id; }
+  epoch_t get_epoch() const { return epoch; }
+  epoch_t get_realm_epoch() const { return realm_epoch; }
+  const std::string& get_predecessor() const { return predecessor_uuid; }
+  const rgw_zone_id& get_master_zone() const { return master_zone; }
+  const std::string& get_master_zonegroup() const { return master_zonegroup; }
+  const std::string& get_realm() const { return realm_id; }
+  const std::string& get_realm_name() const { return realm_name; }
+  const RGWPeriodMap& get_map() const { return period_map; }
+  RGWPeriodConfig& get_config() { return period_config; }
+  const RGWPeriodConfig& get_config() const { return period_config; }
+  const std::vector<std::string>& get_sync_status() const { return sync_status; }
+  rgw_pool get_pool(CephContext *cct) const;
+  const std::string& get_latest_epoch_oid() const;
+  const std::string& get_info_oid_prefix() const;
+
+  void set_user_quota(RGWQuotaInfo& user_quota) {
+    period_config.quota.user_quota = user_quota;
+  }
+
+  void set_bucket_quota(RGWQuotaInfo& bucket_quota) {
+    period_config.quota.bucket_quota = bucket_quota;
+  }
+
+  void set_id(const std::string& _id) {
+    this->id = _id;
+    period_map.id = _id;
+  }
+  void set_epoch(epoch_t epoch) { this->epoch = epoch; }
+  void set_realm_epoch(epoch_t epoch) { realm_epoch = epoch; }
+
+  void set_predecessor(const std::string& predecessor)
+  {
+    predecessor_uuid = predecessor;
+  }
+
+  void set_realm_id(const std::string& _realm_id) {
+    realm_id = _realm_id;
+  }
+
+  int reflect(const DoutPrefixProvider *dpp, optional_yield y);
+
+  int get_zonegroup(RGWZoneGroup& zonegroup,
+                   const std::string& zonegroup_id) const;
+
+  bool is_single_zonegroup() const
+  {
+      return (period_map.zonegroups.size() <= 1);
+  }
+
+  /*
+    returns true if there are several zone groups with at least one zone
+   */
+  bool is_multi_zonegroups_with_zones() const
+  {
+    int count = 0;
+    for (const auto& zg:  period_map.zonegroups) {
+      if (zg.second.zones.size() > 0) {
+       if (count++ > 0) {
+         return true;
+       }
+      }
+    }
+    return false;
+  }
+
+  bool find_zone(const DoutPrefixProvider *dpp,
+                const rgw_zone_id& zid,
+                RGWZoneGroup *pzonegroup,
+                optional_yield y) const;
+
+  int get_latest_epoch(const DoutPrefixProvider *dpp, epoch_t& epoch, optional_yield y);
+  int set_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y,
+                      epoch_t epoch, bool exclusive = false,
+                       RGWObjVersionTracker *objv = nullptr);
+  // update latest_epoch if the given epoch is higher, else return -EEXIST
+  int update_latest_epoch(const DoutPrefixProvider *dpp, epoch_t epoch, optional_yield y);
+
+  int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, const std::string &period_realm_id, optional_yield y,
+          const std::string &period_realm_name = "", bool setup_obj = true);
+  int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, optional_yield y, bool setup_obj = true);  
+
+  int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true);
+  int delete_obj(const DoutPrefixProvider *dpp, optional_yield y);
+  int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+  int add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y);
+
+  void fork();
+  int update(const DoutPrefixProvider *dpp, optional_yield y);
+
+  // commit a staging period; only for use on master zone
+  int commit(const DoutPrefixProvider *dpp,
+            rgw::sal::Driver* driver,
+             RGWRealm& realm, const RGWPeriod &current_period,
+             std::ostream& error_stream, optional_yield y,
+            bool force_if_stale = false);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(id, bl);
+    encode(epoch, bl);
+    encode(realm_epoch, bl);
+    encode(predecessor_uuid, bl);
+    encode(sync_status, bl);
+    encode(period_map, bl);
+    encode(master_zone, bl);
+    encode(master_zonegroup, bl);
+    encode(period_config, bl);
+    encode(realm_id, bl);
+    encode(realm_name, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(id, bl);
+    decode(epoch, bl);
+    decode(realm_epoch, bl);
+    decode(predecessor_uuid, bl);
+    decode(sync_status, bl);
+    decode(period_map, bl);
+    decode(master_zone, bl);
+    decode(master_zonegroup, bl);
+    decode(period_config, bl);
+    decode(realm_id, bl);
+    decode(realm_name, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<RGWPeriod*>& o);
+
+  static std::string get_staging_id(const std::string& realm_id) {
+    return realm_id + ":staging";
+  }
+};
+WRITE_CLASS_ENCODER(RGWPeriod)
+
+namespace rgw {
+
+/// Look up a realm by its id. If no id is given, look it up by name.
+/// If no name is given, fall back to the cluster's default realm.
+int read_realm(const DoutPrefixProvider* dpp, optional_yield y,
+               sal::ConfigStore* cfgstore,
+               std::string_view realm_id,
+               std::string_view realm_name,
+               RGWRealm& info,
+               std::unique_ptr<sal::RealmWriter>* writer = nullptr);
+
+/// Create a realm and its initial period. If the info.id is empty, a
+/// random uuid will be generated.
+int create_realm(const DoutPrefixProvider* dpp, optional_yield y,
+                 sal::ConfigStore* cfgstore, bool exclusive,
+                 RGWRealm& info,
+                 std::unique_ptr<sal::RealmWriter>* writer = nullptr);
+
+/// Set the given realm as the cluster's default realm.
+int set_default_realm(const DoutPrefixProvider* dpp, optional_yield y,
+                      sal::ConfigStore* cfgstore, const RGWRealm& info,
+                      bool exclusive = false);
+
+/// Update the current_period of an existing realm.
+int realm_set_current_period(const DoutPrefixProvider* dpp, optional_yield y,
+                             sal::ConfigStore* cfgstore,
+                             sal::RealmWriter& writer, RGWRealm& realm,
+                             const RGWPeriod& period);
+
+/// Overwrite the local zonegroup and period config objects with the new
+/// configuration contained in the given period.
+int reflect_period(const DoutPrefixProvider* dpp, optional_yield y,
+                   sal::ConfigStore* cfgstore, const RGWPeriod& info);
+
+/// Return the staging period id for the given realm.
+std::string get_staging_period_id(std::string_view realm_id);
+
+/// Convert the given period into a separate staging period, where
+/// radosgw-admin can make changes to it without affecting the running
+/// configuration.
+void fork_period(const DoutPrefixProvider* dpp, RGWPeriod& info);
+
+/// Read all zonegroups in the period's realm and add them to the period.
+int update_period(const DoutPrefixProvider* dpp, optional_yield y,
+                  sal::ConfigStore* cfgstore, RGWPeriod& info);
+
+/// Validates the given 'staging' period and tries to commit it as the
+/// realm's new current period.
+int commit_period(const DoutPrefixProvider* dpp, optional_yield y,
+                  sal::ConfigStore* cfgstore, sal::Driver* driver,
+                  RGWRealm& realm, sal::RealmWriter& realm_writer,
+                  const RGWPeriod& current_period,
+                  RGWPeriod& info, std::ostream& error_stream,
+                  bool force_if_stale);
+
+
+/// Look up a zonegroup by its id. If no id is given, look it up by name.
+/// If no name is given, fall back to the cluster's default zonegroup.
+int read_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+                   sal::ConfigStore* cfgstore,
+                   std::string_view zonegroup_id,
+                   std::string_view zonegroup_name,
+                   RGWZoneGroup& info,
+                   std::unique_ptr<sal::ZoneGroupWriter>* writer = nullptr);
+
+/// Initialize and create the given zonegroup. If the given info.id is empty,
+/// a random uuid will be generated. May fail with -EEXIST.
+int create_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+                     sal::ConfigStore* cfgstore, bool exclusive,
+                     RGWZoneGroup& info);
+
+/// Set the given zonegroup as its realm's default zonegroup.
+int set_default_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+                          sal::ConfigStore* cfgstore, const RGWZoneGroup& info,
+                          bool exclusive = false);
+
+/// Add a zone to the zonegroup, or update an existing zone entry.
+int add_zone_to_group(const DoutPrefixProvider* dpp,
+                      RGWZoneGroup& zonegroup,
+                      const RGWZoneParams& zone_params,
+                      const bool *pis_master, const bool *pread_only,
+                      const std::list<std::string>& endpoints,
+                      const std::string *ptier_type,
+                      const bool *psync_from_all,
+                      const std::list<std::string>& sync_from,
+                      const std::list<std::string>& sync_from_rm,
+                      const std::string *predirect_zone,
+                      std::optional<int> bucket_index_max_shards,
+                      const rgw::zone_features::set& enable_features,
+                      const rgw::zone_features::set& disable_features);
+
+/// Remove a zone by id from its zonegroup, promoting a new master zone if
+/// necessary.
+int remove_zone_from_group(const DoutPrefixProvider* dpp,
+                           RGWZoneGroup& info,
+                           const rgw_zone_id& zone_id);
+
+
+/// Look up a zone by its id. If no id is given, look it up by name. If no name
+/// is given, fall back to the realm's default zone.
+int read_zone(const DoutPrefixProvider* dpp, optional_yield y,
+              sal::ConfigStore* cfgstore,
+              std::string_view zone_id,
+              std::string_view zone_name,
+              RGWZoneParams& info,
+              std::unique_ptr<sal::ZoneWriter>* writer = nullptr);
+
+/// Initialize and create a new zone. If the given info.id is empty, a random
+/// uuid will be generated. Pool names are initialized with the zone name as a
+/// prefix. If any pool names conflict with existing zones, a random suffix is
+/// added.
+int create_zone(const DoutPrefixProvider* dpp, optional_yield y,
+                sal::ConfigStore* cfgstore, bool exclusive,
+                RGWZoneParams& info,
+                std::unique_ptr<sal::ZoneWriter>* writer = nullptr);
+
+/// Initialize the zone's pool names using the zone name as a prefix. If a pool
+/// name conflicts with an existing zone's pool, add a unique suffix.
+int init_zone_pool_names(const DoutPrefixProvider *dpp, optional_yield y,
+                         const std::set<rgw_pool>& pools, RGWZoneParams& info);
+
+/// Set the given zone as its realm's default zone.
+int set_default_zone(const DoutPrefixProvider* dpp, optional_yield y,
+                      sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+                      bool exclusive = false);
+
+/// Delete an existing zone and remove it from any zonegroups that contain it.
+int delete_zone(const DoutPrefixProvider* dpp, optional_yield y,
+                sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+                sal::ZoneWriter& writer);
+
+} // namespace rgw
+
+#endif
index e3234751f4ab4bf7b8fce1b0ea591c288af8d8c9..4f2820c6cb60a9c4ba2d9348411d3cdeaa34b9bd 100644 (file)
@@ -3021,7 +3021,7 @@ int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
   string req_id;
   if (!s) {
     // fake req_id
-    req_id = store->svc.zone_utils->unique_id(store->store->get_new_req_id());
+    req_id = store->svc.zone_utils->unique_id(store->driver->get_new_req_id());
   } else {
     req_id = s->req_id;
   }
index 0eee88fd674a54c439d824088e814c65b9240f87..8a793f753146763090cc54dd3942040462721fa7 100644 (file)
 
 #include "rgw_sal.h"
 #include "rgw_sal_rados.h"
-#include "store/rados/config/store.h"
-#include "store/json_config/store.h"
+#include "driver/rados/config/store.h"
+#include "driver/json_config/store.h"
 #include "rgw_d3n_datacache.h"
 
 #ifdef WITH_RADOSGW_DBSTORE
 #include "rgw_sal_dbstore.h"
-#include "store/dbstore/config/store.h"
+#include "driver/dbstore/config/store.h"
 #endif
 
 #ifdef WITH_RADOSGW_MOTR
index c46fb6f842e09ed30fd8ecc907b30f77536e73cd..a7f496191c73fb216393f61223e2c3096193f6c2 100644 (file)
@@ -21,8 +21,8 @@
 #include "rgw_lc.h"
 #include "rgw_multi.h"
 
-#include "store/dbstore/common/dbstore.h"
-#include "store/dbstore/dbstore_mgr.h"
+#include "driver/dbstore/common/dbstore.h"
+#include "driver/dbstore/dbstore_mgr.h"
 
 namespace rgw { namespace sal {
 
diff --git a/src/rgw/store/daos/README.md b/src/rgw/store/daos/README.md
deleted file mode 100644 (file)
index de6d215..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-# DAOS
-
-Standalone RADOS Gateway (RGW) on [DAOS](http://daos.io/) (Experimental)
-
-## CMake Option
-
-Add below cmake option
-
-```bash
-    -DWITH_RADOSGW_DAOS=ON
-```
-
-## Build
-
-```bash
-    cd build
-    ninja [vstart]
-```
-
-## Running Test cluster
-
-Edit ceph.conf to add below option
-
-```conf
-    [client]
-        rgw backend store = daos
-```
-
-Restart vstart cluster or just RGW server
-
-```bash
-    [..] RGW=1 ../src/vstart.sh -d
-```
-
-The above configuration brings up an RGW server on DAOS.
-
-## Creating a test user
-
- To create a `testid` user to be used for s3 operations, use the following command:
-
- ```bash
-local akey='0555b35654ad1656d804'
-local skey='h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q=='
-    radosgw-admin user create --uid testid \
-        --access-key $akey --secret $skey \
-        --display-name 'M. Tester' --email tester@ceph.com --no-mon-config
- ```
diff --git a/src/rgw/store/dbstore/CMakeLists.txt b/src/rgw/store/dbstore/CMakeLists.txt
deleted file mode 100644 (file)
index 0d34d32..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-#need to update cmake version here
-cmake_minimum_required(VERSION 3.14.0)
-project(dbstore)
-
-option(USE_SQLITE "Enable SQLITE DB" ON)
-
-set (CMAKE_INCLUDE_DIR ${CMAKE_INCLUDE_DIR} "${CMAKE_CURRENT_SOURCE_DIR}/common")
-
-set(dbstore_srcs
-    common/dbstore_log.h
-    common/dbstore.h
-    common/dbstore.cc
-    config/store.cc)
-IF(USE_SQLITE)
-  list(APPEND dbstore_srcs
-      config/sqlite.cc
-      sqlite/connection.cc
-      sqlite/error.cc
-      sqlite/statement.cc)
-endif()
-
-set(dbstore_mgr_srcs
-    dbstore_mgr.h
-    dbstore_mgr.cc
-    )
-
-add_library(dbstore_lib ${dbstore_srcs})
-target_include_directories(dbstore_lib
-    PUBLIC "${CMAKE_SOURCE_DIR}/src/fmt/include"
-    PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw"
-    PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/store/rados"
-    PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
-set(link_targets spawn)
-if(WITH_JAEGER)
-  list(APPEND link_targets jaeger_base)
-endif()
-list(APPEND link_targets rgw_common)
-target_link_libraries(dbstore_lib PUBLIC ${link_targets})
-
-set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} dbstore_lib)
-
-IF(USE_SQLITE)
-  add_subdirectory(sqlite)
-  set(CMAKE_INCLUDE_DIR ${CMAKE_INCLUDE_DIR} "${CMAKE_CURRENT_SOURCE_DIR}/sqlite")
-  add_compile_definitions(SQLITE_ENABLED=1)
-  set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} rgw_common)
-  set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} sqlite_db)
-  add_dependencies(sqlite_db dbstore_lib)
-ENDIF()
-
-# add pthread library
-set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} pthread)
-
-find_package(gtest QUIET)
-if(WITH_TESTS)
-    add_subdirectory(tests)
-else()
-       message(WARNING "Gtest not enabled")
-endif()
-
-include_directories(${CMAKE_INCLUDE_DIR})
-add_library(dbstore STATIC ${dbstore_mgr_srcs})
-target_link_libraries(dbstore ${CMAKE_LINK_LIBRARIES})
-
-# testing purpose
-set(dbstore_main_srcs
-    dbstore_main.cc)
-
-set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} dbstore)
-add_executable(dbstore-bin ${dbstore_main_srcs})
-add_dependencies(dbstore-bin dbstore)
-target_link_libraries(dbstore-bin ${CMAKE_LINK_LIBRARIES})
diff --git a/src/rgw/store/dbstore/README.md b/src/rgw/store/dbstore/README.md
deleted file mode 100644 (file)
index 659bc20..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-# DBStore
-Standalone Rados Gateway (RGW) on DBStore (Experimental)
-
-
-## CMake Option
-Add below cmake option (enabled by default)
-
-    -DWITH_RADOSGW_DBSTORE=ON 
-
-
-## Build
-
-    cd build
-    ninja [vstart]
-
-
-## Running Test cluster
-Edit ceph.conf to add below option
-
-    [client]
-        rgw backend store = dbstore
-
-Start vstart cluster
-
-    [..] RGW=1 ../src/vstart.sh -o rgw_backend_store=dbstore -n -d
-
-The above vstart command brings up RGW server on dbstore and creates few default users (eg., testid) to be used for s3 operations.
-
-`radosgw-admin` can be used to create and remove other users.
-
-
-By default, dbstore creates .db file *'/var/lib/ceph/radosgw/dbstore-default_ns.db'* to store the data. This can be configured using below options in ceph.conf
-
-    [client]
-        dbstore db dir = <path for the directory for storing the db backend store data>
-        dbstore db name prefix = <prefix to the file names created by db backend store>
-
-
-## DBStore Unit Tests
-To execute DBStore unit test cases (using Gtest framework), from build directory
-
-    ninja unittest_dbstore_tests
-    ./bin/unittest_dbstore_tests [logfile] [loglevel]
-    (default logfile: rgw_dbstore_tests.log, loglevel: 20)
-    ninja unittest_dbstore_mgr_tests
-    ./bin/unittest_dbstore_mgr_tests
-
-To execute Sample test file
-
-    ninja src/rgw/store/dbstore/install
-    ./bin/dbstore-bin [logfile] [loglevel]
-    (default logfile: rgw_dbstore_bin.log, loglevel: 20)
-
diff --git a/src/rgw/store/dbstore/common/connection_pool.h b/src/rgw/store/dbstore/common/connection_pool.h
deleted file mode 100644 (file)
index 07f3c81..0000000
+++ /dev/null
@@ -1,147 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <concepts>
-#include <condition_variable>
-#include <memory>
-#include <mutex>
-#include <boost/circular_buffer.hpp>
-#include "common/dout.h"
-
-namespace rgw::dbstore {
-
-template <typename Connection>
-class ConnectionHandle;
-
-/// A thread-safe base class that manages a fixed-size pool of generic database
-/// connections and supports the reclamation of ConnectionHandles. This class
-/// is the subset of ConnectionPool which doesn't depend on the Factory type.
-template <typename Connection>
-class ConnectionPoolBase {
- public:
-  ConnectionPoolBase(std::size_t max_connections)
-      : connections(max_connections)
-  {}
- private:
-  friend class ConnectionHandle<Connection>;
-
-  // TODO: the caller may detect a connection error that prevents the connection
-  // from being reused. allow them to indicate these errors here
-  void put(std::unique_ptr<Connection> connection)
-  {
-    auto lock = std::scoped_lock{mutex};
-    connections.push_back(std::move(connection));
-
-    if (connections.size() == 1) { // was empty
-      cond.notify_one();
-    }
-  }
- protected:
-  std::mutex mutex;
-  std::condition_variable cond;
-  boost::circular_buffer<std::unique_ptr<Connection>> connections;
-};
-
-/// Handle to a database connection borrowed from the pool. Automatically
-/// returns the connection to its pool on the handle's destruction.
-template <typename Connection>
-class ConnectionHandle {
-  ConnectionPoolBase<Connection>* pool = nullptr;
-  std::unique_ptr<Connection> conn;
- public:
-  ConnectionHandle() noexcept = default;
-  ConnectionHandle(ConnectionPoolBase<Connection>* pool,
-                   std::unique_ptr<Connection> conn) noexcept
-    : pool(pool), conn(std::move(conn)) {}
-
-  ~ConnectionHandle() {
-    if (conn) {
-      pool->put(std::move(conn));
-    }
-  }
-
-  ConnectionHandle(ConnectionHandle&&) = default;
-  ConnectionHandle& operator=(ConnectionHandle&& o) noexcept {
-    if (conn) {
-      pool->put(std::move(conn));
-    }
-    conn = std::move(o.conn);
-    pool = o.pool;
-    return *this;
-  }
-
-  explicit operator bool() const noexcept { return static_cast<bool>(conn); }
-  Connection& operator*() const noexcept { return *conn; }
-  Connection* operator->() const noexcept { return conn.get(); }
-  Connection* get() const noexcept { return conn.get(); }
-};
-
-
-// factory_of concept requires the function signature:
-//   F(const DoutPrefixProvider*) -> std::unique_ptr<T>
-template <typename F, typename T>
-concept factory_of = requires (F factory, const DoutPrefixProvider* dpp) {
-  { factory(dpp) } -> std::same_as<std::unique_ptr<T>>;
-  requires std::move_constructible<F>;
-};
-
-
-/// Generic database connection pool that enforces a limit on open connections.
-template <typename Connection, factory_of<Connection> Factory>
-class ConnectionPool : public ConnectionPoolBase<Connection> {
- public:
-  ConnectionPool(Factory factory, std::size_t max_connections)
-      : ConnectionPoolBase<Connection>(max_connections),
-        factory(std::move(factory))
-  {}
-
-  /// Borrow a connection from the pool. If all existing connections are in use,
-  /// use the connection factory to create another one. If we've reached the
-  /// limit on open connections, wait on a condition variable for the next one
-  /// returned to the pool.
-  auto get(const DoutPrefixProvider* dpp)
-      -> ConnectionHandle<Connection>
-  {
-    auto lock = std::unique_lock{this->mutex};
-    std::unique_ptr<Connection> conn;
-
-    if (!this->connections.empty()) {
-      // take an existing connection
-      conn = std::move(this->connections.front());
-      this->connections.pop_front();
-    } else if (total < this->connections.capacity()) {
-      // add another connection to the pool
-      conn = factory(dpp);
-      ++total;
-    } else {
-      // wait for the next put()
-      // TODO: support optional_yield
-      ldpp_dout(dpp, 4) << "ConnectionPool waiting on a connection" << dendl;
-      this->cond.wait(lock, [&] { return !this->connections.empty(); });
-      ldpp_dout(dpp, 4) << "ConnectionPool done waiting" << dendl;
-      conn = std::move(this->connections.front());
-      this->connections.pop_front();
-    }
-
-    return {this, std::move(conn)};
-  }
- private:
-  Factory factory;
-  std::size_t total = 0;
-};
-
-} // namespace rgw::dbstore
diff --git a/src/rgw/store/dbstore/common/dbstore.cc b/src/rgw/store/dbstore/common/dbstore.cc
deleted file mode 100644 (file)
index 3936368..0000000
+++ /dev/null
@@ -1,2245 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "dbstore.h"
-
-using namespace std;
-
-namespace rgw { namespace store {
-
-map<string, class ObjectOp*> DB::objectmap = {};
-
-map<string, class ObjectOp*> DB::getObjectMap() {
-  return DB::objectmap;
-}
-
-int DB::Initialize(string logfile, int loglevel)
-{
-  int ret = -1;
-  const DoutPrefixProvider *dpp = get_def_dpp();
-
-  if (!cct) {
-    cout << "Failed to Initialize. No ceph Context \n";
-    return -1;
-  }
-
-  if (loglevel > 0) {
-    cct->_conf->subsys.set_log_level(ceph_subsys_rgw, loglevel);
-  }
-  if (!logfile.empty()) {
-    cct->_log->set_log_file(logfile);
-    cct->_log->reopen_log_file();
-  }
-
-
-  db = openDB(dpp);
-
-  if (!db) {
-    ldpp_dout(dpp, 0) <<"Failed to open database " << dendl;
-    return ret;
-  }
-
-  ret = InitializeDBOps(dpp);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) <<"InitializeDBOps failed " << dendl;
-    closeDB(dpp);
-    db = NULL;
-    return ret;
-  }
-
-  ldpp_dout(dpp, 0) << "DB successfully initialized - name:" \
-    << db_name << "" << dendl;
-
-  return ret;
-}
-
-int DB::createGC(const DoutPrefixProvider *dpp) {
-  int ret = 0;
-  /* create gc thread */
-
-  gc_worker = std::make_unique<DB::GC>(dpp, this);
-  gc_worker->create("db_gc");
-
-  return ret;
-}
-
-int DB::stopGC() {
-  if (gc_worker) {
-    gc_worker->signal_stop();
-    gc_worker->join();
-  }
-  return 0;
-}
-
-int DB::Destroy(const DoutPrefixProvider *dpp)
-{
-  if (!db)
-    return 0;
-
-  stopGC();
-
-  closeDB(dpp);
-
-
-  ldpp_dout(dpp, 20)<<"DB successfully destroyed - name:" \
-    <<db_name << dendl;
-
-  return 0;
-}
-
-
-std::shared_ptr<class DBOp> DB::getDBOp(const DoutPrefixProvider *dpp, std::string_view Op,
-                  const DBOpParams *params)
-{
-  if (!Op.compare("InsertUser"))
-    return dbops.InsertUser;
-  if (!Op.compare("RemoveUser"))
-    return dbops.RemoveUser;
-  if (!Op.compare("GetUser"))
-    return dbops.GetUser;
-  if (!Op.compare("InsertBucket"))
-    return dbops.InsertBucket;
-  if (!Op.compare("UpdateBucket"))
-    return dbops.UpdateBucket;
-  if (!Op.compare("RemoveBucket"))
-    return dbops.RemoveBucket;
-  if (!Op.compare("GetBucket"))
-    return dbops.GetBucket;
-  if (!Op.compare("ListUserBuckets"))
-    return dbops.ListUserBuckets;
-  if (!Op.compare("InsertLCEntry"))
-    return dbops.InsertLCEntry;
-  if (!Op.compare("RemoveLCEntry"))
-    return dbops.RemoveLCEntry;
-  if (!Op.compare("GetLCEntry"))
-    return dbops.GetLCEntry;
-  if (!Op.compare("ListLCEntries"))
-    return dbops.ListLCEntries;
-  if (!Op.compare("InsertLCHead"))
-    return dbops.InsertLCHead;
-  if (!Op.compare("RemoveLCHead"))
-    return dbops.RemoveLCHead;
-  if (!Op.compare("GetLCHead"))
-    return dbops.GetLCHead;
-
-  /* Object Operations */
-  map<string, class ObjectOp*>::iterator iter;
-  class ObjectOp* Ob;
-
-  {
-    const std::lock_guard<std::mutex> lk(mtx);
-    iter = DB::objectmap.find(params->op.bucket.info.bucket.name);
-  }
-
-  if (iter == DB::objectmap.end()) {
-    ldpp_dout(dpp, 30)<<"No objectmap found for bucket: " \
-      <<params->op.bucket.info.bucket.name << dendl;
-    /* not found */
-    return nullptr;
-  }
-
-  Ob = iter->second;
-
-  if (!Op.compare("PutObject"))
-    return Ob->PutObject;
-  if (!Op.compare("DeleteObject"))
-    return Ob->DeleteObject;
-  if (!Op.compare("GetObject"))
-    return Ob->GetObject;
-  if (!Op.compare("UpdateObject"))
-    return Ob->UpdateObject;
-  if (!Op.compare("ListBucketObjects"))
-    return Ob->ListBucketObjects;
-  if (!Op.compare("ListVersionedObjects"))
-    return Ob->ListVersionedObjects;
-  if (!Op.compare("PutObjectData"))
-    return Ob->PutObjectData;
-  if (!Op.compare("UpdateObjectData"))
-    return Ob->UpdateObjectData;
-  if (!Op.compare("GetObjectData"))
-    return Ob->GetObjectData;
-  if (!Op.compare("DeleteObjectData"))
-    return Ob->DeleteObjectData;
-  if (!Op.compare("DeleteStaleObjectData"))
-    return Ob->DeleteStaleObjectData;
-
-  return nullptr;
-}
-
-int DB::objectmapInsert(const DoutPrefixProvider *dpp, string bucket, class ObjectOp* ptr)
-{
-  map<string, class ObjectOp*>::iterator iter;
-  class ObjectOp *Ob;
-
-  const std::lock_guard<std::mutex> lk(mtx);
-  iter = DB::objectmap.find(bucket);
-
-  if (iter != DB::objectmap.end()) {
-    // entry already exists
-    // return success or replace it or
-    // return error ?
-    //
-    // return success for now & delete the newly allocated ptr
-    ldpp_dout(dpp, 30)<<"Objectmap entry already exists for bucket("\
-      <<bucket<<"). Not inserted " << dendl;
-    delete ptr;
-    return 0;
-  }
-
-  Ob = (class ObjectOp*) ptr;
-  Ob->InitializeObjectOps(getDBname(), dpp);
-
-  DB::objectmap.insert(pair<string, class ObjectOp*>(bucket, Ob));
-
-  return 0;
-}
-
-int DB::objectmapDelete(const DoutPrefixProvider *dpp, string bucket)
-{
-  map<string, class ObjectOp*>::iterator iter;
-
-  const std::lock_guard<std::mutex> lk(mtx);
-  iter = DB::objectmap.find(bucket);
-
-  if (iter == DB::objectmap.end()) {
-    // entry doesn't exist
-    // return success or return error ?
-    // return success for now
-    ldpp_dout(dpp, 20)<<"Objectmap entry for bucket("<<bucket<<") "
-      <<"doesnt exist to delete " << dendl;
-    return 0;
-  }
-
-  DB::objectmap.erase(iter);
-
-  return 0;
-}
-
-int DB::InitializeParams(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-
-  if (!params)
-    goto out;
-
-  params->cct = cct;
-
-  //reset params here
-  params->user_table = user_table;
-  params->bucket_table = bucket_table;
-  params->quota_table = quota_table;
-  params->lc_entry_table = lc_entry_table;
-  params->lc_head_table = lc_head_table;
-
-  ret = 0;
-out:
-  return ret;
-}
-
-int DB::ProcessOp(const DoutPrefixProvider *dpp, std::string_view Op, DBOpParams *params) {
-  int ret = -1;
-  shared_ptr<class DBOp> db_op;
-
-  db_op = getDBOp(dpp, Op, params);
-
-  if (!db_op) {
-    ldpp_dout(dpp, 0)<<"No db_op found for Op("<<Op<<")" << dendl;
-    return ret;
-  }
-  ret = db_op->Execute(dpp, params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In Process op Execute failed for fop(" << Op << ")" << dendl;
-  } else {
-    ldpp_dout(dpp, 20)<<"Successfully processed fop(" << Op << ")" << dendl;
-  }
-
-  return ret;
-}
-
-int DB::get_user(const DoutPrefixProvider *dpp,
-    const std::string& query_str, const std::string& query_str_val,
-    RGWUserInfo& uinfo, map<string, bufferlist> *pattrs,
-    RGWObjVersionTracker *pobjv_tracker) {
-  int ret = 0;
-
-  if (query_str.empty() || query_str_val.empty()) {
-    ldpp_dout(dpp, 0)<<"In GetUser - Invalid query(" << query_str <<"), query_str_val(" << query_str_val <<")" << dendl;
-    return -1;
-  }
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-
-  params.op.query_str = query_str;
-
-  // validate query_str with UserTable entries names
-  if (query_str == "username") {
-    params.op.user.uinfo.display_name = query_str_val;
-  } else if (query_str == "email") {
-    params.op.user.uinfo.user_email = query_str_val;
-  } else if (query_str == "access_key") {
-    RGWAccessKey k(query_str_val, "");
-    map<string, RGWAccessKey> keys;
-    keys[query_str_val] = k;
-    params.op.user.uinfo.access_keys = keys;
-  } else if (query_str == "user_id") {
-    params.op.user.uinfo.user_id = uinfo.user_id;
-  } else {
-    ldpp_dout(dpp, 0)<<"In GetUser Invalid query string :" <<query_str.c_str()<<") " << dendl;
-    return -1;
-  }
-
-  ret = ProcessOp(dpp, "GetUser", &params);
-
-  if (ret)
-    goto out;
-
-  /* Verify if its a valid user */
-  if (params.op.user.uinfo.access_keys.empty() ||
-        params.op.user.uinfo.user_id.id.empty()) {
-    ldpp_dout(dpp, 0)<<"In GetUser - No user with query(" <<query_str.c_str()<<"), user_id(" << uinfo.user_id <<") found" << dendl;
-    return -ENOENT;
-  }
-
-  uinfo = params.op.user.uinfo;
-
-  if (pattrs) {
-    *pattrs = params.op.user.user_attrs;
-  }
-
-  if (pobjv_tracker) {
-    pobjv_tracker->read_version = params.op.user.user_version;
-  }
-
-out:
-  return ret;
-}
-
-int DB::store_user(const DoutPrefixProvider *dpp,
-    RGWUserInfo& uinfo, bool exclusive, map<string, bufferlist> *pattrs,
-    RGWObjVersionTracker *pobjv, RGWUserInfo* pold_info)
-{
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-  int ret = 0;
-
-  /* Check if the user already exists and return the old info, caller will have a use for it */
-  RGWUserInfo orig_info;
-  RGWObjVersionTracker objv_tracker = {};
-  obj_version& obj_ver = objv_tracker.read_version;
-
-  orig_info.user_id = uinfo.user_id;
-  ret = get_user(dpp, string("user_id"), uinfo.user_id.id, orig_info, nullptr, &objv_tracker);
-
-  if (!ret && obj_ver.ver) {
-    /* already exists. */
-
-    if (pold_info) {
-      *pold_info = orig_info;
-    }
-
-    if (pobjv && (pobjv->read_version.ver != obj_ver.ver)) {
-      /* Object version mismatch.. return ECANCELED */
-      ret = -ECANCELED;
-      ldpp_dout(dpp, 0)<<"User Read version mismatch err:(" <<ret<<") " << dendl;
-      return ret;
-    }
-
-    if (exclusive) {
-      // return
-      return ret;
-    }
-    obj_ver.ver++;
-  } else {
-    obj_ver.ver = 1;
-    obj_ver.tag = "UserTAG";
-  }
-
-  params.op.user.user_version = obj_ver;
-  params.op.user.uinfo = uinfo;
-
-  if (pattrs) {
-    params.op.user.user_attrs = *pattrs;
-  }
-
-  ret = ProcessOp(dpp, "InsertUser", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"store_user failed with err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-  ldpp_dout(dpp, 20)<<"User creation successful - userid:(" <<uinfo.user_id<<") " << dendl;
-
-  if (pobjv) {
-    pobjv->read_version = obj_ver;
-    pobjv->write_version = obj_ver;
-  }
-
-out:
-  return ret;
-}
-
-int DB::remove_user(const DoutPrefixProvider *dpp,
-    RGWUserInfo& uinfo, RGWObjVersionTracker *pobjv)
-{
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-  int ret = 0;
-
-  RGWUserInfo orig_info;
-  RGWObjVersionTracker objv_tracker = {};
-
-  orig_info.user_id = uinfo.user_id;
-  ret = get_user(dpp, string("user_id"), uinfo.user_id.id, orig_info, nullptr, &objv_tracker);
-
-  if (ret) {
-    return ret;
-  }
-
-  if (!ret && objv_tracker.read_version.ver) {
-    /* already exists. */
-
-    if (pobjv && (pobjv->read_version.ver != objv_tracker.read_version.ver)) {
-      /* Object version mismatch.. return ECANCELED */
-      ret = -ECANCELED;
-      ldpp_dout(dpp, 0)<<"User Read version mismatch err:(" <<ret<<") " << dendl;
-      return ret;
-    }
-  }
-
-  params.op.user.uinfo.user_id = uinfo.user_id;
-
-  ret = ProcessOp(dpp, "RemoveUser", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"remove_user failed with err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int DB::get_bucket_info(const DoutPrefixProvider *dpp, const std::string& query_str,
-    const std::string& query_str_val,
-    RGWBucketInfo& info,
-    rgw::sal::Attrs* pattrs, ceph::real_time* pmtime,
-    obj_version* pbucket_version) {
-  int ret = 0;
-
-  if (query_str.empty()) {
-    // not checking for query_str_val as the query can be to fetch
-    // entries with null values
-    return -1;
-  }
-
-  DBOpParams params = {};
-  DBOpParams params2 = {};
-  InitializeParams(dpp, &params);
-
-  if (query_str == "name") {
-    params.op.bucket.info.bucket.name = info.bucket.name;
-  } else {
-    ldpp_dout(dpp, 0)<<"In GetBucket Invalid query string :" <<query_str.c_str()<<") " << dendl;
-    return -1;
-  }
-
-  ret = ProcessOp(dpp, "GetBucket", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In GetBucket failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-  if (!ret && params.op.bucket.info.bucket.marker.empty()) {
-    return -ENOENT;
-  }
-  info = params.op.bucket.info;
-
-  if (pattrs) {
-    *pattrs = params.op.bucket.bucket_attrs;
-  }
-
-  if (pmtime) {
-    *pmtime = params.op.bucket.mtime;
-  }
-  if (pbucket_version) {
-    *pbucket_version = params.op.bucket.bucket_version;
-  }
-
-out:
-  return ret;
-}
-
-int DB::create_bucket(const DoutPrefixProvider *dpp,
-    const RGWUserInfo& owner, rgw_bucket& bucket,
-    const string& zonegroup_id,
-    const rgw_placement_rule& placement_rule,
-    const string& swift_ver_location,
-    const RGWQuotaInfo * pquota_info,
-    map<std::string, bufferlist>& attrs,
-    RGWBucketInfo& info,
-    obj_version *pobjv,
-    obj_version *pep_objv,
-    real_time creation_time,
-    rgw_bucket *pmaster_bucket,
-    uint32_t *pmaster_num_shards,
-    optional_yield y,
-    bool exclusive)
-{
-  /*
-   * XXX: Simple creation for now.
-   *
-   * Referring to RGWRados::create_bucket(), 
-   * Check if bucket already exists, select_bucket_placement,
-   * is explicit put/remove instance info needed? - should not be ideally
-   */
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-  int ret = 0;
-
-  /* Check if the bucket already exists and return the old info, caller will have a use for it */
-  RGWBucketInfo orig_info;
-  orig_info.bucket.name = bucket.name;
-  ret = get_bucket_info(dpp, string("name"), "", orig_info, nullptr, nullptr, nullptr);
-
-  if (!ret && !orig_info.owner.id.empty() && exclusive) {
-    /* already exists. Return the old info */
-
-    info = std::move(orig_info);
-    return ret;
-  }
-
-  RGWObjVersionTracker& objv_tracker = info.objv_tracker;
-
-  objv_tracker.read_version.clear();
-
-  if (pobjv) {
-    objv_tracker.write_version = *pobjv;
-  } else {
-    objv_tracker.generate_new_write_ver(cct);
-  }
-  params.op.bucket.bucket_version = objv_tracker.write_version;
-  objv_tracker.read_version = params.op.bucket.bucket_version;
-
-  uint64_t bid = next_bucket_id();
-  string s = getDBname() + "." + std::to_string(bid);
-  bucket.marker = bucket.bucket_id = s;
-
-  info.bucket = bucket;
-  info.owner = owner.user_id;
-  info.zonegroup = zonegroup_id;
-  info.placement_rule = placement_rule;
-  info.swift_ver_location = swift_ver_location;
-  info.swift_versioning = (!swift_ver_location.empty());
-
-  info.requester_pays = false;
-  if (real_clock::is_zero(creation_time)) {
-    info.creation_time = ceph::real_clock::now();
-  } else {
-    info.creation_time = creation_time;
-  }
-  if (pquota_info) {
-    info.quota = *pquota_info;
-  }
-
-  params.op.bucket.info = info;
-  params.op.bucket.bucket_attrs = attrs;
-  params.op.bucket.mtime = ceph::real_time();
-  params.op.user.uinfo.user_id.id = owner.user_id.id;
-
-  ret = ProcessOp(dpp, "InsertBucket", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"create_bucket failed with err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int DB::remove_bucket(const DoutPrefixProvider *dpp, const RGWBucketInfo info) {
-  int ret = 0;
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-
-  params.op.bucket.info.bucket.name = info.bucket.name;
-
-  ret = ProcessOp(dpp, "RemoveBucket", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In RemoveBucket failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int DB::list_buckets(const DoutPrefixProvider *dpp, const std::string& query_str,
-    rgw_user& user,
-    const string& marker,
-    const string& end_marker,
-    uint64_t max,
-    bool need_stats,
-    RGWUserBuckets *buckets,
-    bool *is_truncated)
-{
-  int ret = 0;
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-
-  params.op.user.uinfo.user_id = user;
-  params.op.bucket.min_marker = marker;
-  params.op.bucket.max_marker = end_marker;
-  params.op.list_max_count = max;
-  params.op.query_str = query_str;
-
-  ret = ProcessOp(dpp, "ListUserBuckets", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In ListUserBuckets failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-  /* need_stats: stats are already part of entries... In case they are maintained in
-   * separate table , maybe use "Inner Join" with stats table for the query.
-   */
-  if (params.op.bucket.list_entries.size() == max)
-    *is_truncated = true;
-
-  for (auto& entry : params.op.bucket.list_entries) {
-    if (!end_marker.empty() &&
-        end_marker.compare(entry.bucket.marker) <= 0) {
-      *is_truncated = false;
-      break;
-    }
-    buckets->add(std::move(entry));
-  }
-
-  if (query_str == "all") {
-    // userID/OwnerID may have changed. Update it.
-    user.id = params.op.bucket.info.owner.id;
-  }
-
-out:
-  return ret;
-}
-
-int DB::update_bucket(const DoutPrefixProvider *dpp, const std::string& query_str,
-    RGWBucketInfo& info,
-    bool exclusive,
-    const rgw_user* powner_id,
-    map<std::string, bufferlist>* pattrs,
-    ceph::real_time* pmtime,
-    RGWObjVersionTracker* pobjv)
-{
-  int ret = 0;
-  DBOpParams params = {};
-  obj_version bucket_version;
-  RGWBucketInfo orig_info;
-
-  /* Check if the bucket already exists and return the old info, caller will have a use for it */
-  orig_info.bucket.name = info.bucket.name;
-  params.op.bucket.info.bucket.name = info.bucket.name;
-  ret = get_bucket_info(dpp, string("name"), "", orig_info, nullptr, nullptr,
-      &bucket_version);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"Failed to read bucket info err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-  if (!orig_info.owner.id.empty() && exclusive) {
-    /* already exists. Return the old info */
-
-    info = std::move(orig_info);
-    return ret;
-  }
-
-  /* Verify if the objv read_ver matches current bucket version */
-  if (pobjv) {
-    if (pobjv->read_version.ver != bucket_version.ver) {
-      ldpp_dout(dpp, 0)<<"Read version mismatch err:(" <<ret<<") " << dendl;
-      ret = -ECANCELED;
-      goto out;
-    }
-  } else {
-    pobjv = &info.objv_tracker;
-  }
-
-  InitializeParams(dpp, &params);
-
-  params.op.bucket.info.bucket.name = info.bucket.name;
-
-  if (powner_id) {
-    params.op.user.uinfo.user_id.id = powner_id->id;
-  } else {
-    params.op.user.uinfo.user_id.id = orig_info.owner.id;
-  }
-
-  /* Update version & mtime */
-  params.op.bucket.bucket_version.ver = ++(bucket_version.ver);
-
-  if (pmtime) {
-    params.op.bucket.mtime = *pmtime;;
-  } else {
-    params.op.bucket.mtime = ceph::real_time();
-  }
-
-  if (query_str == "attrs") {
-    params.op.query_str = "attrs";
-    params.op.bucket.bucket_attrs = *pattrs;
-  } else if (query_str == "owner") {
-    /* Update only owner i.e, chown. 
-     * Update creation_time too */
-    params.op.query_str = "owner";
-    params.op.bucket.info.creation_time = params.op.bucket.mtime;
-  } else if (query_str == "info") {
-    params.op.query_str = "info";
-    params.op.bucket.info = info;
-  } else {
-    ret = -1;
-    ldpp_dout(dpp, 0)<<"In UpdateBucket Invalid query_str : " << query_str << dendl;
-    goto out;
-  }
-
-  ret = ProcessOp(dpp, "UpdateBucket", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In UpdateBucket failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-  if (pobjv) {
-    pobjv->read_version = params.op.bucket.bucket_version;
-    pobjv->write_version = params.op.bucket.bucket_version;
-  }
-
-out:
-  return ret;
-}
-
-/**
- * Get ordered listing of the objects in a bucket.
- *
- * max_p: maximum number of results to return
- * bucket: bucket to list contents of
- * prefix: only return results that match this prefix
- * delim: do not include results that match this string.
- *     Any skipped results will have the matching portion of their name
- *     inserted in common_prefixes with a "true" mark.
- * marker: if filled in, begin the listing with this object.
- * end_marker: if filled in, end the listing with this object.
- * result: the objects are put in here.
- * common_prefixes: if delim is filled in, any matching prefixes are
- * placed here.
- * is_truncated: if number of objects in the bucket is bigger than
- * max, then truncated.
- */
-int DB::Bucket::List::list_objects(const DoutPrefixProvider *dpp, int64_t max,
-                          vector<rgw_bucket_dir_entry> *result,
-                          map<string, bool> *common_prefixes, bool *is_truncated)
-{
-  int ret = 0;
-  DB *store = target->get_store();
-  int64_t count = 0;
-  std::string prev_obj;
-
-  DBOpParams db_params = {};
-  store->InitializeParams(dpp, &db_params);
-
-  db_params.op.bucket.info = target->get_bucket_info(); 
-  /* XXX: Handle whole marker? key -> name, instance, ns? */
-  db_params.op.obj.min_marker = params.marker.name;
-  db_params.op.obj.max_marker = params.end_marker.name;
-  db_params.op.obj.prefix = params.prefix + "%";
-  db_params.op.list_max_count = max + 1; /* +1 for next_marker */
-
-  ret = store->ProcessOp(dpp, "ListBucketObjects", &db_params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In ListBucketObjects failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-  for (auto& entry : db_params.op.obj.list_entries) {
-
-    if (!params.list_versions) {
-      if (entry.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER) {
-        prev_obj = entry.key.name;
-        // skip all non-current entries and delete_marker
-        continue;
-      }
-      if (entry.key.name == prev_obj) {
-        // non current versions..skip the entry
-        continue;
-      }
-      entry.flags |= rgw_bucket_dir_entry::FLAG_CURRENT;
-    } else {
-      if (entry.key.name != prev_obj) {
-        // current version
-        entry.flags |= rgw_bucket_dir_entry::FLAG_CURRENT;
-      } else {
-        entry.flags &= ~(rgw_bucket_dir_entry::FLAG_CURRENT);
-        entry.flags |= rgw_bucket_dir_entry::FLAG_VER;
-      }
-    }
-
-    prev_obj = entry.key.name;
-
-    if (count >= max) {
-      *is_truncated = true;
-      next_marker.name = entry.key.name;
-      next_marker.instance = entry.key.instance;
-      break;
-    }
-
-    if (!params.delim.empty()) {
-    const std::string& objname = entry.key.name;
-       const int delim_pos = objname.find(params.delim, params.prefix.size());
-         if (delim_pos >= 0) {
-           /* extract key -with trailing delimiter- for CommonPrefix */
-           const std::string& prefix_key =
-             objname.substr(0, delim_pos + params.delim.length());
-
-           if (common_prefixes &&
-               common_prefixes->find(prefix_key) == common_prefixes->end()) {
-          next_marker = prefix_key;
-          (*common_prefixes)[prefix_key] = true;
-          count++;
-        }
-        continue;
-      }
-    }
-
-    if (!params.end_marker.name.empty() &&
-        params.end_marker.name.compare(entry.key.name) <= 0) {
-      // should not include end_marker
-      *is_truncated = false;
-      break;
-    }
-    count++;
-    result->push_back(std::move(entry));
-  }
-out:
-  return ret;
-}
-
-int DB::raw_obj::InitializeParamsfromRawObj(const DoutPrefixProvider *dpp,
-                                            DBOpParams* params) {
-  int ret = 0;
-
-  if (!params)
-    return -1;
-
-  params->op.bucket.info.bucket.name = bucket_name;
-  params->op.obj.state.obj.key.name = obj_name;
-  params->op.obj.state.obj.key.instance = obj_instance;
-  params->op.obj.state.obj.key.ns = obj_ns;
-  params->op.obj.obj_id = obj_id;
-
-  if (multipart_part_str != "0.0") {
-    params->op.obj.is_multipart = true;
-  } else {
-    params->op.obj.is_multipart = false;
-  }
-
-  params->op.obj_data.multipart_part_str = multipart_part_str;
-  params->op.obj_data.part_num = part_num;
-
-  return ret;
-}
-
-int DB::Object::InitializeParamsfromObject(const DoutPrefixProvider *dpp,
-                                           DBOpParams* params) {
-  int ret = 0;
-  string bucket = bucket_info.bucket.name;
-
-  if (!params)
-    return -1;
-
-  params->op.bucket.info.bucket.name = bucket;
-  params->op.obj.state.obj = obj;
-  params->op.obj.obj_id = obj_id;
-
-  return ret;
-}
-
-int DB::Object::get_object_impl(const DoutPrefixProvider *dpp, DBOpParams& params) {
-  int ret = 0;
-
-  if (params.op.obj.state.obj.key.name.empty()) {
-    /* Initialize */
-    store->InitializeParams(dpp, &params);
-    InitializeParamsfromObject(dpp, &params);
-  }
-
-  ret = store->ProcessOp(dpp, "GetObject", &params);
-
-  /* pick one field check if object exists */
-  if (!ret && !params.op.obj.state.exists) {
-    ldpp_dout(dpp, 0)<<"Object(bucket:" << bucket_info.bucket.name << ", Object:"<< obj.key.name << ") doesn't exist" << dendl;
-    ret = -ENOENT;
-  }
-
-  return ret;
-}
-
-int DB::Object::obj_omap_set_val_by_key(const DoutPrefixProvider *dpp,
-                                        const std::string& key, bufferlist& val,
-                                        bool must_exist) {
-  int ret = 0;
-
-  DBOpParams params = {};
-
-  ret = get_object_impl(dpp, params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
-    goto out;
-  }
-
-  params.op.obj.omap[key] = val;
-  params.op.query_str = "omap";
-  params.op.obj.state.mtime = real_clock::now();
-
-  ret = store->ProcessOp(dpp, "UpdateObject", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int DB::Object::obj_omap_get_vals_by_keys(const DoutPrefixProvider *dpp,
-                                          const std::string& oid,
-                                          const std::set<std::string>& keys,
-                                          std::map<std::string, bufferlist>* vals)
-{
-  int ret = 0;
-  DBOpParams params = {};
-  std::map<std::string, bufferlist> omap;
-
-  if (!vals)
-    return -1;
-
-  ret = get_object_impl(dpp, params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
-    goto out;
-  }
-
-  omap = params.op.obj.omap;
-
-  for (const auto& k :  keys) {
-    (*vals)[k] = omap[k];
-  }
-
-out:
-  return ret;
-}
-
-int DB::Object::add_mp_part(const DoutPrefixProvider *dpp,
-                            RGWUploadPartInfo info) {
-  int ret = 0;
-
-  DBOpParams params = {};
-
-  ret = get_object_impl(dpp, params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
-    goto out;
-  }
-
-  params.op.obj.mp_parts.push_back(info);
-  params.op.query_str = "mp";
-  params.op.obj.state.mtime = real_clock::now();
-
-  ret = store->ProcessOp(dpp, "UpdateObject", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int DB::Object::get_mp_parts_list(const DoutPrefixProvider *dpp,
-                                  std::list<RGWUploadPartInfo>& info)
-{
-  int ret = 0;
-  DBOpParams params = {};
-  std::map<std::string, bufferlist> omap;
-
-  ret = get_object_impl(dpp, params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
-    goto out;
-  }
-
-  info = params.op.obj.mp_parts;
-
-out:
-  return ret;
-}
-
-/* Taken from rgw_rados.cc */
-void DB::gen_rand_obj_instance_name(rgw_obj_key *target_key)
-{
-#define OBJ_INSTANCE_LEN 32
-  char buf[OBJ_INSTANCE_LEN + 1];
-
-  gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
-                                                                      no underscore for instance name due to the way we encode the raw keys */
-
-  target_key->set_instance(buf);
-}
-
-int DB::Object::obj_omap_get_all(const DoutPrefixProvider *dpp,
-                                 std::map<std::string, bufferlist> *m)
-{
-  int ret = 0;
-  DBOpParams params = {};
-  std::map<std::string, bufferlist> omap;
-
-  if (!m)
-    return -1;
-
-  ret = get_object_impl(dpp, params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
-    goto out;
-  }
-
-  (*m) = params.op.obj.omap;
-
-out:
-  return ret;
-}
-
-int DB::Object::obj_omap_get_vals(const DoutPrefixProvider *dpp,
-                                  const std::string& marker,
-                                  uint64_t max_count,
-                                  std::map<std::string, bufferlist> *m, bool* pmore)
-{
-  int ret = 0;
-  DBOpParams params = {};
-  std::map<std::string, bufferlist> omap;
-  map<string, bufferlist>::iterator iter;
-  uint64_t count = 0;
-
-  if (!m)
-    return -1;
-
-  ret = get_object_impl(dpp, params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
-    goto out;
-  }
-
-  omap = params.op.obj.omap;
-
-  for (iter = omap.begin(); iter != omap.end(); ++iter) {
-
-    if (iter->first < marker)
-      continue;
-
-    if ((++count) > max_count) {
-      *pmore = true;
-      break;
-    }
-
-    (*m)[iter->first] = iter->second;
-  }
-
-out:
-  return ret;
-}
-
-int DB::Object::set_attrs(const DoutPrefixProvider *dpp,
-                          map<string, bufferlist>& setattrs,
-                          map<string, bufferlist>* rmattrs)
-{
-  int ret = 0;
-
-  DBOpParams params = {};
-  rgw::sal::Attrs *attrs;
-  map<string, bufferlist>::iterator iter;
-
-  ret = get_object_impl(dpp, params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
-    goto out;
-  }
-
-  /* For now lets keep it simple..rmattrs & setattrs ..
-   * XXX: Check rgw_rados::set_attrs
-   */
-  attrs = &params.op.obj.state.attrset;
-  if (rmattrs) {
-    for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
-      (*attrs).erase(iter->first);
-    }
-  }
-  for (iter = setattrs.begin(); iter != setattrs.end(); ++iter) {
-    (*attrs)[iter->first] = iter->second;
-  }
-
-  params.op.query_str = "attrs";
-  params.op.obj.state.mtime = real_clock::now();
-
-  ret = store->ProcessOp(dpp, "UpdateObject", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int DB::Object::transition(const DoutPrefixProvider *dpp,
-                           const rgw_placement_rule& rule,
-                           const real_time& mtime,
-                           uint64_t olh_epoch)
-{
-  int ret = 0;
-
-  DBOpParams params = {};
-  map<string, bufferlist> *attrset;
-
-  store->InitializeParams(dpp, &params);
-  InitializeParamsfromObject(dpp, &params);
-
-  ret = store->ProcessOp(dpp, "GetObject", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) <<"In GetObject failed err:(" <<ret<<")" << dendl;
-    goto out;
-  }
-
-  /* pick one field check if object exists */
-  if (!params.op.obj.state.exists) {
-    ldpp_dout(dpp, 0)<<"Object(bucket:" << bucket_info.bucket.name << ", Object:"<< obj.key.name << ") doesn't exist" << dendl;
-    return -1;
-  }
-
-  params.op.query_str = "meta";
-  params.op.obj.state.mtime = real_clock::now();
-  params.op.obj.storage_class = rule.storage_class;
-  attrset = &params.op.obj.state.attrset;
-  if (!rule.storage_class.empty()) {
-    bufferlist bl;
-    bl.append(rule.storage_class);
-    (*attrset)[RGW_ATTR_STORAGE_CLASS] = bl;
-  }
-  params.op.obj.versioned_epoch = olh_epoch; // XXX: not sure if needed
-
-  /* Unlike Rados, in dbstore for now, both head and tail objects
-   * refer to same storage class
-   */
-  params.op.obj.head_placement_rule = rule;
-  params.op.obj.tail_placement.placement_rule = rule;
-
-  ret = store->ProcessOp(dpp, "UpdateObject", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int DB::raw_obj::read(const DoutPrefixProvider *dpp, int64_t ofs,
-                      uint64_t len, bufferlist& bl)
-{
-  int ret = 0;
-  DBOpParams params = {};
-
-  db->InitializeParams(dpp, &params);
-  InitializeParamsfromRawObj(dpp, &params);
-
-  ret = db->ProcessOp(dpp, "GetObjectData", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In GetObjectData failed err:(" <<ret<<")" << dendl;
-    return ret;
-  }
-
-  /* Verify if its valid obj */
-  if (!params.op.obj_data.size) {
-    ret = -ENOENT;
-    ldpp_dout(dpp, 0)<<"In GetObjectData failed err:(" <<ret<<")" << dendl;
-    return ret;
-  }
-
-  bufferlist& read_bl = params.op.obj_data.data;
-
-  unsigned copy_len;
-  copy_len = std::min((uint64_t)read_bl.length() - ofs, len);
-  read_bl.begin(ofs).copy(copy_len, bl);
-  return bl.length();
-}
-
-int DB::raw_obj::write(const DoutPrefixProvider *dpp, int64_t ofs, int64_t write_ofs,
-                       uint64_t len, bufferlist& bl)
-{
-  int ret = 0;
-  DBOpParams params = {};
-
-  db->InitializeParams(dpp, &params);
-  InitializeParamsfromRawObj(dpp, &params);
-
-  /* XXX: Check for chunk_size ?? */
-  params.op.obj_data.offset = ofs;
-  unsigned write_len = std::min((uint64_t)bl.length() - write_ofs, len);
-  bl.begin(write_ofs).copy(write_len, params.op.obj_data.data);
-  params.op.obj_data.size = params.op.obj_data.data.length();
-  params.op.obj.state.mtime = real_clock::now();
-
-  ret = db->ProcessOp(dpp, "PutObjectData", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In PutObjectData failed err:(" <<ret<<")" << dendl;
-    return ret;
-  }
-
-  return write_len;
-}
-
-int DB::Object::list_versioned_objects(const DoutPrefixProvider *dpp,
-                                       std::list<rgw_bucket_dir_entry>& list_entries) {
-  int ret = 0;
-  store = get_store();
-  DBOpParams db_params = {};
-
-  store->InitializeParams(dpp, &db_params);
-  InitializeParamsfromObject(dpp, &db_params);
-
-  db_params.op.list_max_count = MAX_VERSIONED_OBJECTS;
-
-  ret = store->ProcessOp(dpp, "ListVersionedObjects", &db_params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In ListVersionedObjects failed err:(" <<ret<<") " << dendl;
-  } else {
-    list_entries = db_params.op.obj.list_entries;
-  }
-
-  return ret;
-}
-
// Resolve the current RGWObjState for 'obj'.  If the key carries a
// version-id (instance), that exact version is fetched; otherwise the
// versions are listed and the most recent entry is read, failing with
// -ENOENT when it is a delete marker or no versions exist.  On success
// *state points at this Object's cached obj_state member, with the
// internal object-id stashed in shadow_obj.
int DB::Object::get_obj_state(const DoutPrefixProvider *dpp,
                              const RGWBucketInfo& bucket_info, const rgw_obj& obj,
                              bool follow_olh, RGWObjState** state)
{
  int ret = 0;

  DBOpParams params = {};
  RGWObjState* s;

  if (!obj.key.instance.empty()) {
    /* Versionid provided. Fetch the object */
    ret = get_object_impl(dpp, params);

    if (ret && ret != -ENOENT) {
      ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
      goto out;
    }
  } else {
    /* Instance is empty. May or may not be versioned object.
     * List all the versions and read the most recent entry */
    // NOTE(review): the return value of list_versioned_objects() is not
    // checked before inspecting list_entries; a listing failure falls
    // through as an empty list and is reported as -ENOENT below —
    // confirm that is intended.
    ret = list_versioned_objects(dpp, params.op.obj.list_entries);

    if (params.op.obj.list_entries.size() != 0) {
       /* Ensure its not a delete marker */
      auto& ent = params.op.obj.list_entries.front();
      if (ent.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER) {
        ret = -ENOENT;
        goto out;
      }
      // Re-initialize and fetch using the newest version's full key
      // (including its instance).
      store->InitializeParams(dpp, &params);
      InitializeParamsfromObject(dpp, &params);
      params.op.obj.state.obj.key = ent.key;

      ret = get_object_impl(dpp, params);

      if (ret) {
        ldpp_dout(dpp, 0) <<"get_object_impl of versioned object failed err:(" <<ret<<")" << dendl;
        goto out;
      }
    } else {
      ret = -ENOENT;
      return ret;
    }
  }

  s = &params.op.obj.state;
  /* XXX: For now use state->shadow_obj to store ObjectID string */
  s->shadow_obj = params.op.obj.obj_id;

  *state = &obj_state;
  **state = *s;

out:
  return ret;

}
-
// Thin wrapper: resolve this Object's state through get_obj_state()
// using the Object's own bucket_info and obj members.
int DB::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState** pstate, bool follow_olh)
{
  return get_obj_state(dpp, bucket_info, obj, follow_olh, pstate);
}
-
-int DB::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest)
-{
-  RGWObjState* state;
-  int r = source->get_state(dpp, &state, true);
-  if (r < 0)
-    return r;
-  if (!state->exists)
-    return -ENOENT;
-  if (!state->get_attr(name, dest))
-    return -ENODATA;
-
-  return 0;
-}
-
// Validate the read and populate the op's outputs: resolves the object
// state, records the resolved key/object-id for the later read(),
// fills params.target_obj/attrs/obj_size/lastmod when requested, and
// enforces the If-Match / If-NoMatch etag preconditions.  Returns 0 on
// success, -ENOENT if the object does not exist, and
// -ERR_PRECONDITION_FAILED / -ERR_NOT_MODIFIED on condition failure.
int DB::Object::Read::prepare(const DoutPrefixProvider *dpp)
{
  DB *store = source->get_store();
  CephContext *cct = store->ctx();

  bufferlist etag;

  map<string, bufferlist>::iterator iter;

  RGWObjState* astate;

  int r = source->get_state(dpp, &astate, true);
  if (r < 0)
    return r;

  if (!astate->exists) {
    return -ENOENT;
  }

  // remember the resolved key and internal object-id so read()/iterate()
  // address the same version this prepare() validated
  state.obj = astate->obj;
  source->obj_id = astate->shadow_obj;

  if (params.target_obj) {
    *params.target_obj = state.obj;
  }
  if (params.attrs) {
    *params.attrs = astate->attrset;
    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
      for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
        ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
      }
    }
  }

  if (conds.if_match || conds.if_nomatch) {
    r = get_attr(dpp, RGW_ATTR_ETAG, etag);
    if (r < 0)
      return r;

    if (conds.if_match) {
      string if_match_str = rgw_string_unquote(conds.if_match);
      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
      if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
        return -ERR_PRECONDITION_FAILED;
      }
    }

    if (conds.if_nomatch) {
      string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
      if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
        return -ERR_NOT_MODIFIED;
      }
    }
  }

  if (params.obj_size)
    *params.obj_size = astate->size;
  if (params.lastmod)
    *params.lastmod = astate->mtime;

  return 0;
}
-
-int DB::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
-{
-  if (ofs < 0) {
-    ofs += obj_size;
-    if (ofs < 0)
-      ofs = 0;
-    end = obj_size - 1;
-  } else if (end < 0) {
-    end = obj_size - 1;
-  }
-
-  if (obj_size > 0) {
-    if (ofs >= (off_t)obj_size) {
-      return -ERANGE;
-    }
-    if (end >= (off_t)obj_size) {
-      end = obj_size - 1;
-    }
-  }
-  return 0;
-}
-
// Read at most one chunk of object data starting at 'ofs' into bl.
// Requests that fall inside the head object's inline data are served
// from astate->data; anything beyond it is read from the tail raw_obj
// covering that offset.  Returns the number of bytes appended to bl,
// or a negative error.
int DB::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, const DoutPrefixProvider *dpp)
{
  DB *store = source->get_store();

  uint64_t read_ofs = ofs;
  uint64_t len, read_len;

  bufferlist read_bl;
  uint64_t max_chunk_size = store->get_max_chunk_size();

  RGWObjState* astate;
  int r = source->get_state(dpp, &astate, true);
  if (r < 0)
    return r;

  if (!astate->exists) {
    return -ENOENT;
  }

  // clamp 'end' to the object's actual size
  if (astate->size == 0) {
    end = 0;
  } else if (end >= (int64_t)astate->size) {
    end = astate->size - 1;
  }

  if (end < 0)
    len = 0;
  else
    len = end - ofs + 1;


  // a single call never returns more than one chunk
  if (len > max_chunk_size) {
    len = max_chunk_size;
  }

  int head_data_size = astate->data.length();
  bool reading_from_head = (ofs < head_data_size);

  if (reading_from_head) {
    if (astate) { // && astate->prefetch_data)?
      // whole request satisfiable from the head data already in memory
      if (!ofs && astate->data.length() >= len) {
        bl = astate->data;
        return bl.length();
      }

      if (ofs < astate->data.length()) {
        unsigned copy_len = std::min((uint64_t)head_data_size - ofs, len);
        astate->data.begin(ofs).copy(copy_len, bl);
        return bl.length();
      }
    }
  }

  /* tail object */
  // tail data is split into parts of max_chunk_size; pick the part
  // that contains 'ofs'
  int part_num = (ofs / max_chunk_size);
  /* XXX: Handle multipart_str */
  raw_obj read_obj(store, source->get_bucket_info().bucket.name, astate->obj.key.name, 
      astate->obj.key.instance, astate->obj.key.ns, source->obj_id, "0.0", part_num);

  read_len = len;

  ldpp_dout(dpp, 20) << "dbstore->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;

  // read from non head object
  r = read_obj.read(dpp, read_ofs, read_len, bl);

  if (r < 0) {
    return r;
  }

  return bl.length();
}
-
-static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
-    const DB::raw_obj& read_obj, off_t obj_ofs,
-    off_t len, bool is_head_obj,
-    RGWObjState* astate, void *arg)
-{
-  struct db_get_obj_data* d = static_cast<struct db_get_obj_data*>(arg);
-  return d->store->get_obj_iterate_cb(dpp, read_obj, obj_ofs, len,
-      is_head_obj, astate, arg);
-}
-
// Iterate-obj callback: obtain the data for one segment (head data
// straight from astate, tail data via raw_obj::read) and feed it to
// the client callback in chunks.  Returns the number of bytes handed
// to the client, 0, or a negative error.
int DB::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
    const raw_obj& read_obj, off_t obj_ofs,
    off_t len, bool is_head_obj,
    RGWObjState* astate, void *arg)
{
  struct db_get_obj_data* d = static_cast<struct db_get_obj_data*>(arg);
  bufferlist bl;
  int r = 0;

  if (is_head_obj) {
    bl = astate->data;
  } else {
    // read from non head object
    raw_obj robj = read_obj;
    /* read entire data. So pass offset as '0' & len as '-1' */
    r = robj.read(dpp, 0, -1, bl);

    if (r <= 0) {
      return r;
    }
  }

  unsigned read_ofs = 0, read_len = 0;
  // NOTE(review): 'len' is not decremented as chunks are delivered, so
  // each chunk is bounded by the remaining buffer and the *original*
  // len; in total up to bl.length() bytes may be handed to the client
  // even if that exceeds len — confirm callers size bl accordingly.
  while (read_ofs < bl.length()) {
    unsigned chunk_len = std::min((uint64_t)bl.length() - read_ofs, (uint64_t)len);
    r = d->client_cb->handle_data(bl, read_ofs, chunk_len);
    if (r < 0)
      return r;
    read_ofs += chunk_len;
    read_len += chunk_len;
    ldpp_dout(dpp, 20) << "dbstore->get_obj_iterate_cb  obj-ofs=" << obj_ofs << " len=" << len <<  " chunk_len = " << chunk_len << " read_len = " << read_len << dendl;
  }


  // advance the shared cursor so the driver knows how far we got
  d->offset += read_len;

  return read_len;
}
-
-int DB::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb)
-{
-  DB *store = source->get_store();
-  const uint64_t chunk_size = store->get_max_chunk_size();
-
-  db_get_obj_data data(store, cb, ofs);
-
-  int r = source->iterate_obj(dpp, source->get_bucket_info(), state.obj,
-      ofs, end, chunk_size, _get_obj_iterate_cb, &data);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
-    return r;
-  }
-
-  return 0;
-}
-
// Walk the object's data from 'ofs' to 'end' (clamped to the object
// size), invoking 'cb' once per segment of at most max_chunk_size
// bytes.  The callback's return value is the number of bytes it
// consumed; iteration stops early if it returns <= 0.
int DB::Object::iterate_obj(const DoutPrefixProvider *dpp,
    const RGWBucketInfo& bucket_info, const rgw_obj& obj,
    off_t ofs, off_t end, uint64_t max_chunk_size,
    iterate_obj_cb cb, void *arg)
{
  DB *store = get_store();
  uint64_t len;
  RGWObjState* astate;

  int r = get_state(dpp, &astate, true);
  if (r < 0) {
    return r;
  }

  if (!astate->exists) {
    return -ENOENT;
  }

  if (end < 0)
    len = 0;
  else
    len = end - ofs + 1;

  /* XXX: Will it really help to store all parts info in astate like manifest in Rados? */
  int part_num = 0;
  int head_data_size = astate->data.length();

  while (ofs <= end && (uint64_t)ofs < astate->size) {
    // tail data is partitioned into max_chunk_size pieces; the part
    // index is derived directly from the current offset
    part_num = (ofs / max_chunk_size);
    uint64_t read_len = std::min(len, max_chunk_size);

    /* XXX: Handle multipart_str */
    raw_obj read_obj(store, get_bucket_info().bucket.name, astate->obj.key.name, 
        astate->obj.key.instance, astate->obj.key.ns, obj_id, "0.0", part_num);
    bool reading_from_head = (ofs < head_data_size);

    r = cb(dpp, read_obj, ofs, read_len, reading_from_head, astate, arg);
    if (r <= 0) {
      return r;
    }
    /* r refers to chunk_len (no. of bytes) handled in cb */
    len -= r;
    ofs += r;
  }

  return 0;
}
-
-int DB::Object::Write::prepare(const DoutPrefixProvider* dpp)
-{
-  DB *store = target->get_store();
-
-  int ret = -1;
-
-  /* XXX: handle assume_noent */
-
-  obj_state.obj = target->obj;
-  if (target->obj_id.empty()) {
-    if (!target->obj.key.instance.empty() && (target->obj.key.instance != "null")) {
-      /* versioned object. Set obj_id same as versionID/instance */
-      target->obj_id = target->obj.key.instance;
-    } else {
-      // generate obj_id
-      char buf[33];
-      gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
-      target->obj_id = buf;
-    }
-  }
-
-  ret = 0;
-  return ret;
-}
-
/* writes tail objects */
// Write 'data' into the object's tail parts starting at logical
// offset 'ofs', splitting on max_chunk_size boundaries.  For plain
// (non-multipart) objects the offset must lie beyond the head-object
// region.  Returns 0 on success or a negative error.
int DB::Object::Write::write_data(const DoutPrefixProvider* dpp,
                               bufferlist& data, uint64_t ofs) {
  DB *store = target->get_store();
  /* tail objects */
  /* XXX: Split into parts each of max_chunk_size. But later make tail
   * object chunk size limit to sqlite blob limit */
  int part_num = 0;

  uint64_t max_chunk_size = store->get_max_chunk_size();

  /* tail_obj ofs should be greater than max_head_size */
  if (mp_part_str == "0.0")  { // ensure not multipart meta object
    if (ofs < store->get_max_head_size()) {
      return -1;
    }
  }

  uint64_t end = data.length();
  uint64_t write_ofs = 0;
  /* as we are writing max_chunk_size at a time in sal_dbstore DBAtomicWriter::process(),
   * maybe this while loop is not needed
   */
  while (write_ofs < end) {
    // the part index is derived from the logical offset; raw_obj::write
    // further clamps the length to what remains in 'data'
    part_num = (ofs / max_chunk_size);
    uint64_t len = std::min(end, max_chunk_size);

    /* XXX: Handle multipart_str */
    raw_obj write_obj(store, target->get_bucket_info().bucket.name, obj_state.obj.key.name, 
        obj_state.obj.key.instance, obj_state.obj.key.ns, target->obj_id, mp_part_str, part_num);


    ldpp_dout(dpp, 20) << "dbstore->write obj-ofs=" << ofs << " write_len=" << len << dendl;

    // write into non head object
    int r = write_obj.write(dpp, ofs, write_ofs, len, data); 
    if (r < 0) {
      return r;
    }
    /* r refers to chunk_len (no. of bytes) handled in raw_obj::write */
    len -= r;
    ofs += r;
    write_ofs += r;
  }

  return 0;
}
-
/* Write metadata & head object data */
// Assemble the object's attribute set and metadata from 'attrs',
// meta.* and the bucket's object-lock configuration, then persist the
// head row via "PutObject".  Extracts etag/content-type/acl from the
// incoming attrs, mirrors the manifest's storage class into the attr
// set, and marks versioned Main-category objects with FLAG_VER.
int DB::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
    uint64_t size, uint64_t accounted_size,
    map<string, bufferlist>& attrs,
    bool assume_noent, bool modify_tail)
{
  DB *store = target->get_store();

  RGWObjState* state = &obj_state;
  map<string, bufferlist> *attrset;
  DBOpParams params = {};
  int ret = 0;
  string etag;
  string content_type;
  bufferlist acl_bl;
  string storage_class;

  map<string, bufferlist>::iterator iter;

  store->InitializeParams(dpp, &params);
  target->InitializeParamsfromObject(dpp, &params);

  obj_state = params.op.obj.state;

  if (real_clock::is_zero(meta.set_mtime)) {
    meta.set_mtime = real_clock::now();
  }

  attrset = &state->attrset;
  // Apply the bucket's default object-lock retention unless the caller
  // supplied an explicit retention attribute.
  if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule()) {
    // && meta.flags == PUT_OBJ_CREATE) {
    auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
    if (iter == attrs.end()) {
      real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
      string mode = target->bucket_info.obj_lock.get_mode();
      RGWObjectRetention obj_retention(mode, lock_until_date);
      bufferlist bl;
      obj_retention.encode(bl);
      (*attrset)[RGW_ATTR_OBJECT_RETENTION] = bl;
    }
  }

  state->mtime = meta.set_mtime;

  if (meta.data) {
    /* if we want to overwrite the data, we also want to overwrite the
       xattrs, so just remove the object */
    params.op.obj.head_data = *meta.data;
  }

  if (meta.rmattrs) {
    for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
      const string& name = iter->first;
      (*attrset).erase(name.c_str());
    }
  }

  if (meta.manifest) {
    storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;

    /* remove existing manifest attr */
    iter = attrs.find(RGW_ATTR_MANIFEST);
    if (iter != attrs.end())
      attrs.erase(iter);

    bufferlist bl;
    encode(*meta.manifest, bl);
    (*attrset)[RGW_ATTR_MANIFEST] = bl;
  }

  // Merge the caller-provided attrs, pulling out the fields that are
  // also stored as first-class columns (etag, content type, acl).
  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
    const string& name = iter->first;
    bufferlist& bl = iter->second;

    if (!bl.length())
      continue;

    (*attrset)[name.c_str()] = bl;

    if (name.compare(RGW_ATTR_ETAG) == 0) {
      etag = rgw_bl_str(bl);
      params.op.obj.etag = etag;
    } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
      content_type = rgw_bl_str(bl);
    } else if (name.compare(RGW_ATTR_ACL) == 0) {
      acl_bl = bl;
    }
  }

  if (!storage_class.empty()) {
    bufferlist bl;
    bl.append(storage_class);
    (*attrset)[RGW_ATTR_STORAGE_CLASS] = bl;
  }

  params.op.obj.state = *state ;
  params.op.obj.state.exists = true;
  params.op.obj.state.size = size;
  params.op.obj.state.accounted_size = accounted_size;
  params.op.obj.owner = target->get_bucket_info().owner.id;
  params.op.obj.category = meta.category;

  if (meta.mtime) {
    *meta.mtime = meta.set_mtime;
  }

  params.op.query_str = "meta";
  params.op.obj.obj_id = target->obj_id;

  /* Check if versioned */
  bool is_versioned = !target->obj.key.instance.empty() && (target->obj.key.instance != "null");
  params.op.obj.is_versioned = is_versioned;

  if (is_versioned && (params.op.obj.category == RGWObjCategory::Main)) {
    /* versioned object */
    params.op.obj.flags |= rgw_bucket_dir_entry::FLAG_VER;
  }
  ret = store->ProcessOp(dpp, "PutObject", &params);
  if (ret) {
    ldpp_dout(dpp, 0)<<"In PutObject failed err:(" <<ret<<")" << dendl;
    goto out;
  }


out:
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: do_write_meta returned ret=" << ret << dendl;
  }

  // NOTE(review): canceled is set to true even when PutObject
  // succeeded — confirm whether this should only be set on the error
  // path (rgw_rados sets it on failure).
  meta.canceled = true;

  return ret;
}
-
-int DB::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
-    map<string, bufferlist>& attrs)
-{
-  bool assume_noent = false;
-  /* handle assume_noent */
-  int r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail);
-  return r;
-}
-
// Delete the target object honoring the bucket's versioning state:
// with an explicit version-id the exact version is removed; otherwise
// a versioned bucket gets a delete marker, a suspended bucket gets the
// entry removed plus a "null"-version delete marker, and an
// unversioned bucket simply loses the entry.  When the plain lookup
// misses, the version listing decides whether a marker is still due.
int DB::Object::Delete::delete_obj(const DoutPrefixProvider *dpp) {
  int ret = 0;
  DBOpParams del_params = {};
  bool versioning_enabled = ((params.versioning_status & BUCKET_VERSIONED) == BUCKET_VERSIONED); 
  bool versioning_suspended = ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == BUCKET_VERSIONS_SUSPENDED); 
  bool regular_obj = true;
  std::string versionid = target->obj.key.instance;

  ret = target->get_object_impl(dpp, del_params);

  if (ret < 0 && ret != -ENOENT) {
    ldpp_dout(dpp, 0)<<"GetObject during delete failed err:(" <<ret<<")" << dendl;
    return ret;
  }

  // delete markers (category None) must not spawn further markers
  regular_obj = (del_params.op.obj.category == RGWObjCategory::Main);
  if (!ret) {
    if (!versionid.empty()) {
      // version-id is provided
      ret = delete_obj_impl(dpp, del_params);
      return ret;
    } else { // version-id is empty..
      /*
       * case: bucket_versioned
       *    create_delete_marker;
       * case: bucket_suspended
       *    delete entry
       *    create delete marker with version-id null;
       * default:
       *   just delete the entry
       */
      if (versioning_suspended && regular_obj) {
        ret = delete_obj_impl(dpp, del_params);
        ret = create_dm(dpp, del_params);
      } else if (versioning_enabled && regular_obj) {
        ret = create_dm(dpp, del_params);
      } else {
        ret = delete_obj_impl(dpp, del_params);
      }
    }
  } else { // ret == -ENOENT
     /* case: VersionID given
      *     return -ENOENT
      * else: // may or may not be versioned object
      *     Listversionedobjects
      *     if (list_entries.empty()) {
      *         nothing to do..return ENOENT
      *     } else {
      *         read top entry
      *         if (top.flags | FLAG_DELETE_MARKER) {
      *            // nothing to do
      *            return -ENOENT;
      *          }
      *          if (bucket_versioned)  {
      *            // create delete marker with new version-id
      *          } else if (bucket_suspended) {
      *            // create delete marker with version-id null
      *          }
      *          bucket cannot be in unversioned state post having versions
      *     }
      */
     if (!versionid.empty()) {
       return -ENOENT;
     }
     ret = target->list_versioned_objects(dpp, del_params.op.obj.list_entries);
     if (ret) {
        ldpp_dout(dpp, 0)<<"ListVersionedObjects failed err:(" <<ret<<")" << dendl;
        return ret;
     }
    if (del_params.op.obj.list_entries.empty()) {
      return -ENOENT;
    }
    auto &ent = del_params.op.obj.list_entries.front();
    if (ent.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER) {
      // for now do not create another delete marker..just exit
      return 0;
    }
    ret = create_dm(dpp, del_params);
  }
  return ret;
}
-
-int DB::Object::Delete::delete_obj_impl(const DoutPrefixProvider *dpp,
-                                        DBOpParams& del_params) {
-  int ret = 0;
-  DB *store = target->get_store();
-
-  ret = store->ProcessOp(dpp, "DeleteObject", &del_params);
-  if (ret) {
-    ldpp_dout(dpp, 0) << "In DeleteObject failed err:(" <<ret<<")" << dendl;
-    return ret;
-  }
-
-  /* Now that tail objects are associated with objectID, they are not deleted
-   * as part of this DeleteObj operation. Such tail objects (with no head object
-   * in *.object.table are cleaned up later by GC thread.
-   *
-   * To avoid races between writes/reads & GC delete, mtime is maintained for each
-   * tail object. This mtime is updated when tail object is written and also when
-   * its corresponding head object is deleted (like here in this case).
-   */
-  DBOpParams update_params = del_params;
-  update_params.op.obj.state.mtime = real_clock::now();
-  ret = store->ProcessOp(dpp, "UpdateObjectData", &update_params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) << "Updating tail objects mtime failed err:(" <<ret<<")" << dendl;
-  }
-  return ret;
-}
-
-/*
- * a) if no versionID specified,
- *  - create a delete marker with 
- *    - new version/instanceID (if bucket versioned)
- *    - null versionID (if versioning suspended)
- */
-int DB::Object::Delete::create_dm(const DoutPrefixProvider *dpp,
-                                             DBOpParams& del_params) {
-
-  DB *store = target->get_store();
-  bool versioning_suspended = ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == BUCKET_VERSIONS_SUSPENDED); 
-  int ret = -1;
-  DBOpParams olh_params = {};
-  std::string version_id;
-  DBOpParams next_params = del_params;
-
-  version_id = del_params.op.obj.state.obj.key.instance;
-
-  DBOpParams dm_params = del_params;
-
-  // create delete marker
-
-  store->InitializeParams(dpp, &dm_params);
-  target->InitializeParamsfromObject(dpp, &dm_params);
-  dm_params.op.obj.category = RGWObjCategory::None;
-
-  if (versioning_suspended) {
-    dm_params.op.obj.state.obj.key.instance = "null";
-  } else {
-    store->gen_rand_obj_instance_name(&dm_params.op.obj.state.obj.key);
-    dm_params.op.obj.obj_id = dm_params.op.obj.state.obj.key.instance;
-  }
-
-  dm_params.op.obj.flags |= (rgw_bucket_dir_entry::FLAG_DELETE_MARKER);
-
-  ret = store->ProcessOp(dpp, "PutObject", &dm_params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0) << "delete_olh: failed to create delete marker - err:(" <<ret<<")" << dendl;
-    return ret;
-  }
-  result.delete_marker = true;
-  result.version_id = dm_params.op.obj.state.obj.key.instance;
-  return ret;
-}
-
-int DB::get_entry(const std::string& oid, const std::string& marker,
-                             std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry)
-{
-  int ret = 0;
-  const DoutPrefixProvider *dpp = get_def_dpp();
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-
-  params.op.lc_entry.index = oid;
-  params.op.lc_entry.entry.set_bucket(marker);
-
-  params.op.query_str = "get_entry";
-  ret = ProcessOp(dpp, "GetLCEntry", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In GetLCEntry failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-  if (!params.op.lc_entry.entry.get_start_time() == 0) { //ensure entry found
-    rgw::sal::Lifecycle::LCEntry* e;
-    e = new rgw::sal::StoreLifecycle::StoreLCEntry(params.op.lc_entry.entry);
-    if (!e) {
-      ret = -ENOMEM;
-      goto out;
-    }
-    entry->reset(e);
-  }
-
-out:
-  return ret;
-}
-
-int DB::get_next_entry(const std::string& oid, const std::string& marker,
-                             std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry)
-{
-  int ret = 0;
-  const DoutPrefixProvider *dpp = get_def_dpp();
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-
-  params.op.lc_entry.index = oid;
-  params.op.lc_entry.entry.set_bucket(marker);
-
-  params.op.query_str = "get_next_entry";
-  ret = ProcessOp(dpp, "GetLCEntry", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In GetLCEntry failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-  if (!params.op.lc_entry.entry.get_start_time() == 0) { //ensure entry found
-    rgw::sal::Lifecycle::LCEntry* e;
-    e = new rgw::sal::StoreLifecycle::StoreLCEntry(params.op.lc_entry.entry);
-    if (!e) {
-      ret = -ENOMEM;
-      goto out;
-    }
-    entry->reset(e);
-  }
-
-out:
-  return ret;
-}
-
-int DB::set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry)
-{
-  int ret = 0;
-  const DoutPrefixProvider *dpp = get_def_dpp();
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-
-  params.op.lc_entry.index = oid;
-  params.op.lc_entry.entry = entry;
-
-  ret = ProcessOp(dpp, "InsertLCEntry", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In InsertLCEntry failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int DB::list_entries(const std::string& oid, const std::string& marker,
-                                uint32_t max_entries, std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& entries)
-{
-  int ret = 0;
-  const DoutPrefixProvider *dpp = get_def_dpp();
-
-  entries.clear();
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-
-  params.op.lc_entry.index = oid;
-  params.op.lc_entry.min_marker = marker;
-  params.op.list_max_count = max_entries;
-
-  ret = ProcessOp(dpp, "ListLCEntries", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In ListLCEntries failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-  for (auto& entry : params.op.lc_entry.list_entries) {
-    entries.push_back(std::make_unique<rgw::sal::StoreLifecycle::StoreLCEntry>(std::move(entry)));
-  }
-
-out:
-  return ret;
-}
-
-int DB::rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry)
-{
-  int ret = 0;
-  const DoutPrefixProvider *dpp = get_def_dpp();
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-
-  params.op.lc_entry.index = oid;
-  params.op.lc_entry.entry = entry;
-
-  ret = ProcessOp(dpp, "RemoveLCEntry", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In RemoveLCEntry failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int DB::get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LCHead>* head)
-{
-  int ret = 0;
-  const DoutPrefixProvider *dpp = get_def_dpp();
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-
-  params.op.lc_head.index = oid;
-
-  ret = ProcessOp(dpp, "GetLCHead", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In GetLCHead failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-  *head = std::make_unique<rgw::sal::StoreLifecycle::StoreLCHead>(params.op.lc_head.head);
-
-out:
-  return ret;
-}
-
-int DB::put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head)
-{
-  int ret = 0;
-  const DoutPrefixProvider *dpp = get_def_dpp();
-
-  DBOpParams params = {};
-  InitializeParams(dpp, &params);
-
-  params.op.lc_head.index = oid;
-  params.op.lc_head.head = head;
-
-  ret = ProcessOp(dpp, "InsertLCHead", &params);
-
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"In InsertLCHead failed err:(" <<ret<<") " << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int DB::delete_stale_objs(const DoutPrefixProvider *dpp, const std::string& bucket,
-                          uint32_t min_wait) {
-  DBOpParams params = {};
-  int ret = -1;
-
-  params.op.bucket.info.bucket.name = bucket;
-  /* Verify if bucket exists.
-   * XXX: This is needed for now to create objectmap of bucket
-   * in SQLGetBucket
-   */
-  InitializeParams(dpp, &params);
-  ret = ProcessOp(dpp, "GetBucket", &params);
-  if (ret) {
-    ldpp_dout(dpp, 0) << "In GetBucket failed err:(" <<ret<<")" << dendl;
-    return ret;
-  }
-
-  ldpp_dout(dpp, 20) << " Deleting stale_objs of bucket( " << bucket <<")" << dendl;
-  /* XXX: handle reads racing with delete here. Simple approach is maybe
-   * to use locks or sqlite transactions.
-   */
-  InitializeParams(dpp, &params);
-  params.op.obj.state.mtime = (real_clock::now() - make_timespan(min_wait));
-  ret = ProcessOp(dpp, "DeleteStaleObjectData", &params);
-  if (ret) {
-    ldpp_dout(dpp, 0) << "In DeleteStaleObjectData failed err:(" <<ret<<")" << dendl;
-  }
-
-  return ret;
-}
-
-void *DB::GC::entry() {
-  do {
-    std::unique_lock<std::mutex> lk(mtx);
-
-    ldpp_dout(dpp, 2) << " DB GC started " << dendl;
-    int max = 100;
-    RGWUserBuckets buckets;
-    bool is_truncated = false;
-
-    do {
-      std::string& marker = bucket_marker;
-      rgw_user user;
-      user.id = user_marker;
-      buckets.clear();
-      is_truncated = false;
-
-      int r = db->list_buckets(dpp, "all", user, marker, string(),
-                       max, false, &buckets, &is_truncated);
-      if (r < 0) { //do nothing? retry later ?
-        break;
-      }
-
-      for (const auto& ent : buckets.get_buckets()) {
-        const std::string &bname = ent.first;
-
-        r = db->delete_stale_objs(dpp, bname, gc_obj_min_wait);
-
-        if (r < 0) { //do nothing? skip to next entry?
-         ldpp_dout(dpp, 2) << " delete_stale_objs failed for bucket( " << bname <<")" << dendl;
-        }
-        bucket_marker = bname;
-        user_marker = user.id;
-
-        /* XXX: If using locks, unlock here and reacquire in the next iteration */
-        cv.wait_for(lk, std::chrono::milliseconds(100));
-       if (stop_signalled) {
-         goto done;
-       }
-      }
-    } while(is_truncated);
-
-    bucket_marker.clear();
-    cv.wait_for(lk, std::chrono::milliseconds(gc_interval*10));
-  } while(! stop_signalled);
-
-done:
-  return nullptr;
-}
-
-} } // namespace rgw::store
-
diff --git a/src/rgw/store/dbstore/common/dbstore.h b/src/rgw/store/dbstore/common/dbstore.h
deleted file mode 100644 (file)
index 12ab3f0..0000000
+++ /dev/null
@@ -1,2024 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef DB_STORE_H
-#define DB_STORE_H
-
-#include <errno.h>
-#include <stdlib.h>
-#include <string>
-#include <stdio.h>
-#include <iostream>
-#include <mutex>
-#include <condition_variable>
-// this seems safe to use, at least for now--arguably, we should
-// prefer header-only fmt, in general
-#undef FMT_HEADER_ONLY
-#define FMT_HEADER_ONLY 1
-#include "fmt/format.h"
-#include <map>
-#include "rgw_sal_store.h"
-#include "rgw_common.h"
-#include "rgw_bucket.h"
-#include "global/global_context.h"
-#include "global/global_init.h"
-#include "common/ceph_context.h"
-#include "rgw_obj_manifest.h"
-#include "rgw_multi.h"
-
-namespace rgw { namespace store {
-
-class DB;
-
-struct DBOpUserInfo {
-  RGWUserInfo uinfo = {};
-  obj_version user_version;
-  rgw::sal::Attrs user_attrs;
-};
-
-struct DBOpBucketInfo {
-  RGWBucketEnt ent; // maybe not needed. not used in create/get_bucket
-  RGWBucketInfo info;
-  RGWUser* owner = nullptr;
-  rgw::sal::Attrs bucket_attrs;
-  obj_version bucket_version;
-  ceph::real_time mtime;
-  // used for list query
-  std::string min_marker;
-  std::string max_marker;
-  std::list<RGWBucketEnt> list_entries;
-};
-
-struct DBOpObjectInfo {
-  RGWAccessControlPolicy acls;
-  RGWObjState state = {};
-
-  /* Below are taken from rgw_bucket_dir_entry */
-  RGWObjCategory category;
-  std::string etag;
-  std::string owner;
-  std::string owner_display_name;
-  std::string content_type;
-  std::string storage_class;
-  bool appendable;
-  uint64_t index_ver;
-  std::string tag;
-  uint16_t flags;
-  uint64_t versioned_epoch;
-
-  /* from state.manifest (RGWObjManifest) */
-  std::map<uint64_t, RGWObjManifestPart> objs;
-  uint64_t head_size{0};
-  rgw_placement_rule head_placement_rule;
-  uint64_t max_head_size{0};
-  std::string obj_id;
-  rgw_bucket_placement tail_placement; /* might be different than the original bucket,
-                                          as object might have been copied across pools */
-  std::map<uint64_t, RGWObjManifestRule> rules;
-  std::string tail_instance; /* tail object's instance */
-
-
-  /* Obj's omap <key,value> store */
-  std::map<std::string, bufferlist> omap;
-
-  /* Extra fields */
-  bool is_multipart;
-  std::list<RGWUploadPartInfo> mp_parts;
-
-  bufferlist head_data;
-  std::string min_marker;
-  std::string max_marker;
-  std::string prefix;
-  std::list<rgw_bucket_dir_entry> list_entries;
-  /* XXX: Maybe use std::vector instead of std::list */
-
-  /* for versioned objects */
-  bool is_versioned;
-  uint64_t version_num = 0;
-};
-
-struct DBOpObjectDataInfo {
-  RGWObjState state;
-  uint64_t part_num;
-  std::string multipart_part_str;
-  uint64_t offset;
-  uint64_t size;
-  bufferlist data{};
-};
-
-struct DBOpLCHeadInfo {
-  std::string index;
-  rgw::sal::StoreLifecycle::StoreLCHead head;
-};
-
-struct DBOpLCEntryInfo {
-  std::string index;
-  rgw::sal::StoreLifecycle::StoreLCEntry entry;
-  // used for list query
-  std::string min_marker;
-  std::list<rgw::sal::StoreLifecycle::StoreLCEntry> list_entries;
-};
-
-struct DBOpInfo {
-  std::string name; // Op name
-  /* Support only single access_key for now. So store
-   * it separately as primary access_key_id & secret to
-   * be able to query easily.
-   *
-   * XXX: Swift keys and subuser not supported for now */
-  DBOpUserInfo user;
-  std::string query_str;
-  DBOpBucketInfo bucket;
-  DBOpObjectInfo obj;
-  DBOpObjectDataInfo obj_data;
-  DBOpLCHeadInfo lc_head;
-  DBOpLCEntryInfo lc_entry;
-  uint64_t list_max_count;
-};
-
-struct DBOpParams {
-  CephContext *cct;
-
-  /* Tables */
-  std::string user_table;
-  std::string bucket_table;
-  std::string object_table;
-
-  /* Ops*/
-  DBOpInfo op;
-
-  std::string objectdata_table;
-  std::string object_trigger;
-  std::string object_view;
-  std::string quota_table;
-  std::string lc_head_table;
-  std::string lc_entry_table;
-  std::string obj;
-};
-
-/* Used for prepared schemas.
- * Difference with above structure is that all 
- * the fields are strings here to accommodate any
- * style identifiers used by backend db. By default
- * initialized with sqlitedb style, can be overriden
- * using InitPrepareParams()
- *
- * These identifiers are used in prepare and bind statements
- * to get the right index of each param.
- */
-struct DBOpUserPrepareInfo {
-  static constexpr const char* user_id = ":user_id";
-  static constexpr const char* tenant = ":tenant";
-  static constexpr const char* ns = ":ns";
-  static constexpr const char* display_name = ":display_name";
-  static constexpr const char* user_email = ":user_email";
-  /* Support only single access_key for now. So store
-   * it separately as primary access_key_id & secret to
-   * be able to query easily.
-   *
-   * In future, when need to support & query from multiple
-   * access keys, better to maintain them in a separate table.
-   */
-  static constexpr const char* access_keys_id = ":access_keys_id";
-  static constexpr const char* access_keys_secret = ":access_keys_secret";
-  static constexpr const char* access_keys = ":access_keys";
-  static constexpr const char* swift_keys = ":swift_keys";
-  static constexpr const char* subusers = ":subusers";
-  static constexpr const char* suspended = ":suspended";
-  static constexpr const char* max_buckets = ":max_buckets";
-  static constexpr const char* op_mask = ":op_mask";
-  static constexpr const char* user_caps = ":user_caps";
-  static constexpr const char* admin = ":admin";
-  static constexpr const char* system = ":system";
-  static constexpr const char* placement_name = ":placement_name";
-  static constexpr const char* placement_storage_class = ":placement_storage_class";
-  static constexpr const char* placement_tags = ":placement_tags";
-  static constexpr const char* bucket_quota = ":bucket_quota";
-  static constexpr const char* temp_url_keys = ":temp_url_keys";
-  static constexpr const char* user_quota = ":user_quota";
-  static constexpr const char* type = ":type";
-  static constexpr const char* mfa_ids = ":mfa_ids";
-  static constexpr const char* assumed_role_arn = ":assumed_role_arn";
-  static constexpr const char* user_attrs = ":user_attrs";
-  static constexpr const char* user_ver = ":user_vers";
-  static constexpr const char* user_ver_tag = ":user_ver_tag";
-};
-
-struct DBOpBucketPrepareInfo {
-  static constexpr const char* bucket_name = ":bucket_name";
-  static constexpr const char* tenant = ":tenant";
-  static constexpr const char* marker = ":marker";
-  static constexpr const char* bucket_id = ":bucket_id";
-  static constexpr const char* size = ":size";
-  static constexpr const char* size_rounded = ":size_rounded";
-  static constexpr const char* creation_time = ":creation_time";
-  static constexpr const char* count = ":count";
-  static constexpr const char* placement_name = ":placement_name";
-  static constexpr const char* placement_storage_class = ":placement_storage_class";
-  /* ownerid - maps to DBOpUserPrepareInfo */
-  static constexpr const char* flags = ":flags";
-  static constexpr const char* zonegroup = ":zonegroup";
-  static constexpr const char* has_instance_obj = ":has_instance_obj";
-  static constexpr const char* quota = ":quota";
-  static constexpr const char* requester_pays = ":requester_pays";
-  static constexpr const char* has_website = ":has_website";
-  static constexpr const char* website_conf = ":website_conf";
-  static constexpr const char* swift_versioning = ":swift_versioning";
-  static constexpr const char* swift_ver_location = ":swift_ver_location";
-  static constexpr const char* mdsearch_config = ":mdsearch_config";
-  static constexpr const char* new_bucket_instance_id = ":new_bucket_instance_id";
-  static constexpr const char* obj_lock = ":obj_lock";
-  static constexpr const char* sync_policy_info_groups = ":sync_policy_info_groups";
-  static constexpr const char* bucket_attrs = ":bucket_attrs";
-  static constexpr const char* bucket_ver = ":bucket_vers";
-  static constexpr const char* bucket_ver_tag = ":bucket_ver_tag";
-  static constexpr const char* mtime = ":mtime";
-  static constexpr const char* min_marker = ":min_marker";
-  static constexpr const char* max_marker = ":max_marker";
-};
-
-struct DBOpObjectPrepareInfo {
-  static constexpr const char* obj_name = ":obj_name";
-  static constexpr const char* obj_instance = ":obj_instance";
-  static constexpr const char* obj_ns  = ":obj_ns";
-  static constexpr const char* acls = ":acls";
-  static constexpr const char* index_ver = ":index_ver";
-  static constexpr const char* tag = ":tag";
-  static constexpr const char* flags = ":flags";
-  static constexpr const char* versioned_epoch = ":versioned_epoch";
-  static constexpr const char* obj_category = ":obj_category";
-  static constexpr const char* etag = ":etag";
-  static constexpr const char* owner = ":owner";
-  static constexpr const char* owner_display_name = ":owner_display_name";
-  static constexpr const char* storage_class = ":storage_class";
-  static constexpr const char* appendable = ":appendable";
-  static constexpr const char* content_type = ":content_type";
-  static constexpr const char* index_hash_source = ":index_hash_source";
-  static constexpr const char* obj_size = ":obj_size";
-  static constexpr const char* accounted_size = ":accounted_size";
-  static constexpr const char* mtime = ":mtime";
-  static constexpr const char* epoch = ":epoch";
-  static constexpr const char* obj_tag = ":obj_tag";
-  static constexpr const char* tail_tag = ":tail_tag";
-  static constexpr const char* write_tag = ":write_tag";
-  static constexpr const char* fake_tag = ":fake_tag";
-  static constexpr const char* shadow_obj = ":shadow_obj";
-  static constexpr const char* has_data = ":has_data";
-  static constexpr const char* is_versioned = ":is_versioned";
-  static constexpr const char* version_num = ":version_num";
-  static constexpr const char* pg_ver = ":pg_ver";
-  static constexpr const char* zone_short_id = ":zone_short_id";
-  static constexpr const char* obj_version = ":obj_version";
-  static constexpr const char* obj_version_tag = ":obj_version_tag";
-  static constexpr const char* obj_attrs = ":obj_attrs";
-  static constexpr const char* head_size = ":head_size";
-  static constexpr const char* max_head_size = ":max_head_size";
-  static constexpr const char* obj_id = ":obj_id";
-  static constexpr const char* tail_instance = ":tail_instance";
-  static constexpr const char* head_placement_rule_name = ":head_placement_rule_name";
-  static constexpr const char* head_placement_storage_class  = ":head_placement_storage_class";
-  static constexpr const char* tail_placement_rule_name = ":tail_placement_rule_name";
-  static constexpr const char* tail_placement_storage_class  = ":tail_placement_storage_class";
-  static constexpr const char* manifest_part_objs = ":manifest_part_objs";
-  static constexpr const char* manifest_part_rules = ":manifest_part_rules";
-  static constexpr const char* omap = ":omap";
-  static constexpr const char* is_multipart = ":is_multipart";
-  static constexpr const char* mp_parts = ":mp_parts";
-  static constexpr const char* head_data = ":head_data";
-  static constexpr const char* min_marker = ":min_marker";
-  static constexpr const char* max_marker = ":max_marker";
-  static constexpr const char* prefix = ":prefix";
-  /* Below used to update mp_parts obj name
-   * from meta object to src object on completion */
-  static constexpr const char* new_obj_name = ":new_obj_name";
-  static constexpr const char* new_obj_instance = ":new_obj_instance";
-  static constexpr const char* new_obj_ns  = ":new_obj_ns";
-};
-
-struct DBOpObjectDataPrepareInfo {
-  static constexpr const char* part_num = ":part_num";
-  static constexpr const char* offset = ":offset";
-  static constexpr const char* data = ":data";
-  static constexpr const char* size = ":size";
-  static constexpr const char* multipart_part_str = ":multipart_part_str";
-};
-
-struct DBOpLCEntryPrepareInfo {
-  static constexpr const char* index = ":index";
-  static constexpr const char* bucket_name = ":bucket_name";
-  static constexpr const char* start_time = ":start_time";
-  static constexpr const char* status = ":status";
-  static constexpr const char* min_marker = ":min_marker";
-};
-
-struct DBOpLCHeadPrepareInfo {
-  static constexpr const char* index = ":index";
-  static constexpr const char* start_date = ":start_date";
-  static constexpr const char* marker = ":marker";
-};
-
-struct DBOpPrepareInfo {
-  DBOpUserPrepareInfo user;
-  std::string_view query_str; // view into DBOpInfo::query_str
-  DBOpBucketPrepareInfo bucket;
-  DBOpObjectPrepareInfo obj;
-  DBOpObjectDataPrepareInfo obj_data;
-  DBOpLCHeadPrepareInfo lc_head;
-  DBOpLCEntryPrepareInfo lc_entry;
-  static constexpr const char* list_max_count = ":list_max_count";
-};
-
-struct DBOpPrepareParams {
-  /* Tables */
-  std::string user_table;
-  std::string bucket_table;
-  std::string object_table;
-
-  /* Ops */
-  DBOpPrepareInfo op;
-
-
-  std::string objectdata_table;
-  std::string object_trigger;
-  std::string object_view;
-  std::string quota_table;
-  std::string lc_head_table;
-  std::string lc_entry_table;
-};
-
-struct DBOps {
-  std::shared_ptr<class InsertUserOp> InsertUser;
-  std::shared_ptr<class RemoveUserOp> RemoveUser;
-  std::shared_ptr<class GetUserOp> GetUser;
-  std::shared_ptr<class InsertBucketOp> InsertBucket;
-  std::shared_ptr<class UpdateBucketOp> UpdateBucket;
-  std::shared_ptr<class RemoveBucketOp> RemoveBucket;
-  std::shared_ptr<class GetBucketOp> GetBucket;
-  std::shared_ptr<class ListUserBucketsOp> ListUserBuckets;
-  std::shared_ptr<class InsertLCEntryOp> InsertLCEntry;
-  std::shared_ptr<class RemoveLCEntryOp> RemoveLCEntry;
-  std::shared_ptr<class GetLCEntryOp> GetLCEntry;
-  std::shared_ptr<class ListLCEntriesOp> ListLCEntries;
-  std::shared_ptr<class  InsertLCHeadOp> InsertLCHead;
-  std::shared_ptr<class RemoveLCHeadOp> RemoveLCHead;
-  std::shared_ptr<class GetLCHeadOp> GetLCHead;
-};
-
-class ObjectOp {
-  public:
-    ObjectOp() {};
-
-    virtual ~ObjectOp() {}
-
-    std::shared_ptr<class PutObjectOp> PutObject;
-    std::shared_ptr<class DeleteObjectOp> DeleteObject;
-    std::shared_ptr<class GetObjectOp> GetObject;
-    std::shared_ptr<class UpdateObjectOp> UpdateObject;
-    std::shared_ptr<class ListBucketObjectsOp> ListBucketObjects;
-    std::shared_ptr<class ListVersionedObjectsOp> ListVersionedObjects;
-    std::shared_ptr<class PutObjectDataOp> PutObjectData;
-    std::shared_ptr<class UpdateObjectDataOp> UpdateObjectData;
-    std::shared_ptr<class GetObjectDataOp> GetObjectData;
-    std::shared_ptr<class DeleteObjectDataOp> DeleteObjectData;
-    std::shared_ptr<class DeleteStaleObjectDataOp> DeleteStaleObjectData;
-
-    virtual int InitializeObjectOps(std::string db_name, const DoutPrefixProvider *dpp) { return 0; }
-};
-
-class DBOp {
-  private:
-    static constexpr std::string_view CreateUserTableQ =
-      /* Corresponds to rgw::sal::User
-       *
-       * For now only UserID is made Primary key.
-       * If multiple tenants are stored in single .db handle, should
-       * make both (UserID, Tenant) as Primary Key.
-       *
-       * XXX:
-       * - AccessKeys, SwiftKeys, Subusers (map<>) are stored as blob.
-       *   To enable easy query, first accesskey is stored in separate fields
-       *   AccessKeysID, AccessKeysSecret.
-       *   In future, may be have separate table to store these keys and
-       *   query on that table.
-       * - Quota stored as blob .. should be linked to quota table.
-       */
-      "CREATE TABLE IF NOT EXISTS '{}' (       \
-      UserID TEXT NOT NULL UNIQUE,             \
-      Tenant TEXT ,            \
-      NS TEXT ,                \
-      DisplayName TEXT , \
-      UserEmail TEXT , \
-      AccessKeysID TEXT ,      \
-      AccessKeysSecret TEXT ,  \
-      AccessKeys BLOB ,        \
-      SwiftKeys BLOB , \
-      SubUsers BLOB ,          \
-      Suspended INTEGER ,      \
-      MaxBuckets INTEGER ,     \
-      OpMask   INTEGER ,       \
-      UserCaps BLOB ,          \
-      Admin    INTEGER ,       \
-      System INTEGER ,         \
-      PlacementName TEXT ,     \
-      PlacementStorageClass TEXT ,     \
-      PlacementTags BLOB ,     \
-      BucketQuota BLOB ,       \
-      TempURLKeys BLOB ,       \
-      UserQuota BLOB , \
-      TYPE INTEGER ,           \
-      MfaIDs BLOB ,    \
-      AssumedRoleARN TEXT , \
-      UserAttrs   BLOB,   \
-      UserVersion   INTEGER,    \
-      UserVersionTag TEXT,      \
-      PRIMARY KEY (UserID) \n);";
-
-    static constexpr std::string_view CreateBucketTableQ =
-      /* Corresponds to rgw::sal::Bucket
-       *  
-       *  For now only BucketName is made Primary key. Since buckets should
-       *  be unique across users in rgw, OwnerID is not made part of primary key.
-       *  However it is still referenced as foreign key
-       *
-       *  If multiple tenants are stored in single .db handle, should
-       *  make both (BucketName, Tenant) as Primary Key. Also should
-       *  reference (UserID, Tenant) as Foreign key.
-       *
-       * leaving below RADOS specific fields
-       *   - rgw_data_placement_target explicit_placement (struct rgw_bucket)
-       *   - rgw::BucketLayout layout (struct RGWBucketInfo)
-       *   - const static uint32_t NUM_SHARDS_BLIND_BUCKET (struct RGWBucketInfo),
-       *     should be '0' indicating no sharding.
-       *   - cls_rgw_reshard_status reshard_status (struct RGWBucketInfo)
-       *
-       * XXX:
-       *   - Quota stored as blob .. should be linked to quota table.
-       *   - WebsiteConf stored as BLOB..if required, should be split
-       *   - Storing bucket_version (struct RGWBucket), objv_tracker
-       *     (struct RGWBucketInfo) separately. Are they same?
-       *
-       */
-      "CREATE TABLE IF NOT EXISTS '{}' ( \
-      BucketName TEXT NOT NULL UNIQUE , \
-      Tenant TEXT,        \
-      Marker TEXT,        \
-      BucketID TEXT,      \
-      Size   INTEGER,     \
-      SizeRounded INTEGER,\
-      CreationTime BLOB,  \
-      Count  INTEGER,     \
-      PlacementName TEXT ,     \
-      PlacementStorageClass TEXT ,     \
-      OwnerID TEXT NOT NULL, \
-      Flags   INTEGER,       \
-      Zonegroup TEXT,         \
-      HasInstanceObj BOOLEAN, \
-      Quota   BLOB,       \
-      RequesterPays BOOLEAN,  \
-      HasWebsite  BOOLEAN,    \
-      WebsiteConf BLOB,   \
-      SwiftVersioning BOOLEAN, \
-      SwiftVerLocation TEXT,  \
-      MdsearchConfig  BLOB,   \
-      NewBucketInstanceID TEXT,\
-      ObjectLock BLOB, \
-      SyncPolicyInfoGroups BLOB, \
-      BucketAttrs   BLOB,   \
-      BucketVersion   INTEGER,    \
-      BucketVersionTag TEXT,      \
-      Mtime   BLOB,   \
-      PRIMARY KEY (BucketName) \
-      FOREIGN KEY (OwnerID) \
-      REFERENCES '{}' (UserID) ON DELETE CASCADE ON UPDATE CASCADE \n);";
-
-    static constexpr std::string_view CreateObjectTableTriggerQ =
-      "CREATE TRIGGER IF NOT EXISTS '{}' \
-          AFTER INSERT ON '{}' \
-       BEGIN \
-          UPDATE '{}' \
-          SET VersionNum = (SELECT COALESCE(max(VersionNum), 0) from '{}' where ObjName = new.ObjName) + 1 \
-          where ObjName = new.ObjName and ObjInstance = new.ObjInstance; \
-       END;";
-
-    static constexpr std::string_view CreateObjectTableQ =
-      /* Corresponds to rgw::sal::Object
-       *
-       *  For now only BucketName, ObjName is made Primary key.
-       *  If multiple tenants are stored in single .db handle, should
-       *  include Tenant too in the Primary Key. Also should
-       *  reference (BucketID, Tenant) as Foreign key.
-       * 
-       * referring to 
-       * - rgw_bucket_dir_entry - following are added for now
-       *   flags,
-       *   versioned_epoch
-       *   tag
-       *   index_ver
-       *   meta.category
-       *   meta.etag
-       *   meta.storageclass
-       *   meta.appendable
-       *   meta.content_type
-       *   meta.owner
-       *   meta.owner_display_name
-       *
-       * - RGWObjState. Below are omitted from that struct
-       *    as they seem in-memory variables
-       *    * is_atomic, has_atts, exists, prefetch_data, keep_tail, 
-       * - RGWObjManifest
-       *
-       * Extra field added "IsMultipart" to flag multipart uploads,
-       * HeadData to store first chunk data.
-       */
-      "CREATE TABLE IF NOT EXISTS '{}' ( \
-      ObjName TEXT NOT NULL , \
-      ObjInstance TEXT, \
-      ObjNS TEXT, \
-      BucketName TEXT NOT NULL , \
-      ACLs    BLOB,   \
-      IndexVer    INTEGER,    \
-      Tag TEXT,   \
-      Flags INTEGER, \
-      VersionedEpoch INTEGER, \
-      ObjCategory INTEGER,    \
-      Etag   TEXT,    \
-      Owner TEXT, \
-      OwnerDisplayName TEXT,  \
-      StorageClass    TEXT,   \
-      Appendable  BOOL,   \
-      ContentType TEXT,   \
-      IndexHashSource TEXT, \
-      ObjSize  INTEGER,   \
-      AccountedSize INTEGER,  \
-      Mtime   BLOB,   \
-      Epoch  INTEGER, \
-      ObjTag  BLOB,   \
-      TailTag BLOB,   \
-      WriteTag    TEXT,   \
-      FakeTag BOOL,   \
-      ShadowObj   TEXT,   \
-      HasData  BOOL,  \
-      IsVersioned BOOL,  \
-      VersionNum  INTEGER, \
-      PGVer   INTEGER, \
-      ZoneShortID  INTEGER,  \
-      ObjVersion   INTEGER,    \
-      ObjVersionTag TEXT,      \
-      ObjAttrs    BLOB,   \
-      HeadSize    INTEGER,    \
-      MaxHeadSize    INTEGER,    \
-      ObjID      TEXT NOT NULL, \
-      TailInstance  TEXT, \
-      HeadPlacementRuleName   TEXT, \
-      HeadPlacementRuleStorageClass TEXT, \
-      TailPlacementRuleName   TEXT, \
-      TailPlacementStorageClass TEXT, \
-      ManifestPartObjs    BLOB,   \
-      ManifestPartRules   BLOB,   \
-      Omap    BLOB,   \
-      IsMultipart     BOOL,   \
-      MPPartsList    BLOB,   \
-      HeadData  BLOB,   \
-      PRIMARY KEY (ObjName, ObjInstance, BucketName), \
-      FOREIGN KEY (BucketName) \
-      REFERENCES '{}' (BucketName) ON DELETE CASCADE ON UPDATE CASCADE \n);";
-
-    static constexpr std::string_view CreateObjectDataTableQ =
-      /* Extra field 'MultipartPartStr' added which signifies multipart
-       * <uploadid + partnum>. For regular object, it is '0.0'
-       *
-       *  - part: a collection of stripes that make a contiguous part of an
-       object. A regular object will only have one part (although might have
-       many stripes), a multipart object might have many parts. Each part
-       has a fixed stripe size (ObjChunkSize), although the last stripe of a
-       part might be smaller than that.
-       */
-      "CREATE TABLE IF NOT EXISTS '{}' ( \
-      ObjName TEXT NOT NULL , \
-      ObjInstance TEXT, \
-      ObjNS TEXT, \
-      BucketName TEXT NOT NULL , \
-      ObjID      TEXT NOT NULL , \
-      MultipartPartStr TEXT, \
-      PartNum  INTEGER NOT NULL, \
-      Offset   INTEGER, \
-      Size      INTEGER, \
-      Mtime  BLOB,       \
-      Data     BLOB,             \
-      PRIMARY KEY (ObjName, BucketName, ObjInstance, ObjID, MultipartPartStr, PartNum), \
-      FOREIGN KEY (BucketName) \
-      REFERENCES '{}' (BucketName) ON DELETE CASCADE ON UPDATE CASCADE \n);";
-
-    static constexpr std::string_view CreateObjectViewQ =
-      /* This query creats temporary view with entries from ObjectData table which have
-       * corresponding head object (i.e, with same ObjName, ObjInstance, ObjNS, ObjID)
-       * in the Object table.
-       *
-       * GC thread can use this view to delete stale entries from the ObjectData table which
-       * do not exist in this view.
-       *
-       * XXX: This view is throwing ForeignKey mismatch error, mostly may be because all the keys
-       * of objectdata table are not referenced here. So this view is not used atm.
-       */
-      "CREATE TEMP VIEW IF NOT EXISTS '{}' AS \
-      SELECT s.ObjName, s.ObjInstance, s.ObjID from '{}' as s INNER JOIN '{}' USING \
-      (ObjName, BucketName, ObjInstance, ObjID);";
-
-
-    static constexpr std::string_view CreateQuotaTableQ =
-      "CREATE TABLE IF NOT EXISTS '{}' ( \
-      QuotaID INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE , \
-      MaxSizeSoftThreshold INTEGER ,   \
-      MaxObjsSoftThreshold INTEGER ,   \
-      MaxSize  INTEGER ,               \
-      MaxObjects INTEGER ,             \
-      Enabled Boolean ,                \
-      CheckOnRaw Boolean \n);";
-
-    static constexpr std::string_view CreateLCEntryTableQ =
-      "CREATE TABLE IF NOT EXISTS '{}' ( \
-      LCIndex  TEXT NOT NULL , \
-      BucketName TEXT NOT NULL , \
-      StartTime  INTEGER , \
-      Status     INTEGER , \
-      PRIMARY KEY (LCIndex, BucketName) \n);";
-
-    static constexpr std::string_view CreateLCHeadTableQ =
-      "CREATE TABLE IF NOT EXISTS '{}' ( \
-      LCIndex  TEXT NOT NULL , \
-      Marker TEXT , \
-      StartDate  INTEGER , \
-      PRIMARY KEY (LCIndex) \n);";
-
-    static constexpr std::string_view DropQ = "DROP TABLE IF EXISTS '{}'";
-    static constexpr std::string_view ListAllQ = "SELECT  * from '{}'";
-
-  public:
-    DBOp() {}
-    virtual ~DBOp() {}
-    std::mutex mtx; // to protect prepared stmt
-
-    static std::string CreateTableSchema(std::string_view type,
-                                         const DBOpParams *params) {
-      if (!type.compare("User"))
-        return fmt::format(CreateUserTableQ,
-            params->user_table);
-      if (!type.compare("Bucket"))
-        return fmt::format(CreateBucketTableQ,
-            params->bucket_table,
-            params->user_table);
-      if (!type.compare("Object"))
-        return fmt::format(CreateObjectTableQ,
-            params->object_table,
-            params->bucket_table);
-      if (!type.compare("ObjectTrigger"))
-        return fmt::format(CreateObjectTableTriggerQ,
-            params->object_trigger,
-            params->object_table,
-            params->object_table,
-            params->object_table);
-      if (!type.compare("ObjectData"))
-        return fmt::format(CreateObjectDataTableQ,
-            params->objectdata_table,
-            params->bucket_table);
-      if (!type.compare("ObjectView"))
-        return fmt::format(CreateObjectTableQ,
-            params->object_view,
-            params->objectdata_table,
-            params->object_table);
-      if (!type.compare("Quota"))
-        return fmt::format(CreateQuotaTableQ,
-            params->quota_table);
-      if (!type.compare("LCHead"))
-        return fmt::format(CreateLCHeadTableQ,
-            params->lc_head_table);
-      if (!type.compare("LCEntry"))
-        return fmt::format(CreateLCEntryTableQ,
-            params->lc_entry_table,
-            params->bucket_table);
-
-      ceph_abort_msgf("incorrect table type %.*s", type.size(), type.data());
-    }
-
-    static std::string DeleteTableSchema(std::string_view table) {
-      return fmt::format(DropQ, table);
-    }
-    static std::string ListTableSchema(std::string_view table) {
-      return fmt::format(ListAllQ, table);
-    }
-
-    virtual int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; }
-    virtual int Bind(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; }
-    virtual int Execute(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; }
-};
-
-class InsertUserOp : virtual public DBOp {
-  private:
-    /* For existing entires, -
-     * (1) INSERT or REPLACE - it will delete previous entry and then
-     * inserts new one. Since it deletes previos enties, it will
-     * trigger all foriegn key cascade deletes or other triggers.
-     * (2) INSERT or UPDATE - this will set NULL values to unassigned
-     * fields.
-     * more info: https://code-examples.net/en/q/377728
-     *
-     * For now using INSERT or REPLACE. If required of updating existing
-     * record, will use another query.
-     */
-    static constexpr std::string_view Query = "INSERT OR REPLACE INTO '{}'     \
-                          (UserID, Tenant, NS, DisplayName, UserEmail, \
-                           AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
-                           SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
-                           System, PlacementName, PlacementStorageClass, PlacementTags, \
-                           BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
-                           UserAttrs, UserVersion, UserVersionTag) \
-                          VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \
-                              {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});";
-
-  public:
-    virtual ~InsertUserOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.user_table,
-          params.op.user.user_id, params.op.user.tenant, params.op.user.ns,
-          params.op.user.display_name, params.op.user.user_email,
-          params.op.user.access_keys_id, params.op.user.access_keys_secret,
-          params.op.user.access_keys, params.op.user.swift_keys,
-          params.op.user.subusers, params.op.user.suspended,
-          params.op.user.max_buckets, params.op.user.op_mask,
-          params.op.user.user_caps, params.op.user.admin, params.op.user.system,
-          params.op.user.placement_name, params.op.user.placement_storage_class,
-          params.op.user.placement_tags, params.op.user.bucket_quota,
-          params.op.user.temp_url_keys, params.op.user.user_quota,
-          params.op.user.type, params.op.user.mfa_ids,
-          params.op.user.assumed_role_arn, params.op.user.user_attrs,
-          params.op.user.user_ver, params.op.user.user_ver_tag);
-    }
-
-};
-
-class RemoveUserOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "DELETE from '{}' where UserID = {}";
-
-  public:
-    virtual ~RemoveUserOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.user_table,
-          params.op.user.user_id);
-    }
-};
-
-class GetUserOp: virtual public DBOp {
-  private:
-    /* If below query columns are updated, make sure to update the indexes
-     * in list_user() cbk in sqliteDB.cc */
-    static constexpr std::string_view Query = "SELECT \
-                          UserID, Tenant, NS, DisplayName, UserEmail, \
-                          AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
-                          SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
-                          System, PlacementName, PlacementStorageClass, PlacementTags, \
-                          BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
-                          UserAttrs, UserVersion, UserVersionTag from '{}' where UserID = {}";
-
-    static constexpr std::string_view QueryByEmail = "SELECT \
-                                 UserID, Tenant, NS, DisplayName, UserEmail, \
-                                 AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
-                                 SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
-                                 System, PlacementName, PlacementStorageClass, PlacementTags, \
-                                 BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
-                                 UserAttrs, UserVersion, UserVersionTag from '{}' where UserEmail = {}";
-
-    static constexpr std::string_view QueryByAccessKeys = "SELECT \
-                                      UserID, Tenant, NS, DisplayName, UserEmail, \
-                                      AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
-                                      SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
-                                      System, PlacementName, PlacementStorageClass, PlacementTags, \
-                                      BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
-                                      UserAttrs, UserVersion, UserVersionTag from '{}' where AccessKeysID = {}";
-
-    static constexpr std::string_view QueryByUserID = "SELECT \
-                                  UserID, Tenant, NS, DisplayName, UserEmail, \
-                                  AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
-                                  SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
-                                  System, PlacementName, PlacementStorageClass, PlacementTags, \
-                                  BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
-                                  UserAttrs, UserVersion, UserVersionTag \
-                                  from '{}' where UserID = {}";
-
-  public:
-    virtual ~GetUserOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      if (params.op.query_str == "email") {
-        return fmt::format(QueryByEmail, params.user_table,
-            params.op.user.user_email);
-      } else if (params.op.query_str == "access_key") {
-        return fmt::format(QueryByAccessKeys,
-            params.user_table,
-            params.op.user.access_keys_id);
-      } else if (params.op.query_str == "user_id") {
-        return fmt::format(QueryByUserID,
-            params.user_table,
-            params.op.user.user_id);
-      } else {
-        return fmt::format(Query, params.user_table,
-            params.op.user.user_id);
-      }
-    }
-};
-
-class InsertBucketOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "INSERT OR REPLACE INTO '{}' \
-      (BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
-       Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \
-       HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
-       SwiftVersioning, SwiftVerLocation, \
-       MdsearchConfig, NewBucketInstanceID, ObjectLock, \
-       SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime) \
-      VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, \
-          {}, {}, {}, {}, {}, {}, {}, {}, {}, \
-          {}, {}, {}, {}, {}, {}, {}, {}, {}, {})";
-
-  public:
-    virtual ~InsertBucketOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.bucket_table,
-          params.op.bucket.bucket_name, params.op.bucket.tenant,
-          params.op.bucket.marker, params.op.bucket.bucket_id,
-          params.op.bucket.size, params.op.bucket.size_rounded,
-          params.op.bucket.creation_time, params.op.bucket.count,
-          params.op.bucket.placement_name, params.op.bucket.placement_storage_class,
-          params.op.user.user_id,
-          params.op.bucket.flags, params.op.bucket.zonegroup, params.op.bucket.has_instance_obj,
-          params.op.bucket.quota, params.op.bucket.requester_pays, params.op.bucket.has_website,
-          params.op.bucket.website_conf, params.op.bucket.swift_versioning,
-          params.op.bucket.swift_ver_location, params.op.bucket.mdsearch_config,
-          params.op.bucket.new_bucket_instance_id, params.op.bucket.obj_lock,
-          params.op.bucket.sync_policy_info_groups, params.op.bucket.bucket_attrs,
-          params.op.bucket.bucket_ver, params.op.bucket.bucket_ver_tag,
-          params.op.bucket.mtime);
-    }
-};
-
-class UpdateBucketOp: virtual public DBOp {
-  private:
-    // Updates Info, Mtime, Version
-    static constexpr std::string_view InfoQuery =
-      "UPDATE '{}' SET Tenant = {}, Marker = {}, BucketID = {}, CreationTime = {}, \
-      Count = {}, PlacementName = {}, PlacementStorageClass = {}, OwnerID = {}, Flags = {}, \
-      Zonegroup = {}, HasInstanceObj = {}, Quota = {}, RequesterPays = {}, HasWebsite = {}, \
-      WebsiteConf = {}, SwiftVersioning = {}, SwiftVerLocation = {}, MdsearchConfig = {}, \
-      NewBucketInstanceID = {}, ObjectLock = {}, SyncPolicyInfoGroups = {}, \
-      BucketVersion = {}, Mtime = {} WHERE BucketName = {}";
-    // Updates Attrs, OwnerID, Mtime, Version
-    static constexpr std::string_view AttrsQuery =
-      "UPDATE '{}' SET OwnerID = {}, BucketAttrs = {}, Mtime = {}, BucketVersion = {} \
-      WHERE BucketName = {}";
-    // Updates OwnerID, CreationTime, Mtime, Version
-    static constexpr std::string_view OwnerQuery =
-      "UPDATE '{}' SET OwnerID = {}, CreationTime = {}, Mtime = {}, BucketVersion = {} WHERE BucketName = {}";
-
-  public:
-    virtual ~UpdateBucketOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      if (params.op.query_str == "info") {
-        return fmt::format(InfoQuery, params.bucket_table,
-            params.op.bucket.tenant, params.op.bucket.marker, params.op.bucket.bucket_id,
-            params.op.bucket.creation_time, params.op.bucket.count,
-            params.op.bucket.placement_name, params.op.bucket.placement_storage_class,
-            params.op.user.user_id,
-            params.op.bucket.flags, params.op.bucket.zonegroup, params.op.bucket.has_instance_obj,
-            params.op.bucket.quota, params.op.bucket.requester_pays, params.op.bucket.has_website,
-            params.op.bucket.website_conf, params.op.bucket.swift_versioning,
-            params.op.bucket.swift_ver_location, params.op.bucket.mdsearch_config,
-            params.op.bucket.new_bucket_instance_id, params.op.bucket.obj_lock,
-            params.op.bucket.sync_policy_info_groups,
-            params.op.bucket.bucket_ver, params.op.bucket.mtime,
-            params.op.bucket.bucket_name);
-      }
-      if (params.op.query_str == "attrs") {
-        return fmt::format(AttrsQuery, params.bucket_table,
-            params.op.user.user_id, params.op.bucket.bucket_attrs,
-            params.op.bucket.mtime,
-            params.op.bucket.bucket_ver, params.op.bucket.bucket_name);
-      }
-      if (params.op.query_str == "owner") {
-        return fmt::format(OwnerQuery, params.bucket_table,
-            params.op.user.user_id, params.op.bucket.creation_time,
-            params.op.bucket.mtime,
-            params.op.bucket.bucket_ver, params.op.bucket.bucket_name);
-      }
-      return "";
-    }
-};
-
-class RemoveBucketOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "DELETE from '{}' where BucketName = {}";
-
-  public:
-    virtual ~RemoveBucketOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.bucket_table,
-          params.op.bucket.bucket_name);
-    }
-};
-
-class GetBucketOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query = "SELECT  \
-                          BucketName, BucketTable.Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
-                          Count, BucketTable.PlacementName, BucketTable.PlacementStorageClass, OwnerID, Flags, Zonegroup, \
-                          HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
-                          SwiftVersioning, SwiftVerLocation, \
-                          MdsearchConfig, NewBucketInstanceID, ObjectLock, \
-                          SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime, NS \
-                          from '{}' as BucketTable INNER JOIN '{}' ON OwnerID = UserID where BucketName = {}";
-
-  public:
-    virtual ~GetBucketOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      //return fmt::format(Query, params.op.bucket.bucket_name,
-      //          params.bucket_table, params.user_table);
-      return fmt::format(Query,
-          params.bucket_table, params.user_table,
-          params.op.bucket.bucket_name);
-    }
-};
-
-class ListUserBucketsOp: virtual public DBOp {
-  private:
-    // once we have stats also stored, may have to update this query to join
-    // these two tables.
-    static constexpr std::string_view Query = "SELECT  \
-                          BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
-                          Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \
-                          HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
-                          SwiftVersioning, SwiftVerLocation, \
-                          MdsearchConfig, NewBucketInstanceID, ObjectLock, \
-                          SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime \
-                          FROM '{}' WHERE OwnerID = {} AND BucketName > {} ORDER BY BucketName ASC LIMIT {}";
-
-    /* BucketNames are unique across users. Hence userid/OwnerID is not used as
-     * marker or for ordering here in the below query 
-     */
-    static constexpr std::string_view AllQuery = "SELECT  \
-                          BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
-                          Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \
-                          HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
-                          SwiftVersioning, SwiftVerLocation, \
-                          MdsearchConfig, NewBucketInstanceID, ObjectLock, \
-                          SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime \
-                          FROM '{}' WHERE BucketName > {} ORDER BY BucketName ASC LIMIT {}";
-
-  public:
-    virtual ~ListUserBucketsOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      if (params.op.query_str == "all") {
-        return fmt::format(AllQuery, params.bucket_table,
-          params.op.bucket.min_marker,
-          params.op.list_max_count);
-      } else {
-        return fmt::format(Query, params.bucket_table,
-          params.op.user.user_id, params.op.bucket.min_marker,
-          params.op.list_max_count);
-      }
-    }
-};
-
-class PutObjectOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "INSERT OR REPLACE INTO '{}' \
-      (ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
-       Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
-       StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
-       AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
-       ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
-       ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
-       ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
-       TailPlacementRuleName, TailPlacementStorageClass, \
-       ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \
-       HeadData)     \
-      VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \
-          {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \
-          {}, {}, {}, \
-          {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})";
-
-  public:
-    virtual ~PutObjectOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query,
-          params.object_table, params.op.obj.obj_name,
-          params.op.obj.obj_instance, params.op.obj.obj_ns,
-          params.op.bucket.bucket_name, params.op.obj.acls, params.op.obj.index_ver,
-          params.op.obj.tag, params.op.obj.flags, params.op.obj.versioned_epoch,
-          params.op.obj.obj_category, params.op.obj.etag, params.op.obj.owner,
-          params.op.obj.owner_display_name, params.op.obj.storage_class,
-          params.op.obj.appendable, params.op.obj.content_type,
-          params.op.obj.index_hash_source, params.op.obj.obj_size,
-          params.op.obj.accounted_size, params.op.obj.mtime,
-          params.op.obj.epoch, params.op.obj.obj_tag, params.op.obj.tail_tag,
-          params.op.obj.write_tag, params.op.obj.fake_tag, params.op.obj.shadow_obj,
-          params.op.obj.has_data, params.op.obj.is_versioned,
-          params.op.obj.version_num,
-          params.op.obj.pg_ver, params.op.obj.zone_short_id,
-          params.op.obj.obj_version, params.op.obj.obj_version_tag,
-          params.op.obj.obj_attrs, params.op.obj.head_size,
-          params.op.obj.max_head_size, params.op.obj.obj_id,
-          params.op.obj.tail_instance,
-          params.op.obj.head_placement_rule_name,
-          params.op.obj.head_placement_storage_class,
-          params.op.obj.tail_placement_rule_name,
-          params.op.obj.tail_placement_storage_class,
-          params.op.obj.manifest_part_objs,
-          params.op.obj.manifest_part_rules, params.op.obj.omap,
-          params.op.obj.is_multipart, params.op.obj.mp_parts,
-          params.op.obj.head_data);
-    }
-};
-
-class DeleteObjectOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "DELETE from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {}";
-
-  public:
-    virtual ~DeleteObjectOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.object_table,
-          params.op.bucket.bucket_name,
-          params.op.obj.obj_name,
-          params.op.obj.obj_instance);
-    }
-};
-
-class GetObjectOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "SELECT  \
-      ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
-      Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
-      StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
-      AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
-      ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
-      ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
-      ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
-      TailPlacementRuleName, TailPlacementStorageClass, \
-      ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \
-      HeadData from '{}' \
-      where BucketName = {} and ObjName = {} and ObjInstance = {}";
-
-  public:
-    virtual ~GetObjectOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query,
-          params.object_table,
-          params.op.bucket.bucket_name,
-          params.op.obj.obj_name,
-          params.op.obj.obj_instance);
-    }
-};
-
-class ListBucketObjectsOp: virtual public DBOp {
-  private:
-    // once we have stats also stored, may have to update this query to join
-    // these two tables.
-    static constexpr std::string_view Query =
-      "SELECT  \
-      ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
-      Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
-      StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
-      AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
-      ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
-      ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
-      ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
-      TailPlacementRuleName, TailPlacementStorageClass, \
-      ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, HeadData from '{}' \
-      where BucketName = {} and ObjName >= {} and ObjName LIKE {} ORDER BY ObjName ASC, VersionNum DESC LIMIT {}";
-  public:
-    virtual ~ListBucketObjectsOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      /* XXX: Include obj_id, delim */
-      return fmt::format(Query,
-          params.object_table,
-          params.op.bucket.bucket_name,
-          params.op.obj.min_marker,
-          params.op.obj.prefix,
-          params.op.list_max_count);
-    }
-};
-
-#define MAX_VERSIONED_OBJECTS 20
-class ListVersionedObjectsOp: virtual public DBOp {
-  private:
-    // once we have stats also stored, may have to update this query to join
-    // these two tables.
-    static constexpr std::string_view Query =
-      "SELECT  \
-      ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
-      Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
-      StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
-      AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
-      ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
-      ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
-      ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
-      TailPlacementRuleName, TailPlacementStorageClass, \
-      ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \
-      HeadData from '{}' \
-      where BucketName = {} and ObjName = {} ORDER BY VersionNum DESC LIMIT {}";
-  public:
-    virtual ~ListVersionedObjectsOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      /* XXX: Include obj_id, delim */
-      return fmt::format(Query,
-          params.object_table,
-          params.op.bucket.bucket_name,
-          params.op.obj.obj_name,
-          params.op.list_max_count);
-    }
-};
-
-class UpdateObjectOp: virtual public DBOp {
-  private:
-    // Updates Omap
-    static constexpr std::string_view OmapQuery =
-      "UPDATE '{}' SET Omap = {}, Mtime = {} \
-      where BucketName = {} and ObjName = {} and ObjInstance = {}";
-    static constexpr std::string_view AttrsQuery =
-      "UPDATE '{}' SET ObjAttrs = {}, Mtime = {}  \
-      where BucketName = {} and ObjName = {} and ObjInstance = {}";
-    static constexpr std::string_view MPQuery =
-      "UPDATE '{}' SET MPPartsList = {}, Mtime = {}  \
-      where BucketName = {} and ObjName = {} and ObjInstance = {}";
-    static constexpr std::string_view MetaQuery =
-      "UPDATE '{}' SET \
-       ObjNS = {}, ACLs = {}, IndexVer = {}, Tag = {}, Flags = {}, VersionedEpoch = {}, \
-       ObjCategory = {}, Etag = {}, Owner = {}, OwnerDisplayName = {}, \
-       StorageClass = {}, Appendable = {}, ContentType = {}, \
-       IndexHashSource = {}, ObjSize = {}, AccountedSize = {}, Mtime = {}, \
-       Epoch = {}, ObjTag = {}, TailTag = {}, WriteTag = {}, FakeTag = {}, \
-       ShadowObj = {}, HasData = {}, IsVersioned = {}, VersionNum = {}, PGVer = {}, \
-       ZoneShortID = {}, ObjVersion = {}, ObjVersionTag = {}, ObjAttrs = {}, \
-       HeadSize = {}, MaxHeadSize = {}, ObjID = {}, TailInstance = {}, \
-       HeadPlacementRuleName = {}, HeadPlacementRuleStorageClass = {}, \
-       TailPlacementRuleName = {}, TailPlacementStorageClass = {}, \
-       ManifestPartObjs = {}, ManifestPartRules = {}, Omap = {}, \
-       IsMultipart = {}, MPPartsList = {}, HeadData = {} \
-       WHERE ObjName = {} and ObjInstance = {} and BucketName = {}";
-
-  public:
-    virtual ~UpdateObjectOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      if (params.op.query_str == "omap") {
-        return fmt::format(OmapQuery,
-            params.object_table, params.op.obj.omap,
-            params.op.obj.mtime,
-            params.op.bucket.bucket_name,
-            params.op.obj.obj_name,
-            params.op.obj.obj_instance);
-      }
-      if (params.op.query_str == "attrs") {
-        return fmt::format(AttrsQuery,
-            params.object_table, params.op.obj.obj_attrs,
-            params.op.obj.mtime,
-            params.op.bucket.bucket_name,
-            params.op.obj.obj_name,
-            params.op.obj.obj_instance);
-      }
-      if (params.op.query_str == "mp") {
-        return fmt::format(MPQuery,
-            params.object_table, params.op.obj.mp_parts,
-            params.op.obj.mtime,
-            params.op.bucket.bucket_name,
-            params.op.obj.obj_name,
-            params.op.obj.obj_instance);
-      }
-      if (params.op.query_str == "meta") {
-        return fmt::format(MetaQuery,
-          params.object_table,
-          params.op.obj.obj_ns, params.op.obj.acls, params.op.obj.index_ver,
-          params.op.obj.tag, params.op.obj.flags, params.op.obj.versioned_epoch,
-          params.op.obj.obj_category, params.op.obj.etag, params.op.obj.owner,
-          params.op.obj.owner_display_name, params.op.obj.storage_class,
-          params.op.obj.appendable, params.op.obj.content_type,
-          params.op.obj.index_hash_source, params.op.obj.obj_size,
-          params.op.obj.accounted_size, params.op.obj.mtime,
-          params.op.obj.epoch, params.op.obj.obj_tag, params.op.obj.tail_tag,
-          params.op.obj.write_tag, params.op.obj.fake_tag, params.op.obj.shadow_obj,
-          params.op.obj.has_data, params.op.obj.is_versioned, params.op.obj.version_num,
-          params.op.obj.pg_ver, params.op.obj.zone_short_id,
-          params.op.obj.obj_version, params.op.obj.obj_version_tag,
-          params.op.obj.obj_attrs, params.op.obj.head_size,
-          params.op.obj.max_head_size, params.op.obj.obj_id,
-          params.op.obj.tail_instance,
-          params.op.obj.head_placement_rule_name,
-          params.op.obj.head_placement_storage_class,
-          params.op.obj.tail_placement_rule_name,
-          params.op.obj.tail_placement_storage_class,
-          params.op.obj.manifest_part_objs,
-          params.op.obj.manifest_part_rules, params.op.obj.omap,
-          params.op.obj.is_multipart, params.op.obj.mp_parts,
-          params.op.obj.head_data, 
-          params.op.obj.obj_name, params.op.obj.obj_instance,
-          params.op.bucket.bucket_name);
-      }
-      return "";
-    }
-};
-
-class PutObjectDataOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "INSERT OR REPLACE INTO '{}' \
-      (ObjName, ObjInstance, ObjNS, BucketName, ObjID, MultipartPartStr, PartNum, Offset, Size, Mtime, Data) \
-      VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})";
-
-  public:
-    virtual ~PutObjectDataOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query,
-          params.objectdata_table,
-          params.op.obj.obj_name, params.op.obj.obj_instance,
-          params.op.obj.obj_ns,
-          params.op.bucket.bucket_name,
-          params.op.obj.obj_id,
-          params.op.obj_data.multipart_part_str,
-          params.op.obj_data.part_num,
-          params.op.obj_data.offset,
-          params.op.obj_data.size,
-          params.op.obj.mtime,
-          params.op.obj_data.data);
-    }
-};
-
-/* XXX: Recheck if this is really needed */
-class UpdateObjectDataOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "UPDATE '{}' \
-      SET Mtime = {} WHERE ObjName = {} and ObjInstance = {} and \
-      BucketName = {} and ObjID = {}";
-
-  public:
-    virtual ~UpdateObjectDataOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query,
-          params.objectdata_table,
-          params.op.obj.mtime,
-          params.op.obj.obj_name, params.op.obj.obj_instance,
-          params.op.bucket.bucket_name,
-          params.op.obj.obj_id);
-    }
-};
-
-class GetObjectDataOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "SELECT  \
-      ObjName, ObjInstance, ObjNS, BucketName, ObjID, MultipartPartStr, PartNum, Offset, Size, Mtime, Data \
-      from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {} and ObjID = {} ORDER BY MultipartPartStr, PartNum";
-
-  public:
-    virtual ~GetObjectDataOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query,
-          params.objectdata_table,
-          params.op.bucket.bucket_name,
-          params.op.obj.obj_name,
-          params.op.obj.obj_instance,
-          params.op.obj.obj_id);
-    }
-};
-
-class DeleteObjectDataOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "DELETE from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {} and ObjID = {}";
-
-  public:
-    virtual ~DeleteObjectDataOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query,
-          params.objectdata_table,
-          params.op.bucket.bucket_name,
-          params.op.obj.obj_name,
-          params.op.obj.obj_instance,
-          params.op.obj.obj_id);
-    }
-};
-
-class DeleteStaleObjectDataOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "DELETE from '{}' WHERE (ObjName, ObjInstance, ObjID) NOT IN (SELECT s.ObjName, s.ObjInstance, s.ObjID from '{}' as s INNER JOIN '{}' USING (ObjName, BucketName, ObjInstance, ObjID)) and Mtime < {}";
-
-  public:
-    virtual ~DeleteStaleObjectDataOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query,
-          params.objectdata_table,
-          params.objectdata_table,
-          params.object_table,
-          params.op.obj.mtime);
-    }
-};
-
-class InsertLCEntryOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "INSERT OR REPLACE INTO '{}' \
-      (LCIndex, BucketName, StartTime, Status) \
-      VALUES ({}, {}, {}, {})";
-
-  public:
-    virtual ~InsertLCEntryOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.lc_entry_table,
-          params.op.lc_entry.index, params.op.lc_entry.bucket_name,
-          params.op.lc_entry.start_time, params.op.lc_entry.status);
-    }
-};
-
-class RemoveLCEntryOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "DELETE from '{}' where LCIndex = {} and BucketName = {}";
-
-  public:
-    virtual ~RemoveLCEntryOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.lc_entry_table,
-          params.op.lc_entry.index, params.op.lc_entry.bucket_name);
-    }
-};
-
-class GetLCEntryOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query = "SELECT  \
-                          LCIndex, BucketName, StartTime, Status \
-                          from '{}' where LCIndex = {} and BucketName = {}";
-    static constexpr std::string_view NextQuery = "SELECT  \
-                          LCIndex, BucketName, StartTime, Status \
-                          from '{}' where LCIndex = {} and BucketName > {} ORDER BY BucketName ASC";
-
-  public:
-    virtual ~GetLCEntryOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      if (params.op.query_str == "get_next_entry") {
-        return fmt::format(NextQuery, params.lc_entry_table,
-            params.op.lc_entry.index, params.op.lc_entry.bucket_name);
-      }
-      // default 
-      return fmt::format(Query, params.lc_entry_table,
-          params.op.lc_entry.index, params.op.lc_entry.bucket_name);
-    }
-};
-
-class ListLCEntriesOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query = "SELECT  \
-                          LCIndex, BucketName, StartTime, Status \
-                          FROM '{}' WHERE LCIndex = {} AND BucketName > {} ORDER BY BucketName ASC LIMIT {}";
-
-  public:
-    virtual ~ListLCEntriesOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.lc_entry_table,
-          params.op.lc_entry.index, params.op.lc_entry.min_marker,
-          params.op.list_max_count);
-    }
-};
-
-class InsertLCHeadOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "INSERT OR REPLACE INTO '{}' \
-      (LCIndex, Marker, StartDate) \
-      VALUES ({}, {}, {})";
-
-  public:
-    virtual ~InsertLCHeadOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.lc_head_table,
-          params.op.lc_head.index, params.op.lc_head.marker,
-          params.op.lc_head.start_date);
-    }
-};
-
-class RemoveLCHeadOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query =
-      "DELETE from '{}' where LCIndex = {}";
-
-  public:
-    virtual ~RemoveLCHeadOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.lc_head_table,
-          params.op.lc_head.index);
-    }
-};
-
-class GetLCHeadOp: virtual public DBOp {
-  private:
-    static constexpr std::string_view Query = "SELECT  \
-                          LCIndex, Marker, StartDate \
-                          from '{}' where LCIndex = {}";
-
-  public:
-    virtual ~GetLCHeadOp() {}
-
-    static std::string Schema(DBOpPrepareParams &params) {
-      return fmt::format(Query, params.lc_head_table,
-          params.op.lc_head.index);
-    }
-};
-
-/* taken from rgw_rados.h::RGWOLHInfo */
-struct DBOLHInfo {
-  rgw_obj target;
-  bool removed;
-  DBOLHInfo() : removed(false) {}
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(target, bl);
-    encode(removed, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(target, bl);
-    decode(removed, bl);
-    DECODE_FINISH(bl);
-  }
-};
-WRITE_CLASS_ENCODER(DBOLHInfo)
-
-class DB {
-  private:
-    const std::string db_name;
-    rgw::sal::Driver* driver;
-    const std::string user_table;
-    const std::string bucket_table;
-    const std::string quota_table;
-    const std::string lc_head_table;
-    const std::string lc_entry_table;
-    static std::map<std::string, class ObjectOp*> objectmap;
-
-  protected:
-    void *db;
-    CephContext *cct;
-    const DoutPrefix dp;
-    uint64_t max_bucket_id = 0;
-    // XXX: default ObjStripeSize or ObjChunk size - 4M, make them configurable?
-    uint64_t ObjHeadSize = 1024; /* 1K - default head data size */
-    uint64_t ObjChunkSize = (get_blob_limit() - 1000); /* 1000 to accommodate other fields */
-    // Below mutex is to protect objectmap and other shared
-    // objects if any.
-    std::mutex mtx;
-
-  public:
-    DB(std::string db_name, CephContext *_cct) : db_name(db_name),
-    user_table(db_name+"_user_table"),
-    bucket_table(db_name+"_bucket_table"),
-    quota_table(db_name+"_quota_table"),
-    lc_head_table(db_name+"_lc_head_table"),
-    lc_entry_table(db_name+"_lc_entry_table"),
-    cct(_cct),
-    dp(_cct, ceph_subsys_rgw, "rgw DBStore backend: ")
-  {}
-    /* DB() {}*/
-
-    DB(CephContext *_cct) : db_name("default_db"),
-    user_table(db_name+"_user_table"),
-    bucket_table(db_name+"_bucket_table"),
-    quota_table(db_name+"_quota_table"),
-    lc_head_table(db_name+"_lc_head_table"),
-    lc_entry_table(db_name+"_lc_entry_table"),
-    cct(_cct),
-    dp(_cct, ceph_subsys_rgw, "rgw DBStore backend: ")
-  {}
-    virtual    ~DB() {}
-
-    const std::string getDBname() { return db_name; }
-    const std::string getDBfile() { return db_name + ".db"; }
-    const std::string getUserTable() { return user_table; }
-    const std::string getBucketTable() { return bucket_table; }
-    const std::string getQuotaTable() { return quota_table; }
-    const std::string getLCHeadTable() { return lc_head_table; }
-    const std::string getLCEntryTable() { return lc_entry_table; }
-    const std::string getObjectTable(std::string bucket) {
-      return db_name+"_"+bucket+"_object_table"; }
-    const std::string getObjectDataTable(std::string bucket) {
-      return db_name+"_"+bucket+"_objectdata_table"; }
-    const std::string getObjectView(std::string bucket) {
-      return db_name+"_"+bucket+"_object_view"; }
-    const std::string getObjectTrigger(std::string bucket) {
-      return db_name+"_"+bucket+"_object_trigger"; }
-
-    std::map<std::string, class ObjectOp*> getObjectMap();
-
-    struct DBOps dbops; // DB operations, make it private?
-
-    void set_driver(rgw::sal::Driver* _driver) {
-      driver = _driver;
-    }
-
-    void set_context(CephContext *_cct) {
-      cct = _cct;
-    }
-
-    CephContext *ctx() { return cct; }
-    const DoutPrefixProvider *get_def_dpp() { return &dp; }
-
-    int Initialize(std::string logfile, int loglevel);
-    int Destroy(const DoutPrefixProvider *dpp);
-    int LockInit(const DoutPrefixProvider *dpp);
-    int LockDestroy(const DoutPrefixProvider *dpp);
-    int Lock(const DoutPrefixProvider *dpp);
-    int Unlock(const DoutPrefixProvider *dpp);
-
-    int InitializeParams(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int ProcessOp(const DoutPrefixProvider *dpp, std::string_view Op, DBOpParams *params);
-    std::shared_ptr<class DBOp> getDBOp(const DoutPrefixProvider *dpp, std::string_view Op, const DBOpParams *params);
-    int objectmapInsert(const DoutPrefixProvider *dpp, std::string bucket, class ObjectOp* ptr);
-    int objectmapDelete(const DoutPrefixProvider *dpp, std::string bucket);
-
-    virtual uint64_t get_blob_limit() { return 0; };
-    virtual void *openDB(const DoutPrefixProvider *dpp) { return NULL; }
-    virtual int closeDB(const DoutPrefixProvider *dpp) { return 0; }
-    virtual int createTables(const DoutPrefixProvider *dpp) { return 0; }
-    virtual int InitializeDBOps(const DoutPrefixProvider *dpp) { return 0; }
-    virtual int InitPrepareParams(const DoutPrefixProvider *dpp,
-                                  DBOpPrepareParams &p_params,
-                                  DBOpParams* params) = 0;
-    virtual int createLCTables(const DoutPrefixProvider *dpp) = 0;
-
-    virtual int ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params) = 0;
-    virtual int ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params) = 0;
-    virtual int ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params) = 0;
-
-    int get_user(const DoutPrefixProvider *dpp,
-        const std::string& query_str, const std::string& query_str_val,
-        RGWUserInfo& uinfo, std::map<std::string, bufferlist> *pattrs,
-        RGWObjVersionTracker *pobjv_tracker);
-    int store_user(const DoutPrefixProvider *dpp,
-        RGWUserInfo& uinfo, bool exclusive, std::map<std::string, bufferlist> *pattrs,
-        RGWObjVersionTracker *pobjv_tracker, RGWUserInfo* pold_info);
-    int remove_user(const DoutPrefixProvider *dpp,
-        RGWUserInfo& uinfo, RGWObjVersionTracker *pobjv_tracker);
-    int get_bucket_info(const DoutPrefixProvider *dpp, const std::string& query_str,
-        const std::string& query_str_val,
-        RGWBucketInfo& info, rgw::sal::Attrs* pattrs, ceph::real_time* pmtime,
-        obj_version* pbucket_version);
-    int create_bucket(const DoutPrefixProvider *dpp,
-        const RGWUserInfo& owner, rgw_bucket& bucket,
-        const std::string& zonegroup_id,
-        const rgw_placement_rule& placement_rule,
-        const std::string& swift_ver_location,
-        const RGWQuotaInfo * pquota_info,
-        std::map<std::string, bufferlist>& attrs,
-        RGWBucketInfo& info,
-        obj_version *pobjv,
-        obj_version *pep_objv,
-        real_time creation_time,
-        rgw_bucket *pmaster_bucket,
-        uint32_t *pmaster_num_shards,
-        optional_yield y,
-        bool exclusive);
-
-    int next_bucket_id() { return ++max_bucket_id; };
-
-    int remove_bucket(const DoutPrefixProvider *dpp, const RGWBucketInfo info);
-    int list_buckets(const DoutPrefixProvider *dpp, const std::string& query_str,
-        rgw_user& user,
-        const std::string& marker,
-        const std::string& end_marker,
-        uint64_t max,
-        bool need_stats,
-        RGWUserBuckets *buckets,
-        bool *is_truncated);
-    int update_bucket(const DoutPrefixProvider *dpp, const std::string& query_str,
-        RGWBucketInfo& info, bool exclusive,
-        const rgw_user* powner_id, std::map<std::string, bufferlist>* pattrs,
-        ceph::real_time* pmtime, RGWObjVersionTracker* pobjv);
-
-    uint64_t get_max_head_size() { return ObjHeadSize; }
-    uint64_t get_max_chunk_size() { return ObjChunkSize; }
-    void gen_rand_obj_instance_name(rgw_obj_key *target_key);
-
-    // db raw obj string is of format -
-    // "<bucketname>_<objname>_<objinstance>_<multipart-part-str>_<partnum>"
-    static constexpr std::string_view raw_obj_oid = "{0}_{1}_{2}_{3}_{4}";
-
-    std::string to_oid(std::string_view bucket, std::string_view obj_name,
-                       std::string_view obj_instance, std::string_view obj_id,
-                       std::string_view mp_str, uint64_t partnum) {
-      return fmt::format(raw_obj_oid, bucket, obj_name, obj_instance, obj_id, mp_str, partnum);
-    }
-    int from_oid(const std::string& oid, std::string& bucket, std::string& obj_name, std::string& obj_id,
-        std::string& obj_instance,
-        std::string& mp_str, uint64_t& partnum) {
-      // TODO: use ceph::split() from common/split.h
-      // XXX: doesn't this break if obj_name has underscores in it?
-      std::vector<std::string> result;
-      boost::split(result, oid, boost::is_any_of("_"));
-      bucket = result[0];
-      obj_name = result[1];
-      obj_instance = result[2];
-      obj_id = result[3];
-      mp_str = result[4];
-      partnum = stoi(result[5]);
-
-      return 0;
-    }
-
-    struct raw_obj {
-      DB* db;
-
-      std::string bucket_name;
-      std::string obj_name;
-      std::string obj_instance;
-      std::string obj_ns;
-      std::string obj_id;
-      std::string multipart_part_str;
-      uint64_t part_num;
-
-      std::string obj_table;
-      std::string obj_data_table;
-
-      raw_obj(DB* _db) {
-        db = _db;
-      }
-
-      raw_obj(DB* _db, std::string& _bname, std::string& _obj_name, std::string& _obj_instance,
-          std::string& _obj_ns, std::string& _obj_id, std::string _mp_part_str, int _part_num) {
-        db = _db;
-        bucket_name = _bname;
-        obj_name = _obj_name;
-        obj_instance = _obj_instance;
-        obj_ns = _obj_ns;
-        obj_id = _obj_id;
-        multipart_part_str = _mp_part_str;
-        part_num = _part_num;
-
-        obj_table = bucket_name+".object.table";
-        obj_data_table = bucket_name+".objectdata.table";
-      }
-
-      raw_obj(DB* _db, std::string& oid) {
-        int r;
-
-        db = _db;
-        r = db->from_oid(oid, bucket_name, obj_name, obj_instance, obj_id, multipart_part_str,
-            part_num);
-        if (r < 0) {
-          multipart_part_str = "0.0";
-          part_num = 0;
-        }
-
-        obj_table = db->getObjectTable(bucket_name);
-        obj_data_table = db->getObjectDataTable(bucket_name);
-      }
-
-      int InitializeParamsfromRawObj (const DoutPrefixProvider *dpp, DBOpParams* params);
-
-      int read(const DoutPrefixProvider *dpp, int64_t ofs, uint64_t end, bufferlist& bl);
-      int write(const DoutPrefixProvider *dpp, int64_t ofs, int64_t write_ofs, uint64_t len, bufferlist& bl);
-    };
-
-    class GC : public Thread {
-      const DoutPrefixProvider *dpp;
-      DB *db;
-      /* Default time interval for GC 
-       * XXX: Make below options configurable
-       *
-       * gc_interval: The time between successive gc thread runs
-       * gc_obj_min_wait: Min. time to wait before deleting any data post its creation.
-       *                    
-       */
-      std::mutex mtx;
-      std::condition_variable cv;
-      bool stop_signalled = false;
-      uint32_t gc_interval = 24*60*60; //sec ; default: 24*60*60
-      uint32_t gc_obj_min_wait = 60*60; //60*60sec default
-      std::string bucket_marker;
-      std::string user_marker;
-
-    public:
-      GC(const DoutPrefixProvider *_dpp, DB* _db) :
-            dpp(_dpp), db(_db) {}
-
-      void *entry() override;
-
-      void signal_stop() {
-       std::lock_guard<std::mutex> lk_guard(mtx);
-       stop_signalled = true;
-       cv.notify_one();
-      }
-
-      friend class DB;
-    };
-    std::unique_ptr<DB::GC> gc_worker;
-
-    class Bucket {
-      friend class DB;
-      DB* store;
-
-      RGWBucketInfo bucket_info;
-
-      public:
-        Bucket(DB *_store, const RGWBucketInfo& _binfo) : store(_store), bucket_info(_binfo) {}
-        DB *get_store() { return store; }
-        rgw_bucket& get_bucket() { return bucket_info.bucket; }
-        RGWBucketInfo& get_bucket_info() { return bucket_info; }
-
-      class List {
-      protected:
-        // absolute maximum number of objects that
-        // list_objects_(un)ordered can return
-        static constexpr int64_t bucket_list_objects_absolute_max = 25000;
-
-        DB::Bucket *target;
-        rgw_obj_key next_marker;
-
-      public:
-
-        struct Params {
-          std::string prefix;
-          std::string delim;
-          rgw_obj_key marker;
-          rgw_obj_key end_marker;
-          std::string ns;
-          bool enforce_ns;
-          RGWAccessListFilter* access_list_filter;
-          RGWBucketListNameFilter force_check_filter;
-          bool list_versions;
-         bool allow_unordered;
-
-          Params() :
-               enforce_ns(true),
-               access_list_filter(nullptr),
-               list_versions(false),
-               allow_unordered(false)
-               {}
-        } params;
-
-        explicit List(DB::Bucket *_target) : target(_target) {}
-
-        /* XXX: Handle ordered and unordered separately.
-         * For now returning only ordered entries */
-        int list_objects(const DoutPrefixProvider *dpp, int64_t max,
-                          std::vector<rgw_bucket_dir_entry> *result,
-                          std::map<std::string, bool> *common_prefixes, bool *is_truncated);
-        rgw_obj_key& get_next_marker() {
-          return next_marker;
-        }
-      };
-    };
-
-    class Object {
-      friend class DB;
-      DB* store;
-
-      RGWBucketInfo bucket_info;
-      rgw_obj obj;
-
-      RGWObjState obj_state;
-      std::string obj_id;
-
-      bool versioning_disabled;
-
-      bool bs_initialized;
-
-      public:
-      Object(DB *_store, const RGWBucketInfo& _bucket_info, const rgw_obj& _obj) : store(_store), bucket_info(_bucket_info),
-      obj(_obj),
-      versioning_disabled(false),
-      bs_initialized(false) {}
-
-      Object(DB *_store, const RGWBucketInfo& _bucket_info, const rgw_obj& _obj, const std::string& _obj_id) : store(_store), bucket_info(_bucket_info), obj(_obj), obj_id(_obj_id) {}
-
-      struct Read {
-        DB::Object *source;
-
-        struct GetObjState {
-          rgw_obj obj;
-        } state;
-
-        struct ConditionParams {
-          const ceph::real_time *mod_ptr;
-          const ceph::real_time *unmod_ptr;
-          bool high_precision_time;
-          uint32_t mod_zone_id;
-          uint64_t mod_pg_ver;
-          const char *if_match;
-          const char *if_nomatch;
-
-          ConditionParams() :
-            mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0),
-            if_match(NULL), if_nomatch(NULL) {}
-        } conds;
-
-        struct Params {
-          ceph::real_time *lastmod;
-          uint64_t *obj_size;
-         std::map<std::string, bufferlist> *attrs;
-          rgw_obj *target_obj;
-
-          Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr),
-          target_obj(nullptr) {}
-        } params;
-
-        explicit Read(DB::Object *_source) : source(_source) {}
-
-        int prepare(const DoutPrefixProvider *dpp);
-        static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
-        int read(int64_t ofs, int64_t end, bufferlist& bl, const DoutPrefixProvider *dpp);
-        int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb);
-        int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest);
-      };
-
-      struct Write {
-        DB::Object *target;
-        RGWObjState obj_state;
-        std::string mp_part_str = "0.0"; // multipart num
-
-        struct MetaParams {
-          ceph::real_time *mtime;
-         std::map<std::string, bufferlist>* rmattrs;
-          const bufferlist *data;
-          RGWObjManifest *manifest;
-          const std::string *ptag;
-          std::list<rgw_obj_index_key> *remove_objs;
-          ceph::real_time set_mtime;
-          rgw_user owner;
-          RGWObjCategory category;
-          int flags;
-          const char *if_match;
-          const char *if_nomatch;
-          std::optional<uint64_t> olh_epoch;
-          ceph::real_time delete_at;
-          bool canceled;
-          const std::string *user_data;
-          rgw_zone_set *zones_trace;
-          bool modify_tail;
-          bool completeMultipart;
-          bool appendable;
-
-          MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
-          remove_objs(NULL), category(RGWObjCategory::Main), flags(0),
-          if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr),
-          modify_tail(false),  completeMultipart(false), appendable(false) {}
-        } meta;
-
-        explicit Write(DB::Object *_target) : target(_target) {}
-
-        void set_mp_part_str(std::string _mp_part_str) { mp_part_str = _mp_part_str;}
-        int prepare(const DoutPrefixProvider* dpp);
-        int write_data(const DoutPrefixProvider* dpp,
-                               bufferlist& data, uint64_t ofs);
-        int _do_write_meta(const DoutPrefixProvider *dpp,
-            uint64_t size, uint64_t accounted_size,
-           std::map<std::string, bufferlist>& attrs,
-            bool assume_noent, bool modify_tail);
-        int write_meta(const DoutPrefixProvider *dpp, uint64_t size,
-           uint64_t accounted_size, std::map<std::string, bufferlist>& attrs);
-      };
-
-      struct Delete {
-        DB::Object *target;
-
-        struct DeleteParams {
-          rgw_user bucket_owner;
-          int versioning_status;
-          ACLOwner obj_owner; /* needed for creation of deletion marker */
-          uint64_t olh_epoch;
-          std::string marker_version_id;
-          uint32_t bilog_flags;
-          std::list<rgw_obj_index_key> *remove_objs;
-          ceph::real_time expiration_time;
-          ceph::real_time unmod_since;
-          ceph::real_time mtime; /* for setting delete marker mtime */
-          bool high_precision_time;
-          rgw_zone_set *zones_trace;
-          bool abortmp;
-          uint64_t parts_accounted_size;
-
-          DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {}
-        } params;
-
-        struct DeleteResult {
-          bool delete_marker;
-          std::string version_id;
-
-          DeleteResult() : delete_marker(false) {}
-        } result;
-
-        explicit Delete(DB::Object *_target) : target(_target) {}
-
-        int delete_obj(const DoutPrefixProvider *dpp);
-        int delete_obj_impl(const DoutPrefixProvider *dpp, DBOpParams& del_params);
-        int create_dm(const DoutPrefixProvider *dpp, DBOpParams& del_params);
-      };
-
-      /* XXX: the parameters may be subject to change. All we need is bucket name
-       * & obj name,instance - keys */
-      int get_object_impl(const DoutPrefixProvider *dpp, DBOpParams& params);
-      int get_obj_state(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
-                        const rgw_obj& obj,
-                        bool follow_olh, RGWObjState **state);
-      int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, bool follow_olh);
-      int list_versioned_objects(const DoutPrefixProvider *dpp,
-                                 std::list<rgw_bucket_dir_entry>& list_entries);
-
-      DB *get_store() { return store; }
-      rgw_obj& get_obj() { return obj; }
-      RGWBucketInfo& get_bucket_info() { return bucket_info; }
-
-      int InitializeParamsfromObject(const DoutPrefixProvider *dpp, DBOpParams* params);
-      int set_attrs(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>& setattrs,
-          std::map<std::string, bufferlist>* rmattrs);
-      int transition(const DoutPrefixProvider *dpp,
-                     const rgw_placement_rule& rule, const real_time& mtime,
-                     uint64_t olh_epoch);
-      int obj_omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val, bool must_exist);
-      int obj_omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
-          const std::set<std::string>& keys,
-          std::map<std::string, bufferlist>* vals);
-      int obj_omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m);
-      int obj_omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
-          std::map<std::string, bufferlist> *m, bool* pmore);
-      using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const raw_obj&, off_t, off_t,
-          bool, RGWObjState*, void*);
-      int add_mp_part(const DoutPrefixProvider *dpp, RGWUploadPartInfo info);
-      int get_mp_parts_list(const DoutPrefixProvider *dpp, std::list<RGWUploadPartInfo>& info);
-
-      int iterate_obj(const DoutPrefixProvider *dpp,
-          const RGWBucketInfo& bucket_info, const rgw_obj& obj,
-          off_t ofs, off_t end, uint64_t max_chunk_size,
-          iterate_obj_cb cb, void *arg);
-    };
-    int get_obj_iterate_cb(const DoutPrefixProvider *dpp,
-        const raw_obj& read_obj, off_t obj_ofs,
-        off_t len, bool is_head_obj,
-        RGWObjState *astate, void *arg);
-
-    int get_entry(const std::string& oid, const std::string& marker,
-                 std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry);
-    int get_next_entry(const std::string& oid, const std::string& marker,
-                 std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry);
-    int set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry);
-    int list_entries(const std::string& oid, const std::string& marker,
-                          uint32_t max_entries, std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& entries);
-    int rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry);
-    int get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LCHead>* head);
-    int put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head);
-    int delete_stale_objs(const DoutPrefixProvider *dpp, const std::string& bucket,
-                          uint32_t min_wait);
-    int createGC(const DoutPrefixProvider *_dpp);
-    int stopGC();
-};
-
-struct db_get_obj_data {
-  DB* store;
-  RGWGetDataCB* client_cb = nullptr;
-  uint64_t offset; // next offset to write to client
-
-  db_get_obj_data(DB* db, RGWGetDataCB* cb, uint64_t offset) :
-    store(db), client_cb(cb), offset(offset) {}
-  ~db_get_obj_data() {}
-};
-
-} } // namespace rgw::store
-
-#endif
diff --git a/src/rgw/store/dbstore/common/dbstore_log.h b/src/rgw/store/dbstore/common/dbstore_log.h
deleted file mode 100644 (file)
index 8d981d5..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef DB_STORE_LOG_H
-#define DB_STORE_LOG_H
-
-#include <cerrno>
-#include <cstdlib>
-#include <string>
-#include <cstdio>
-#include <iostream>
-#include <fstream>
-#include "common/dout.h"
-
-#undef dout_prefix
-#define dout_prefix *_dout << "rgw dbstore: "
-
-#endif
diff --git a/src/rgw/store/dbstore/config/sqlite.cc b/src/rgw/store/dbstore/config/sqlite.cc
deleted file mode 100644 (file)
index 051dc34..0000000
+++ /dev/null
@@ -1,2072 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include <charconv>
-#include <initializer_list>
-#include <map>
-
-#undef FMT_HEADER_ONLY
-#define FMT_HEADER_ONLY 1
-#include <fmt/format.h>
-
-#include <sqlite3.h>
-
-#include "include/buffer.h"
-#include "include/encoding.h"
-#include "common/dout.h"
-#include "common/random_string.h"
-#include "rgw_zone.h"
-
-#include "common/connection_pool.h"
-#include "sqlite/connection.h"
-#include "sqlite/error.h"
-#include "sqlite/statement.h"
-#include "sqlite_schema.h"
-#include "sqlite.h"
-
-#define dout_subsys ceph_subsys_rgw_dbstore
-
-namespace rgw::dbstore::config {
-
-struct Prefix : DoutPrefixPipe {
-  std::string_view prefix;
-  Prefix(const DoutPrefixProvider& dpp, std::string_view prefix)
-      : DoutPrefixPipe(dpp), prefix(prefix) {}
-  unsigned get_subsys() const override { return dout_subsys; }
-  void add_prefix(std::ostream& out) const override {
-    out << prefix;
-  }
-};
-
-namespace {
-
-// parameter names for prepared statement bindings
-static constexpr const char* P1 = ":1";
-static constexpr const char* P2 = ":2";
-static constexpr const char* P3 = ":3";
-static constexpr const char* P4 = ":4";
-static constexpr const char* P5 = ":5";
-static constexpr const char* P6 = ":6";
-
-
-void read_text_rows(const DoutPrefixProvider* dpp,
-                    const sqlite::stmt_execution& stmt,
-                    std::span<std::string> entries,
-                    sal::ListResult<std::string>& result)
-{
-  result.entries = sqlite::read_text_rows(dpp, stmt, entries);
-  if (result.entries.size() < entries.size()) { // end of listing
-    result.next.clear();
-  } else {
-    result.next = result.entries.back();
-  }
-}
-
-struct RealmRow {
-  RGWRealm info;
-  int ver;
-  std::string tag;
-};
-
-void read_realm_row(const sqlite::stmt_execution& stmt, RealmRow& row)
-{
-  row.info.id = sqlite::column_text(stmt, 0);
-  row.info.name = sqlite::column_text(stmt, 1);
-  row.info.current_period = sqlite::column_text(stmt, 2);
-  row.info.epoch = sqlite::column_int(stmt, 3);
-  row.ver = sqlite::column_int(stmt, 4);
-  row.tag = sqlite::column_text(stmt, 5);
-}
-
-void read_period_row(const sqlite::stmt_execution& stmt, RGWPeriod& row)
-{
-  // just read the Data column and decode everything else from that
-  std::string data = sqlite::column_text(stmt, 3);
-
-  bufferlist bl = bufferlist::static_from_string(data);
-  auto p = bl.cbegin();
-  decode(row, p);
-}
-
-struct ZoneGroupRow {
-  RGWZoneGroup info;
-  int ver;
-  std::string tag;
-};
-
-void read_zonegroup_row(const sqlite::stmt_execution& stmt, ZoneGroupRow& row)
-{
-  std::string data = sqlite::column_text(stmt, 3);
-  row.ver = sqlite::column_int(stmt, 4);
-  row.tag = sqlite::column_text(stmt, 5);
-
-  bufferlist bl = bufferlist::static_from_string(data);
-  auto p = bl.cbegin();
-  decode(row.info, p);
-}
-
-struct ZoneRow {
-  RGWZoneParams info;
-  int ver;
-  std::string tag;
-};
-
-void read_zone_row(const sqlite::stmt_execution& stmt, ZoneRow& row)
-{
-  std::string data = sqlite::column_text(stmt, 3);
-  row.ver = sqlite::column_int(stmt, 4);
-  row.tag = sqlite::column_text(stmt, 5);
-
-  bufferlist bl = bufferlist::static_from_string(data);
-  auto p = bl.cbegin();
-  decode(row.info, p);
-}
-
-std::string generate_version_tag(CephContext* cct)
-{
-  static constexpr auto TAG_LEN = 24;
-  return gen_rand_alphanumeric(cct, TAG_LEN);
-}
-
-using SQLiteConnectionHandle = ConnectionHandle<sqlite::Connection>;
-
-using SQLiteConnectionPool = ConnectionPool<
-    sqlite::Connection, sqlite::ConnectionFactory>;
-
-} // anonymous namespace
-
-class SQLiteImpl : public SQLiteConnectionPool {
- public:
-  using SQLiteConnectionPool::SQLiteConnectionPool;
-};
-
-
-SQLiteConfigStore::SQLiteConfigStore(std::unique_ptr<SQLiteImpl> impl)
-  : impl(std::move(impl))
-{
-}
-
-SQLiteConfigStore::~SQLiteConfigStore() = default;
-
-
-// Realm
-
-class SQLiteRealmWriter : public sal::RealmWriter {
-  SQLiteImpl* impl;
-  int ver;
-  std::string tag;
-  std::string realm_id;
-  std::string realm_name;
- public:
-  SQLiteRealmWriter(SQLiteImpl* impl, int ver, std::string tag,
-                    std::string_view realm_id, std::string_view realm_name)
-    : impl(impl), ver(ver), tag(std::move(tag)),
-      realm_id(realm_id), realm_name(realm_name)
-  {}
-
-  int write(const DoutPrefixProvider* dpp, optional_yield y,
-            const RGWRealm& info) override
-  {
-    Prefix prefix{*dpp, "dbconfig:sqlite:realm_write "}; dpp = &prefix;
-
-    if (!impl) {
-      return -EINVAL; // can't write after a conflict or delete
-    }
-    if (realm_id != info.id || realm_name != info.name) {
-      return -EINVAL; // can't modify realm id or name directly
-    }
-
-    try {
-      auto conn = impl->get(dpp);
-      auto& stmt = conn->statements["realm_upd"];
-      if (!stmt) {
-        const std::string sql = fmt::format(schema::realm_update5,
-                                            P1, P2, P3, P4, P5);
-        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-      auto binding = sqlite::stmt_binding{stmt.get()};
-      sqlite::bind_text(dpp, binding, P1, info.id);
-      sqlite::bind_text(dpp, binding, P2, info.current_period);
-      sqlite::bind_int(dpp, binding, P3, info.epoch);
-      sqlite::bind_int(dpp, binding, P4, ver);
-      sqlite::bind_text(dpp, binding, P5, tag);
-
-      auto reset = sqlite::stmt_execution{stmt.get()};
-      sqlite::eval0(dpp, reset);
-
-      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
-        // our version is no longer consistent, so later writes would fail too
-        impl = nullptr;
-        return -ECANCELED;
-      }
-    } catch (const sqlite::error& e) {
-      ldpp_dout(dpp, 20) << "realm update failed: " << e.what() << dendl;
-      if (e.code() == sqlite::errc::foreign_key_constraint) {
-        return -EINVAL; // refers to nonexistent CurrentPeriod
-      } else if (e.code() == sqlite::errc::busy) {
-        return -EBUSY;
-      }
-      return -EIO;
-    }
-    ++ver;
-    return 0;
-  }
-
-  int rename(const DoutPrefixProvider* dpp, optional_yield y,
-             RGWRealm& info, std::string_view new_name) override
-  {
-    Prefix prefix{*dpp, "dbconfig:sqlite:realm_rename "}; dpp = &prefix;
-
-    if (!impl) {
-      return -EINVAL; // can't write after conflict or delete
-    }
-    if (realm_id != info.id || realm_name != info.name) {
-      return -EINVAL; // can't modify realm id or name directly
-    }
-    if (new_name.empty()) {
-      ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
-      return -EINVAL;
-    }
-
-    try {
-      auto conn = impl->get(dpp);
-      auto& stmt = conn->statements["realm_rename"];
-      if (!stmt) {
-        const std::string sql = fmt::format(schema::realm_rename4,
-                                            P1, P2, P3, P4);
-        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-      auto binding = sqlite::stmt_binding{stmt.get()};
-      sqlite::bind_text(dpp, binding, P1, realm_id);
-      sqlite::bind_text(dpp, binding, P2, new_name);
-      sqlite::bind_int(dpp, binding, P3, ver);
-      sqlite::bind_text(dpp, binding, P4, tag);
-
-      auto reset = sqlite::stmt_execution{stmt.get()};
-      sqlite::eval0(dpp, reset);
-
-      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
-        impl = nullptr;
-        return -ECANCELED;
-      }
-    } catch (const sqlite::error& e) {
-      ldpp_dout(dpp, 20) << "realm rename failed: " << e.what() << dendl;
-      if (e.code() == sqlite::errc::unique_constraint) {
-        return -EEXIST; // Name already taken
-      } else if (e.code() == sqlite::errc::busy) {
-        return -EBUSY;
-      }
-      return -EIO;
-    }
-    info.name = std::string{new_name};
-    ++ver;
-    return 0;
-  }
-
-  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
-  {
-    Prefix prefix{*dpp, "dbconfig:sqlite:realm_remove "}; dpp = &prefix;
-
-    if (!impl) {
-      return -EINVAL; // can't write after conflict or delete
-    }
-    try {
-      auto conn = impl->get(dpp);
-      auto& stmt = conn->statements["realm_del"];
-      if (!stmt) {
-        const std::string sql = fmt::format(schema::realm_delete3, P1, P2, P3);
-        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-      auto binding = sqlite::stmt_binding{stmt.get()};
-      sqlite::bind_text(dpp, binding, P1, realm_id);
-      sqlite::bind_int(dpp, binding, P2, ver);
-      sqlite::bind_text(dpp, binding, P3, tag);
-
-      auto reset = sqlite::stmt_execution{stmt.get()};
-      sqlite::eval0(dpp, reset);
-
-      impl = nullptr; // prevent any further writes after delete
-      if (!::sqlite3_changes(conn->db.get())) {
-        return -ECANCELED; // VersionNumber/Tag mismatch
-      }
-    } catch (const sqlite::error& e) {
-      ldpp_dout(dpp, 20) << "realm delete failed: " << e.what() << dendl;
-      if (e.code() == sqlite::errc::busy) {
-        return -EBUSY;
-      }
-      return -EIO;
-    }
-    return 0;
-  }
-}; // SQLiteRealmWriter
-
-
-int SQLiteConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
-                                              optional_yield y, bool exclusive,
-                                              std::string_view realm_id)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:write_default_realm_id "}; dpp = &prefix;
-
-  if (realm_id.empty()) {
-    ldpp_dout(dpp, 0) << "requires a realm id" << dendl;
-    return -EINVAL;
-  }
-
-  try {
-    auto conn = impl->get(dpp);
-    sqlite::stmt_ptr* stmt = nullptr;
-    if (exclusive) {
-      stmt = &conn->statements["def_realm_ins"];
-      if (!*stmt) {
-        const std::string sql = fmt::format(schema::default_realm_insert1, P1);
-        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-    } else {
-      stmt = &conn->statements["def_realm_ups"];
-      if (!*stmt) {
-        const std::string sql = fmt::format(schema::default_realm_upsert1, P1);
-        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-    }
-    auto binding = sqlite::stmt_binding{stmt->get()};
-    sqlite::bind_text(dpp, binding, P1, realm_id);
-
-    auto reset = sqlite::stmt_execution{stmt->get()};
-    sqlite::eval0(dpp, reset);
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "default realm insert failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::primary_key_constraint) {
-      return -EEXIST;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-int SQLiteConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
-                                             optional_yield y,
-                                             std::string& realm_id)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_realm_id "}; dpp = &prefix;
-
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["def_realm_sel"];
-    if (!stmt) {
-      static constexpr std::string_view sql = schema::default_realm_select0;
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    sqlite::eval1(dpp, reset);
-
-    realm_id = sqlite::column_text(reset, 0);
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "default realm select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-int SQLiteConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
-                                               optional_yield y)
-
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_realm_id "}; dpp = &prefix;
-
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["def_realm_del"];
-    if (!stmt) {
-      static constexpr std::string_view sql = schema::default_realm_delete0;
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    sqlite::eval0(dpp, reset);
-
-    if (!::sqlite3_changes(conn->db.get())) {
-      return -ENOENT;
-    }
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "default realm delete failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-
-int SQLiteConfigStore::create_realm(const DoutPrefixProvider* dpp,
-                                    optional_yield y, bool exclusive,
-                                    const RGWRealm& info,
-                                    std::unique_ptr<sal::RealmWriter>* writer)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:create_realm "}; dpp = &prefix;
-
-  if (info.id.empty()) {
-    ldpp_dout(dpp, 0) << "realm cannot have an empty id" << dendl;
-    return -EINVAL;
-  }
-  if (info.name.empty()) {
-    ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
-    return -EINVAL;
-  }
-
-  int ver = 1;
-  auto tag = generate_version_tag(dpp->get_cct());
-
-  try {
-    auto conn = impl->get(dpp);
-    sqlite::stmt_ptr* stmt = nullptr;
-    if (exclusive) {
-      stmt = &conn->statements["realm_ins"];
-      if (!*stmt) {
-        const std::string sql = fmt::format(schema::realm_insert4,
-                                            P1, P2, P3, P4);
-        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-    } else {
-      stmt = &conn->statements["realm_ups"];
-      if (!*stmt) {
-        const std::string sql = fmt::format(schema::realm_upsert4,
-                                            P1, P2, P3, P4);
-        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-    }
-    auto binding = sqlite::stmt_binding{stmt->get()};
-    sqlite::bind_text(dpp, binding, P1, info.id);
-    sqlite::bind_text(dpp, binding, P2, info.name);
-    sqlite::bind_int(dpp, binding, P3, ver);
-    sqlite::bind_text(dpp, binding, P4, tag);
-
-    auto reset = sqlite::stmt_execution{stmt->get()};
-    sqlite::eval0(dpp, reset);
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "realm insert failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::primary_key_constraint) {
-      return -EEXIST; // ID already taken
-    } else if (e.code() == sqlite::errc::unique_constraint) {
-      return -EEXIST; // Name already taken
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<SQLiteRealmWriter>(
-        impl.get(), ver, std::move(tag), info.id, info.name);
-  }
-  return 0;
-}
-
-int SQLiteConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
-                                        optional_yield y,
-                                        std::string_view realm_id,
-                                        RGWRealm& info,
-                                        std::unique_ptr<sal::RealmWriter>* writer)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_by_id "}; dpp = &prefix;
-
-  if (realm_id.empty()) {
-    ldpp_dout(dpp, 0) << "requires a realm id" << dendl;
-    return -EINVAL;
-  }
-
-  RealmRow row;
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["realm_sel_id"];
-    if (!stmt) {
-      const std::string sql = fmt::format(schema::realm_select_id1, P1);
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto binding = sqlite::stmt_binding{stmt.get()};
-    sqlite::bind_text(dpp, binding, P1, realm_id);
-
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    sqlite::eval1(dpp, reset);
-
-    read_realm_row(reset, row);
-  } catch (const buffer::error& e) {
-    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
-    return -EIO;
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::done) {
-      return -ENOENT;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-
-  info = std::move(row.info);
-  if (writer) {
-    *writer = std::make_unique<SQLiteRealmWriter>(
-        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
-  }
-  return 0;
-}
-
-static void realm_select_by_name(const DoutPrefixProvider* dpp,
-                                 sqlite::Connection& conn,
-                                 std::string_view realm_name,
-                                 RealmRow& row)
-{
-  auto& stmt = conn.statements["realm_sel_name"];
-  if (!stmt) {
-    const std::string sql = fmt::format(schema::realm_select_name1, P1);
-    stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql);
-  }
-  auto binding = sqlite::stmt_binding{stmt.get()};
-  sqlite::bind_text(dpp, binding, P1, realm_name);
-
-  auto reset = sqlite::stmt_execution{stmt.get()};
-  sqlite::eval1(dpp, reset);
-
-  read_realm_row(reset, row);
-}
-
-int SQLiteConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
-                                          optional_yield y,
-                                          std::string_view realm_name,
-                                          RGWRealm& info,
-                                          std::unique_ptr<sal::RealmWriter>* writer)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_by_name "}; dpp = &prefix;
-
-  if (realm_name.empty()) {
-    ldpp_dout(dpp, 0) << "requires a realm name" << dendl;
-    return -EINVAL;
-  }
-
-  RealmRow row;
-  try {
-    auto conn = impl->get(dpp);
-    realm_select_by_name(dpp, *conn, realm_name, row);
-  } catch (const buffer::error& e) {
-    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
-    return -EIO;
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::done) {
-      return -ENOENT;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-
-  info = std::move(row.info);
-  if (writer) {
-    *writer = std::make_unique<SQLiteRealmWriter>(
-        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
-  }
-  return 0;
-}
-
-int SQLiteConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
-                                          optional_yield y,
-                                          RGWRealm& info,
-                                          std::unique_ptr<sal::RealmWriter>* writer)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_realm "}; dpp = &prefix;
-
-  RealmRow row;
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["realm_sel_def"];
-    if (!stmt) {
-      static constexpr std::string_view sql = schema::realm_select_default0;
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    sqlite::eval1(dpp, reset);
-
-    read_realm_row(reset, row);
-  } catch (const buffer::error& e) {
-    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
-    return -EIO;
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::done) {
-      return -ENOENT;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-
-  info = std::move(row.info);
-  if (writer) {
-    *writer = std::make_unique<SQLiteRealmWriter>(
-        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
-  }
-  return 0;
-}
-
-int SQLiteConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
-                                     optional_yield y,
-                                     std::string_view realm_name,
-                                     std::string& realm_id)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_id "}; dpp = &prefix;
-
-  if (realm_name.empty()) {
-    ldpp_dout(dpp, 0) << "requires a realm name" << dendl;
-    return -EINVAL;
-  }
-
-  try {
-    auto conn = impl->get(dpp);
-
-    RealmRow row;
-    realm_select_by_name(dpp, *conn, realm_name, row);
-
-    realm_id = std::move(row.info.id);
-  } catch (const buffer::error& e) {
-    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
-    return -EIO;
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::done) {
-      return -ENOENT;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-
-  return 0;
-}
-
-int SQLiteConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
-                                               optional_yield y,
-                                               const RGWPeriod& period)
-{
-  return -ENOTSUP;
-}
-
-int SQLiteConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
-                                        optional_yield y, const std::string& marker,
-                                        std::span<std::string> entries,
-                                        sal::ListResult<std::string>& result)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:list_realm_names "}; dpp = &prefix;
-
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["realm_sel_names"];
-    if (!stmt) {
-      const std::string sql = fmt::format(schema::realm_select_names2, P1, P2);
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto binding = sqlite::stmt_binding{stmt.get()};
-    sqlite::bind_text(dpp, binding, P1, marker);
-    sqlite::bind_int(dpp, binding, P2, entries.size());
-
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    read_text_rows(dpp, reset, entries, result);
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-
-// Period
-
-int SQLiteConfigStore::create_period(const DoutPrefixProvider* dpp,
-                                     optional_yield y, bool exclusive,
-                                     const RGWPeriod& info)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:create_period "}; dpp = &prefix;
-
-  if (info.id.empty()) {
-    ldpp_dout(dpp, 0) << "period cannot have an empty id" << dendl;
-    return -EINVAL;
-  }
-
-  bufferlist bl;
-  encode(info, bl);
-  const auto data = std::string_view{bl.c_str(), bl.length()};
-
-  try {
-    auto conn = impl->get(dpp);
-    sqlite::stmt_ptr* stmt = nullptr;
-    if (exclusive) {
-      stmt = &conn->statements["period_ins"];
-      if (!*stmt) {
-        const std::string sql = fmt::format(schema::period_insert4,
-                                            P1, P2, P3, P4);
-        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-    } else {
-      stmt = &conn->statements["period_ups"];
-      if (!*stmt) {
-        const std::string sql = fmt::format(schema::period_upsert4,
-                                            P1, P2, P3, P4);
-        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-    }
-    auto binding = sqlite::stmt_binding{stmt->get()};
-    sqlite::bind_text(dpp, binding, P1, info.id);
-    sqlite::bind_int(dpp, binding, P2, info.epoch);
-    sqlite::bind_text(dpp, binding, P3, info.realm_id);
-    sqlite::bind_text(dpp, binding, P4, data);
-
-    auto reset = sqlite::stmt_execution{stmt->get()};
-    sqlite::eval0(dpp, reset);
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "period insert failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::foreign_key_constraint) {
-      return -EINVAL; // refers to nonexistent RealmID
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-static void period_select_epoch(const DoutPrefixProvider* dpp,
-                                sqlite::Connection& conn,
-                                std::string_view id, uint32_t epoch,
-                                RGWPeriod& row)
-{
-  auto& stmt = conn.statements["period_sel_epoch"];
-  if (!stmt) {
-    const std::string sql = fmt::format(schema::period_select_epoch2, P1, P2);
-    stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql);
-  }
-  auto binding = sqlite::stmt_binding{stmt.get()};
-  sqlite::bind_text(dpp, binding, P1, id);
-  sqlite::bind_int(dpp, binding, P2, epoch);
-
-  auto reset = sqlite::stmt_execution{stmt.get()};
-  sqlite::eval1(dpp, reset);
-
-  read_period_row(reset, row);
-}
-
-static void period_select_latest(const DoutPrefixProvider* dpp,
-                                 sqlite::Connection& conn,
-                                 std::string_view id, RGWPeriod& row)
-{
-  auto& stmt = conn.statements["period_sel_latest"];
-  if (!stmt) {
-    const std::string sql = fmt::format(schema::period_select_latest1, P1);
-    stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql);
-  }
-  auto binding = sqlite::stmt_binding{stmt.get()};
-  sqlite::bind_text(dpp, binding, P1, id);
-
-  auto reset = sqlite::stmt_execution{stmt.get()};
-  sqlite::eval1(dpp, reset);
-
-  read_period_row(reset, row);
-}
-
-int SQLiteConfigStore::read_period(const DoutPrefixProvider* dpp,
-                                   optional_yield y,
-                                   std::string_view period_id,
-                                   std::optional<uint32_t> epoch,
-                                   RGWPeriod& info)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:read_period "}; dpp = &prefix;
-
-  if (period_id.empty()) {
-    ldpp_dout(dpp, 0) << "requires a period id" << dendl;
-    return -EINVAL;
-  }
-
-  try {
-    auto conn = impl->get(dpp);
-    if (epoch) {
-      period_select_epoch(dpp, *conn, period_id, *epoch, info);
-    } else {
-      period_select_latest(dpp, *conn, period_id, info);
-    }
-  } catch (const buffer::error& e) {
-    ldpp_dout(dpp, 20) << "period decode failed: " << e.what() << dendl;
-    return -EIO;
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "period select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::done) {
-      return -ENOENT;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-int SQLiteConfigStore::delete_period(const DoutPrefixProvider* dpp,
-                                     optional_yield y,
-                                     std::string_view period_id)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:delete_period "}; dpp = &prefix;
-
-  if (period_id.empty()) {
-    ldpp_dout(dpp, 0) << "requires a period id" << dendl;
-    return -EINVAL;
-  }
-
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["period_del"];
-    if (!stmt) {
-      const std::string sql = fmt::format(schema::period_delete1, P1);
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto binding = sqlite::stmt_binding{stmt.get()};
-    sqlite::bind_text(dpp, binding, P1, period_id);
-
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    sqlite::eval0(dpp, reset);
-
-    if (!::sqlite3_changes(conn->db.get())) {
-      return -ENOENT;
-    }
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "period delete failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-int SQLiteConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
-                                       optional_yield y,
-                                       const std::string& marker,
-                                       std::span<std::string> entries,
-                                       sal::ListResult<std::string>& result)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:list_period_ids "}; dpp = &prefix;
-
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["period_sel_ids"];
-    if (!stmt) {
-      const std::string sql = fmt::format(schema::period_select_ids2, P1, P2);
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto binding = sqlite::stmt_binding{stmt.get()};
-    sqlite::bind_text(dpp, binding, P1, marker);
-    sqlite::bind_int(dpp, binding, P2, entries.size());
-
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    read_text_rows(dpp, reset, entries, result);
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "period select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-
-// ZoneGroup
-
-class SQLiteZoneGroupWriter : public sal::ZoneGroupWriter {
-  SQLiteImpl* impl;
-  int ver;
-  std::string tag;
-  std::string zonegroup_id;
-  std::string zonegroup_name;
- public:
-  SQLiteZoneGroupWriter(SQLiteImpl* impl, int ver, std::string tag,
-                        std::string_view zonegroup_id,
-                        std::string_view zonegroup_name)
-    : impl(impl), ver(ver), tag(std::move(tag)),
-      zonegroup_id(zonegroup_id), zonegroup_name(zonegroup_name)
-  {}
-
-  int write(const DoutPrefixProvider* dpp, optional_yield y,
-            const RGWZoneGroup& info) override
-  {
-    Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_write "}; dpp = &prefix;
-
-    if (!impl) {
-      return -EINVAL; // can't write after conflict or delete
-    }
-    if (zonegroup_id != info.id || zonegroup_name != info.name) {
-      return -EINVAL; // can't modify zonegroup id or name directly
-    }
-
-    bufferlist bl;
-    encode(info, bl);
-    const auto data = std::string_view{bl.c_str(), bl.length()};
-
-    try {
-      auto conn = impl->get(dpp);
-      auto& stmt = conn->statements["zonegroup_upd"];
-      if (!stmt) {
-        const std::string sql = fmt::format(schema::zonegroup_update5,
-                                            P1, P2, P3, P4, P5);
-        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-      auto binding = sqlite::stmt_binding{stmt.get()};
-      sqlite::bind_text(dpp, binding, P1, info.id);
-      sqlite::bind_text(dpp, binding, P2, info.realm_id);
-      sqlite::bind_text(dpp, binding, P3, data);
-      sqlite::bind_int(dpp, binding, P4, ver);
-      sqlite::bind_text(dpp, binding, P5, tag);
-
-      auto reset = sqlite::stmt_execution{stmt.get()};
-      sqlite::eval0(dpp, reset);
-
-      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
-        impl = nullptr;
-        return -ECANCELED;
-      }
-    } catch (const sqlite::error& e) {
-      ldpp_dout(dpp, 20) << "zonegroup update failed: " << e.what() << dendl;
-      if (e.code() == sqlite::errc::foreign_key_constraint) {
-        return -EINVAL; // refers to nonexistent RealmID
-      } else if (e.code() == sqlite::errc::busy) {
-        return -EBUSY;
-      }
-      return -EIO;
-    }
-    return 0;
-  }
-
-  int rename(const DoutPrefixProvider* dpp, optional_yield y,
-             RGWZoneGroup& info, std::string_view new_name) override
-  {
-    Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_rename "}; dpp = &prefix;
-
-    if (!impl) {
-      return -EINVAL; // can't write after conflict or delete
-    }
-    if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
-      return -EINVAL; // can't modify zonegroup id or name directly
-    }
-    if (new_name.empty()) {
-      ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
-      return -EINVAL;
-    }
-
-    try {
-      auto conn = impl->get(dpp);
-      auto& stmt = conn->statements["zonegroup_rename"];
-      if (!stmt) {
-        const std::string sql = fmt::format(schema::zonegroup_rename4,
-                                            P1, P2, P3, P4);
-        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-      auto binding = sqlite::stmt_binding{stmt.get()};
-      sqlite::bind_text(dpp, binding, P1, info.id);
-      sqlite::bind_text(dpp, binding, P2, new_name);
-      sqlite::bind_int(dpp, binding, P3, ver);
-      sqlite::bind_text(dpp, binding, P4, tag);
-
-      auto reset = sqlite::stmt_execution{stmt.get()};
-      sqlite::eval0(dpp, reset);
-
-      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
-        impl = nullptr;
-        return -ECANCELED;
-      }
-    } catch (const sqlite::error& e) {
-      ldpp_dout(dpp, 20) << "zonegroup rename failed: " << e.what() << dendl;
-      if (e.code() == sqlite::errc::unique_constraint) {
-        return -EEXIST; // Name already taken
-      } else if (e.code() == sqlite::errc::busy) {
-        return -EBUSY;
-      }
-      return -EIO;
-    }
-    info.name = std::string{new_name};
-    return 0;
-  }
-
  // Delete the zonegroup row, guarded by the cached VersionNumber/Tag.
  // The writer is invalidated (impl = nullptr) after the attempt whether or
  // not the row matched, since there is nothing left to write either way.
  //
  // Returns 0 on success, -EINVAL if the writer was already invalidated,
  // -ECANCELED on version/tag mismatch, -EBUSY/-EIO on database errors.
  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
  {
    Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_remove "}; dpp = &prefix;

    if (!impl) {
      return -EINVAL; // can't write after conflict or delete
    }
    try {
      auto conn = impl->get(dpp);
      // lazily prepare and cache the delete statement on this connection
      auto& stmt = conn->statements["zonegroup_del"];
      if (!stmt) {
        const std::string sql = fmt::format(schema::zonegroup_delete3,
                                            P1, P2, P3);
        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
      auto binding = sqlite::stmt_binding{stmt.get()};
      sqlite::bind_text(dpp, binding, P1, zonegroup_id);
      sqlite::bind_int(dpp, binding, P2, ver);
      sqlite::bind_text(dpp, binding, P3, tag);

      auto reset = sqlite::stmt_execution{stmt.get()};
      sqlite::eval0(dpp, reset);

      impl = nullptr; // writer is spent after a delete attempt
      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
        return -ECANCELED;
      }
    } catch (const sqlite::error& e) {
      ldpp_dout(dpp, 20) << "zonegroup delete failed: " << e.what() << dendl;
      if (e.code() == sqlite::errc::busy) {
        return -EBUSY;
      }
      return -EIO;
    }
    return 0;
  }
-}; // SQLiteZoneGroupWriter
-
-
// Record zonegroup_id as the default zonegroup for the given realm.
// With exclusive=true a plain INSERT is used (fails if a default row for
// this realm already exists); otherwise an upsert overwrites any existing
// default. Returns 0 on success, -EBUSY if the database is locked, -EIO
// on any other database error.
// NOTE(review): an exclusive-insert conflict is reported as -EIO here —
// there is no constraint-violation -> -EEXIST mapping; confirm intended.
int SQLiteConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp,
                                                  optional_yield y, bool exclusive,
                                                  std::string_view realm_id,
                                                  std::string_view zonegroup_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:write_default_zonegroup_id "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    // choose (and lazily prepare) the insert or upsert statement
    sqlite::stmt_ptr* stmt = nullptr;
    if (exclusive) {
      stmt = &conn->statements["def_zonegroup_ins"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::default_zonegroup_insert2,
                                            P1, P2);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    } else {
      stmt = &conn->statements["def_zonegroup_ups"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::default_zonegroup_upsert2,
                                            P1, P2);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    }
    auto binding = sqlite::stmt_binding{stmt->get()};
    sqlite::bind_text(dpp, binding, P1, realm_id);
    sqlite::bind_text(dpp, binding, P2, zonegroup_id);

    auto reset = sqlite::stmt_execution{stmt->get()};
    sqlite::eval0(dpp, reset);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "default zonegroup insert failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
-
// Look up the default zonegroup id recorded for the given realm.
// Returns 0 and fills zonegroup_id on success, -ENOENT if no default row
// exists (eval1 throws errc::done when the query yields no row), -EBUSY
// if the database is locked, -EIO on any other database error.
int SQLiteConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp,
                                                 optional_yield y,
                                                 std::string_view realm_id,
                                                 std::string& zonegroup_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zonegroup_id "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    // lazily prepare and cache the select statement on this connection
    auto& stmt = conn->statements["def_zonegroup_sel"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::default_zonegroup_select1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, realm_id);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset); // expect exactly one result row

    zonegroup_id = sqlite::column_text(reset, 0);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "default zonegroup select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
-
// Remove the default-zonegroup record for the given realm.
// Returns 0 on success, -ENOENT if no row was deleted, -EBUSY if the
// database is locked, -EIO on any other database error.
int SQLiteConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
                                                   optional_yield y,
                                                   std::string_view realm_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_zonegroup_id "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    // lazily prepare and cache the delete statement on this connection
    auto& stmt = conn->statements["def_zonegroup_del"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::default_zonegroup_delete1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, realm_id);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval0(dpp, reset);

    if (!::sqlite3_changes(conn->db.get())) { // nothing matched the realm id
      return -ENOENT;
    }
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "default zonegroup delete failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
-
-
// Insert a new zonegroup row (exclusive=true) or upsert an existing one.
// The row starts at VersionNumber 1 with a freshly generated tag, and the
// RGWZoneGroup is stored as its ceph-encoded blob. On success optionally
// returns a SQLiteZoneGroupWriter for follow-up updates.
//
// Returns 0 on success; -EINVAL for an empty id/name or a nonexistent
// realm_id (foreign key); -EEXIST if the id or name is already taken;
// -EBUSY if the database is locked; -EIO on any other database error.
int SQLiteConfigStore::create_zonegroup(const DoutPrefixProvider* dpp,
                                        optional_yield y, bool exclusive,
                                        const RGWZoneGroup& info,
                                        std::unique_ptr<sal::ZoneGroupWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:create_zonegroup "}; dpp = &prefix;

  if (info.id.empty()) {
    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty id" << dendl;
    return -EINVAL;
  }
  if (info.name.empty()) {
    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
    return -EINVAL;
  }

  // initial optimistic-concurrency state for the new row
  int ver = 1;
  auto tag = generate_version_tag(dpp->get_cct());

  // serialize the zonegroup into the blob column
  bufferlist bl;
  encode(info, bl);
  const auto data = std::string_view{bl.c_str(), bl.length()};

  try {
    auto conn = impl->get(dpp);
    // choose (and lazily prepare) the insert or upsert statement
    sqlite::stmt_ptr* stmt = nullptr;
    if (exclusive) {
      stmt = &conn->statements["zonegroup_ins"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::zonegroup_insert6,
                                            P1, P2, P3, P4, P5, P6);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    } else {
      stmt = &conn->statements["zonegroup_ups"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::zonegroup_upsert6,
                                            P1, P2, P3, P4, P5, P6);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    }
    auto binding = sqlite::stmt_binding{stmt->get()};
    sqlite::bind_text(dpp, binding, P1, info.id);
    sqlite::bind_text(dpp, binding, P2, info.name);
    sqlite::bind_text(dpp, binding, P3, info.realm_id);
    sqlite::bind_text(dpp, binding, P4, data);
    sqlite::bind_int(dpp, binding, P5, ver);
    sqlite::bind_text(dpp, binding, P6, tag);

    auto reset = sqlite::stmt_execution{stmt->get()};
    sqlite::eval0(dpp, reset);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup insert failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::foreign_key_constraint) {
      return -EINVAL; // refers to nonexistent RealmID
    } else if (e.code() == sqlite::errc::primary_key_constraint) {
      return -EEXIST; // ID already taken
    } else if (e.code() == sqlite::errc::unique_constraint) {
      return -EEXIST; // Name already taken
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  if (writer) {
    *writer = std::make_unique<SQLiteZoneGroupWriter>(
        impl.get(), ver, std::move(tag), info.id, info.name);
  }
  return 0;
}
-
// Load a zonegroup by its id, decoding the stored blob into info and
// optionally returning a writer primed with the row's version/tag.
// Returns 0 on success, -EINVAL for an empty id, -ENOENT if no such row,
// -EBUSY if the database is locked, -EIO on decode or other db errors.
int SQLiteConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp,
                                            optional_yield y,
                                            std::string_view zonegroup_id,
                                            RGWZoneGroup& info,
                                            std::unique_ptr<sal::ZoneGroupWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_zonegroup_by_id "}; dpp = &prefix;

  if (zonegroup_id.empty()) {
    ldpp_dout(dpp, 0) << "requires a zonegroup id" << dendl;
    return -EINVAL;
  }

  ZoneGroupRow row;
  try {
    auto conn = impl->get(dpp);
    // lazily prepare and cache the select statement on this connection
    auto& stmt = conn->statements["zonegroup_sel_id"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::zonegroup_select_id1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, zonegroup_id);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset); // expect exactly one result row

    read_zonegroup_row(reset, row);
  } catch (const buffer::error& e) {
    // corrupt or incompatible encoding in the stored blob
    ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  info = std::move(row.info);
  if (writer) {
    *writer = std::make_unique<SQLiteZoneGroupWriter>(
        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
  }
  return 0;
}
-
// Load a zonegroup by its name, decoding the stored blob into info and
// optionally returning a writer primed with the row's version/tag.
// Returns 0 on success, -EINVAL for an empty name, -ENOENT if no such
// row, -EBUSY if the database is locked, -EIO on decode or other errors.
int SQLiteConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp,
                                              optional_yield y,
                                              std::string_view zonegroup_name,
                                              RGWZoneGroup& info,
                                              std::unique_ptr<sal::ZoneGroupWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_zonegroup_by_name "}; dpp = &prefix;

  if (zonegroup_name.empty()) {
    ldpp_dout(dpp, 0) << "requires a zonegroup name" << dendl;
    return -EINVAL;
  }

  ZoneGroupRow row;
  try {
    auto conn = impl->get(dpp);
    // lazily prepare and cache the select statement on this connection
    auto& stmt = conn->statements["zonegroup_sel_name"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::zonegroup_select_name1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, zonegroup_name);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset); // expect exactly one result row

    read_zonegroup_row(reset, row);
  } catch (const buffer::error& e) {
    // corrupt or incompatible encoding in the stored blob
    ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  info = std::move(row.info);
  if (writer) {
    *writer = std::make_unique<SQLiteZoneGroupWriter>(
        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
  }
  return 0;
}
-
// Load the default zonegroup, decoding the stored blob into info and
// optionally returning a writer primed with the row's version/tag.
// Returns 0 on success, -ENOENT if no default exists, -EBUSY if the
// database is locked, -EIO on decode or other database errors.
// NOTE(review): realm_id is accepted but not bound into the query —
// zonegroup_select_default0 takes no parameters; confirm intended.
int SQLiteConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp,
                                              optional_yield y,
                                              std::string_view realm_id,
                                              RGWZoneGroup& info,
                                              std::unique_ptr<sal::ZoneGroupWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zonegroup "}; dpp = &prefix;

  ZoneGroupRow row;
  try {
    auto conn = impl->get(dpp);
    // lazily prepare and cache the select statement on this connection
    auto& stmt = conn->statements["zonegroup_sel_def"];
    if (!stmt) {
      static constexpr std::string_view sql = schema::zonegroup_select_default0;
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset); // expect exactly one result row

    read_zonegroup_row(reset, row);
  } catch (const buffer::error& e) {
    // corrupt or incompatible encoding in the stored blob
    ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  info = std::move(row.info);
  if (writer) {
    *writer = std::make_unique<SQLiteZoneGroupWriter>(
        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
  }
  return 0;
}
-
-int SQLiteConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp,
-                                            optional_yield y,
-                                            const std::string& marker,
-                                            std::span<std::string> entries,
-                                            sal::ListResult<std::string>& result)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:list_zonegroup_names "}; dpp = &prefix;
-
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["zonegroup_sel_names"];
-    if (!stmt) {
-      const std::string sql = fmt::format(schema::zonegroup_select_names2, P1, P2);
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto binding = sqlite::stmt_binding{stmt.get()};
-    auto reset = sqlite::stmt_execution{stmt.get()};
-
-    sqlite::bind_text(dpp, binding, P1, marker);
-    sqlite::bind_int(dpp, binding, P2, entries.size());
-
-    read_text_rows(dpp, reset, entries, result);
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-
-// Zone
-
-class SQLiteZoneWriter : public sal::ZoneWriter {
-  SQLiteImpl* impl;
-  int ver;
-  std::string tag;
-  std::string zone_id;
-  std::string zone_name;
- public:
-  SQLiteZoneWriter(SQLiteImpl* impl, int ver, std::string tag,
-                   std::string_view zone_id, std::string_view zone_name)
-    : impl(impl), ver(ver), tag(std::move(tag)),
-      zone_id(zone_id), zone_name(zone_name)
-  {}
-
-  int write(const DoutPrefixProvider* dpp, optional_yield y,
-            const RGWZoneParams& info) override
-  {
-    Prefix prefix{*dpp, "dbconfig:sqlite:zone_write "}; dpp = &prefix;
-
-    if (!impl) {
-      return -EINVAL; // can't write after conflict or delete
-    }
-    if (zone_id != info.id || zone_name != info.name) {
-      return -EINVAL; // can't modify zone id or name directly
-    }
-
-    bufferlist bl;
-    encode(info, bl);
-    const auto data = std::string_view{bl.c_str(), bl.length()};
-
-    try {
-      auto conn = impl->get(dpp);
-      auto& stmt = conn->statements["zone_upd"];
-      if (!stmt) {
-        const std::string sql = fmt::format(schema::zone_update5,
-                                            P1, P2, P3, P4, P5);
-        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-      auto binding = sqlite::stmt_binding{stmt.get()};
-      sqlite::bind_text(dpp, binding, P1, info.id);
-      sqlite::bind_text(dpp, binding, P2, info.realm_id);
-      sqlite::bind_text(dpp, binding, P3, data);
-      sqlite::bind_int(dpp, binding, P4, ver);
-      sqlite::bind_text(dpp, binding, P5, tag);
-
-      auto reset = sqlite::stmt_execution{stmt.get()};
-      sqlite::eval0(dpp, reset);
-
-      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
-        impl = nullptr;
-        return -ECANCELED;
-      }
-    } catch (const sqlite::error& e) {
-      ldpp_dout(dpp, 20) << "zone update failed: " << e.what() << dendl;
-      if (e.code() == sqlite::errc::foreign_key_constraint) {
-        return -EINVAL; // refers to nonexistent RealmID
-      } else if (e.code() == sqlite::errc::busy) {
-        return -EBUSY;
-      }
-      return -EIO;
-    }
-    ++ver;
-    return 0;
-  }
-
-  int rename(const DoutPrefixProvider* dpp, optional_yield y,
-             RGWZoneParams& info, std::string_view new_name) override
-  {
-    Prefix prefix{*dpp, "dbconfig:sqlite:zone_rename "}; dpp = &prefix;
-
-    if (!impl) {
-      return -EINVAL; // can't write after conflict or delete
-    }
-    if (zone_id != info.id || zone_name != info.name) {
-      return -EINVAL; // can't modify zone id or name directly
-    }
-    if (new_name.empty()) {
-      ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
-      return -EINVAL;
-    }
-
-    try {
-      auto conn = impl->get(dpp);
-      auto& stmt = conn->statements["zone_rename"];
-      if (!stmt) {
-        const std::string sql = fmt::format(schema::zone_rename4, P1, P2, P2, P3);
-        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-      auto binding = sqlite::stmt_binding{stmt.get()};
-      sqlite::bind_text(dpp, binding, P1, info.id);
-      sqlite::bind_text(dpp, binding, P2, new_name);
-      sqlite::bind_int(dpp, binding, P3, ver);
-      sqlite::bind_text(dpp, binding, P4, tag);
-
-      auto reset = sqlite::stmt_execution{stmt.get()};
-      sqlite::eval0(dpp, reset);
-
-      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
-        impl = nullptr;
-        return -ECANCELED;
-      }
-    } catch (const sqlite::error& e) {
-      ldpp_dout(dpp, 20) << "zone rename failed: " << e.what() << dendl;
-      if (e.code() == sqlite::errc::unique_constraint) {
-        return -EEXIST; // Name already taken
-      } else if (e.code() == sqlite::errc::busy) {
-        return -EBUSY;
-      }
-      return -EIO;
-    }
-    info.name = std::string{new_name};
-    ++ver;
-    return 0;
-  }
-
-  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
-  {
-    Prefix prefix{*dpp, "dbconfig:sqlite:zone_remove "}; dpp = &prefix;
-
-    if (!impl) {
-      return -EINVAL; // can't write after conflict or delete
-    }
-    try {
-      auto conn = impl->get(dpp);
-      auto& stmt = conn->statements["zone_del"];
-      if (!stmt) {
-        const std::string sql = fmt::format(schema::zone_delete3, P1, P2, P3);
-        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-      auto binding = sqlite::stmt_binding{stmt.get()};
-      sqlite::bind_text(dpp, binding, P1, zone_id);
-      sqlite::bind_int(dpp, binding, P2, ver);
-      sqlite::bind_text(dpp, binding, P3, tag);
-
-      auto reset = sqlite::stmt_execution{stmt.get()};
-      sqlite::eval0(dpp, reset);
-
-      impl = nullptr;
-      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
-        return -ECANCELED;
-      }
-    } catch (const sqlite::error& e) {
-      ldpp_dout(dpp, 20) << "zone delete failed: " << e.what() << dendl;
-      if (e.code() == sqlite::errc::busy) {
-        return -EBUSY;
-      }
-      return -EIO;
-    }
-    return 0;
-  }
-}; // SQLiteZoneWriter
-
-
// Record zone_id as the default zone for the given realm.
// With exclusive=true a plain INSERT is used (fails if a default row for
// this realm already exists); otherwise an upsert overwrites any existing
// default. Returns 0 on success, -EINVAL for an empty zone id, -EBUSY if
// the database is locked, -EIO on any other database error.
// NOTE(review): an exclusive-insert conflict is reported as -EIO here —
// there is no constraint-violation -> -EEXIST mapping; confirm intended.
int SQLiteConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp,
                                             optional_yield y, bool exclusive,
                                             std::string_view realm_id,
                                             std::string_view zone_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:write_default_zone_id "}; dpp = &prefix;

  if (zone_id.empty()) {
    ldpp_dout(dpp, 0) << "requires a zone id" << dendl;
    return -EINVAL;
  }

  try {
    auto conn = impl->get(dpp);
    // choose (and lazily prepare) the insert or upsert statement
    sqlite::stmt_ptr* stmt = nullptr;
    if (exclusive) {
      stmt = &conn->statements["def_zone_ins"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::default_zone_insert2, P1, P2);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    } else {
      stmt = &conn->statements["def_zone_ups"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::default_zone_upsert2, P1, P2);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    }
    auto binding = sqlite::stmt_binding{stmt->get()};
    sqlite::bind_text(dpp, binding, P1, realm_id);
    sqlite::bind_text(dpp, binding, P2, zone_id);

    auto reset = sqlite::stmt_execution{stmt->get()};
    sqlite::eval0(dpp, reset);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "default zone insert failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
-
// Look up the default zone id recorded for the given realm.
// Returns 0 and fills zone_id on success, -ENOENT if no default row
// exists (eval1 throws errc::done when the query yields no row), -EBUSY
// if the database is locked, -EIO on any other database error.
int SQLiteConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp,
                                            optional_yield y,
                                            std::string_view realm_id,
                                            std::string& zone_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zone_id "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    // lazily prepare and cache the select statement on this connection
    auto& stmt = conn->statements["def_zone_sel"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::default_zone_select1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, realm_id);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset); // expect exactly one result row

    zone_id = sqlite::column_text(reset, 0);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "default zone select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
-
// Remove the default-zone record for the given realm.
// Returns 0 on success, -ENOENT if no row was deleted, -EBUSY if the
// database is locked, -EIO on any other database error.
int SQLiteConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp,
                                              optional_yield y,
                                              std::string_view realm_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_zone_id "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    // lazily prepare and cache the delete statement on this connection
    auto& stmt = conn->statements["def_zone_del"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::default_zone_delete1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, realm_id);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval0(dpp, reset);

    if (!::sqlite3_changes(conn->db.get())) { // nothing matched the realm id
      return -ENOENT;
    }
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "default zone delete failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
-
-
// Insert a new zone row (exclusive=true) or upsert an existing one.
// The row starts at VersionNumber 1 with a freshly generated tag, and the
// RGWZoneParams is stored as its ceph-encoded blob. On success optionally
// returns a SQLiteZoneWriter for follow-up updates.
//
// Returns 0 on success; -EINVAL for an empty id/name or a nonexistent
// realm_id (foreign key); -EEXIST if the id or name is already taken;
// -EBUSY if the database is locked; -EIO on any other database error.
int SQLiteConfigStore::create_zone(const DoutPrefixProvider* dpp,
                                   optional_yield y, bool exclusive,
                                   const RGWZoneParams& info,
                                   std::unique_ptr<sal::ZoneWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:create_zone "}; dpp = &prefix;

  if (info.id.empty()) {
    ldpp_dout(dpp, 0) << "zone cannot have an empty id" << dendl;
    return -EINVAL;
  }
  if (info.name.empty()) {
    ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
    return -EINVAL;
  }

  // initial optimistic-concurrency state for the new row
  int ver = 1;
  auto tag = generate_version_tag(dpp->get_cct());

  // serialize the zone params into the blob column
  bufferlist bl;
  encode(info, bl);
  const auto data = std::string_view{bl.c_str(), bl.length()};

  try {
    auto conn = impl->get(dpp);
    // choose (and lazily prepare) the insert or upsert statement
    sqlite::stmt_ptr* stmt = nullptr;
    if (exclusive) {
      stmt = &conn->statements["zone_ins"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::zone_insert6,
                                            P1, P2, P3, P4, P5, P6);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    } else {
      stmt = &conn->statements["zone_ups"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::zone_upsert6,
                                            P1, P2, P3, P4, P5, P6);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    }
    auto binding = sqlite::stmt_binding{stmt->get()};
    sqlite::bind_text(dpp, binding, P1, info.id);
    sqlite::bind_text(dpp, binding, P2, info.name);
    sqlite::bind_text(dpp, binding, P3, info.realm_id);
    sqlite::bind_text(dpp, binding, P4, data);
    sqlite::bind_int(dpp, binding, P5, ver);
    sqlite::bind_text(dpp, binding, P6, tag);

    auto reset = sqlite::stmt_execution{stmt->get()};
    sqlite::eval0(dpp, reset);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "zone insert failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::foreign_key_constraint) {
      return -EINVAL; // refers to nonexistent RealmID
    } else if (e.code() == sqlite::errc::primary_key_constraint) {
      return -EEXIST; // ID already taken
    } else if (e.code() == sqlite::errc::unique_constraint) {
      return -EEXIST; // Name already taken
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  if (writer) {
    *writer = std::make_unique<SQLiteZoneWriter>(
        impl.get(), ver, std::move(tag), info.id, info.name);
  }
  return 0;
}
-
-int SQLiteConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp,
-                                       optional_yield y,
-                                       std::string_view zone_id,
-                                       RGWZoneParams& info,
-                                       std::unique_ptr<sal::ZoneWriter>* writer)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:read_zone_by_id "}; dpp = &prefix;
-
-  if (zone_id.empty()) {
-    ldpp_dout(dpp, 0) << "requires a zone id" << dendl;
-    return -EINVAL;
-  }
-
-  ZoneRow row;
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["zone_sel_id"];
-    if (!stmt) {
-      const std::string sql = fmt::format(schema::zone_select_id1, P1);
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto binding = sqlite::stmt_binding{stmt.get()};
-    sqlite::bind_text(dpp, binding, P1, zone_id);
-
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    sqlite::eval1(dpp, reset);
-
-    read_zone_row(reset, row);
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::done) {
-      return -ENOENT;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-
-  info = std::move(row.info);
-  if (writer) {
-    *writer = std::make_unique<SQLiteZoneWriter>(
-        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
-  }
-  return 0;
-}
-
-int SQLiteConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp,
-                                         optional_yield y,
-                                         std::string_view zone_name,
-                                         RGWZoneParams& info,
-                                         std::unique_ptr<sal::ZoneWriter>* writer)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:read_zone_by_name "}; dpp = &prefix;
-
-  if (zone_name.empty()) {
-    ldpp_dout(dpp, 0) << "requires a zone name" << dendl;
-    return -EINVAL;
-  }
-
-  ZoneRow row;
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["zone_sel_name"];
-    if (!stmt) {
-      const std::string sql = fmt::format(schema::zone_select_name1, P1);
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto binding = sqlite::stmt_binding{stmt.get()};
-    sqlite::bind_text(dpp, binding, P1, zone_name);
-
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    sqlite::eval1(dpp, reset);
-
-    read_zone_row(reset, row);
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::done) {
-      return -ENOENT;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-
-  info = std::move(row.info);
-  if (writer) {
-    *writer = std::make_unique<SQLiteZoneWriter>(
-        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
-  }
-  return 0;
-}
-
-int SQLiteConfigStore::read_default_zone(const DoutPrefixProvider* dpp,
-                                         optional_yield y,
-                                         std::string_view realm_id,
-                                         RGWZoneParams& info,
-                                         std::unique_ptr<sal::ZoneWriter>* writer)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zone "}; dpp = &prefix;
-
-  ZoneRow row;
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["zone_sel_def"];
-    if (!stmt) {
-      static constexpr std::string_view sql = schema::zone_select_default0;
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    sqlite::eval1(dpp, reset);
-
-    read_zone_row(reset, row);
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::done) {
-      return -ENOENT;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-
-  info = std::move(row.info);
-  if (writer) {
-    *writer = std::make_unique<SQLiteZoneWriter>(
-        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
-  }
-  return 0;
-}
-
// List zone names after 'marker', up to entries.size() results, into the
// caller-provided span; read_text_rows() fills 'result'.
// Returns 0 on success, -EBUSY if the database is locked, -EIO otherwise.
int SQLiteConfigStore::list_zone_names(const DoutPrefixProvider* dpp,
                                       optional_yield y,
                                       const std::string& marker,
                                       std::span<std::string> entries,
                                       sal::ListResult<std::string>& result)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:list_zone_names "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    // lazily prepare and cache the select statement on this connection
    auto& stmt = conn->statements["zone_sel_names"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::zone_select_names2, P1, P2);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, marker);
    sqlite::bind_int(dpp, binding, P2, entries.size());

    auto reset = sqlite::stmt_execution{stmt.get()};
    read_text_rows(dpp, reset, entries, result);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
-
-
-// PeriodConfig
-
-int SQLiteConfigStore::read_period_config(const DoutPrefixProvider* dpp,
-                                          optional_yield y,
-                                          std::string_view realm_id,
-                                          RGWPeriodConfig& info)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:read_period_config "}; dpp = &prefix;
-
-  try {
-    auto conn = impl->get(dpp);
-    auto& stmt = conn->statements["period_conf_sel"];
-    if (!stmt) {
-      const std::string sql = fmt::format(schema::period_config_select1, P1);
-      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-    }
-    auto binding = sqlite::stmt_binding{stmt.get()};
-    sqlite::bind_text(dpp, binding, P1, realm_id);
-
-    auto reset = sqlite::stmt_execution{stmt.get()};
-    sqlite::eval1(dpp, reset);
-
-    std::string data = sqlite::column_text(reset, 0);
-    bufferlist bl = bufferlist::static_from_string(data);
-    auto p = bl.cbegin();
-    decode(info, p);
-
-  } catch (const buffer::error& e) {
-    ldpp_dout(dpp, 20) << "period config decode failed: " << e.what() << dendl;
-    return -EIO;
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "period config select failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::done) {
-      return -ENOENT;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-int SQLiteConfigStore::write_period_config(const DoutPrefixProvider* dpp,
-                                           optional_yield y, bool exclusive,
-                                           std::string_view realm_id,
-                                           const RGWPeriodConfig& info)
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:write_period_config "}; dpp = &prefix;
-
-  bufferlist bl;
-  encode(info, bl);
-  const auto data = std::string_view{bl.c_str(), bl.length()};
-
-  try {
-    auto conn = impl->get(dpp);
-    sqlite::stmt_ptr* stmt = nullptr;
-    if (exclusive) {
-      stmt = &conn->statements["period_conf_ins"];
-      if (!*stmt) {
-        const std::string sql = fmt::format(schema::period_config_insert2, P1, P2);
-        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-    } else {
-      stmt = &conn->statements["period_conf_ups"];
-      if (!*stmt) {
-        const std::string sql = fmt::format(schema::period_config_upsert2, P1, P2);
-        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
-      }
-    }
-    auto binding = sqlite::stmt_binding{stmt->get()};
-    sqlite::bind_text(dpp, binding, P1, realm_id);
-    sqlite::bind_text(dpp, binding, P2, data);
-
-    auto reset = sqlite::stmt_execution{stmt->get()};
-    sqlite::eval0(dpp, reset);
-  } catch (const buffer::error& e) {
-    ldpp_dout(dpp, 20) << "period config decode failed: " << e.what() << dendl;
-    return -EIO;
-  } catch (const sqlite::error& e) {
-    ldpp_dout(dpp, 20) << "period config insert failed: " << e.what() << dendl;
-    if (e.code() == sqlite::errc::primary_key_constraint) {
-      return -EEXIST;
-    } else if (e.code() == sqlite::errc::busy) {
-      return -EBUSY;
-    }
-    return -EIO;
-  }
-  return 0;
-}
-
-namespace {
-
-int version_cb(void* user, int count, char** values, char** names)
-{
-  if (count != 1) {
-    return EINVAL;
-  }
-  std::string_view name = names[0];
-  if (name != "user_version") {
-    return EINVAL;
-  }
-  std::string_view value = values[0];
-  auto result = std::from_chars(value.begin(), value.end(),
-                                *reinterpret_cast<uint32_t*>(user));
-  if (result.ec != std::errc{}) {
-    return static_cast<int>(result.ec);
-  }
-  return 0;
-}
-
-void apply_schema_migrations(const DoutPrefixProvider* dpp, sqlite3* db)
-{
-  sqlite::execute(dpp, db, "PRAGMA foreign_keys = ON", nullptr, nullptr);
-
-  // initiate a transaction and read the current schema version
-  uint32_t version = 0;
-  sqlite::execute(dpp, db, "BEGIN; PRAGMA user_version", version_cb, &version);
-
-  const uint32_t initial_version = version;
-  ldpp_dout(dpp, 4) << "current schema version " << version << dendl;
-
-  // use the version as an index into schema::migrations
-  auto m = std::next(schema::migrations.begin(), version);
-
-  for (; m != schema::migrations.end(); ++m, ++version) {
-    try {
-      sqlite::execute(dpp, db, m->up, nullptr, nullptr);
-    } catch (const sqlite::error&) {
-      ldpp_dout(dpp, -1) << "ERROR: schema migration failed on v" << version
-          << ": " << m->description << dendl;
-      throw;
-    }
-  }
-
-  if (version > initial_version) {
-    // update the user_version and commit the transaction
-    const auto commit = fmt::format("PRAGMA user_version = {}; COMMIT", version);
-    sqlite::execute(dpp, db, commit.c_str(), nullptr, nullptr);
-
-    ldpp_dout(dpp, 4) << "upgraded database schema to version " << version << dendl;
-  } else {
-    // nothing to commit
-    sqlite::execute(dpp, db, "ROLLBACK", nullptr, nullptr);
-  }
-}
-
-} // anonymous namespace
-
-
-auto create_sqlite_store(const DoutPrefixProvider* dpp, const std::string& uri)
-  -> std::unique_ptr<config::SQLiteConfigStore>
-{
-  Prefix prefix{*dpp, "dbconfig:sqlite:create_sqlite_store "}; dpp = &prefix;
-
-  // build the connection pool
-  int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_URI | SQLITE_OPEN_READWRITE |
-      SQLITE_OPEN_NOMUTEX;
-  auto factory = sqlite::ConnectionFactory{uri, flags};
-
-  // sqlite does not support concurrent writers. we enforce this limitation by
-  // using a connection pool of size=1
-  static constexpr size_t max_connections = 1;
-  auto impl = std::make_unique<SQLiteImpl>(std::move(factory), max_connections);
-
-  // open a connection to apply schema migrations
-  auto conn = impl->get(dpp);
-  apply_schema_migrations(dpp, conn->db.get());
-
-  return std::make_unique<SQLiteConfigStore>(std::move(impl));
-}
-
-} // namespace rgw::dbstore::config
diff --git a/src/rgw/store/dbstore/config/sqlite.h b/src/rgw/store/dbstore/config/sqlite.h
deleted file mode 100644 (file)
index d79e040..0000000
+++ /dev/null
@@ -1,172 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include "rgw_sal_config.h"
-
-class DoutPrefixProvider;
-
-namespace rgw::dbstore::config {
-
-struct SQLiteImpl;
-
-class SQLiteConfigStore : public sal::ConfigStore {
- public:
-  explicit SQLiteConfigStore(std::unique_ptr<SQLiteImpl> impl);
-  ~SQLiteConfigStore() override;
-
-  int write_default_realm_id(const DoutPrefixProvider* dpp,
-                             optional_yield y, bool exclusive,
-                             std::string_view realm_id) override;
-  int read_default_realm_id(const DoutPrefixProvider* dpp,
-                            optional_yield y,
-                            std::string& realm_id) override;
-  int delete_default_realm_id(const DoutPrefixProvider* dpp,
-                              optional_yield y) override;
-
-  int create_realm(const DoutPrefixProvider* dpp,
-                   optional_yield y, bool exclusive,
-                   const RGWRealm& info,
-                   std::unique_ptr<sal::RealmWriter>* writer) override;
-  int read_realm_by_id(const DoutPrefixProvider* dpp,
-                       optional_yield y,
-                       std::string_view realm_id,
-                       RGWRealm& info,
-                       std::unique_ptr<sal::RealmWriter>* writer) override;
-  int read_realm_by_name(const DoutPrefixProvider* dpp,
-                         optional_yield y,
-                         std::string_view realm_name,
-                         RGWRealm& info,
-                         std::unique_ptr<sal::RealmWriter>* writer) override;
-  int read_default_realm(const DoutPrefixProvider* dpp,
-                         optional_yield y,
-                         RGWRealm& info,
-                         std::unique_ptr<sal::RealmWriter>* writer) override;
-  int read_realm_id(const DoutPrefixProvider* dpp,
-                    optional_yield y, std::string_view realm_name,
-                    std::string& realm_id) override;
-  int realm_notify_new_period(const DoutPrefixProvider* dpp,
-                              optional_yield y,
-                              const RGWPeriod& period) override;
-  int list_realm_names(const DoutPrefixProvider* dpp,
-                       optional_yield y, const std::string& marker,
-                       std::span<std::string> entries,
-                       sal::ListResult<std::string>& result) override;
-
-  int create_period(const DoutPrefixProvider* dpp,
-                    optional_yield y, bool exclusive,
-                    const RGWPeriod& info) override;
-  int read_period(const DoutPrefixProvider* dpp,
-                  optional_yield y, std::string_view period_id,
-                  std::optional<uint32_t> epoch, RGWPeriod& info) override;
-  int delete_period(const DoutPrefixProvider* dpp,
-                    optional_yield y,
-                    std::string_view period_id) override;
-  int list_period_ids(const DoutPrefixProvider* dpp,
-                      optional_yield y, const std::string& marker,
-                      std::span<std::string> entries,
-                      sal::ListResult<std::string>& result) override;
-
-  int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                 optional_yield y, bool exclusive,
-                                 std::string_view realm_id,
-                                 std::string_view zonegroup_id) override;
-  int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                optional_yield y,
-                                std::string_view realm_id,
-                                std::string& zonegroup_id) override;
-  int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                  optional_yield y,
-                                  std::string_view realm_id) override;
-
-  int create_zonegroup(const DoutPrefixProvider* dpp,
-                       optional_yield y, bool exclusive,
-                       const RGWZoneGroup& info,
-                       std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
-  int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
-                           optional_yield y,
-                           std::string_view zonegroup_id,
-                           RGWZoneGroup& info,
-                           std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
-  int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
-                             optional_yield y,
-                             std::string_view zonegroup_name,
-                             RGWZoneGroup& info,
-                             std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
-  int read_default_zonegroup(const DoutPrefixProvider* dpp,
-                             optional_yield y,
-                             std::string_view realm_id,
-                             RGWZoneGroup& info,
-                             std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
-  int list_zonegroup_names(const DoutPrefixProvider* dpp,
-                           optional_yield y, const std::string& marker,
-                           std::span<std::string> entries,
-                           sal::ListResult<std::string>& result) override;
-
-  int write_default_zone_id(const DoutPrefixProvider* dpp,
-                            optional_yield y, bool exclusive,
-                            std::string_view realm_id,
-                            std::string_view zone_id) override;
-  int read_default_zone_id(const DoutPrefixProvider* dpp,
-                           optional_yield y,
-                           std::string_view realm_id,
-                           std::string& zone_id) override;
-  int delete_default_zone_id(const DoutPrefixProvider* dpp,
-                             optional_yield y,
-                             std::string_view realm_id) override;
-
-  int create_zone(const DoutPrefixProvider* dpp,
-                  optional_yield y, bool exclusive,
-                  const RGWZoneParams& info,
-                  std::unique_ptr<sal::ZoneWriter>* writer) override;
-  int read_zone_by_id(const DoutPrefixProvider* dpp,
-                      optional_yield y,
-                      std::string_view zone_id,
-                      RGWZoneParams& info,
-                      std::unique_ptr<sal::ZoneWriter>* writer) override;
-  int read_zone_by_name(const DoutPrefixProvider* dpp,
-                        optional_yield y,
-                        std::string_view zone_name,
-                        RGWZoneParams& info,
-                        std::unique_ptr<sal::ZoneWriter>* writer) override;
-  int read_default_zone(const DoutPrefixProvider* dpp,
-                        optional_yield y,
-                        std::string_view realm_id,
-                        RGWZoneParams& info,
-                        std::unique_ptr<sal::ZoneWriter>* writer) override;
-  int list_zone_names(const DoutPrefixProvider* dpp,
-                      optional_yield y, const std::string& marker,
-                      std::span<std::string> entries,
-                      sal::ListResult<std::string>& result) override;
-
-  int read_period_config(const DoutPrefixProvider* dpp,
-                         optional_yield y,
-                         std::string_view realm_id,
-                         RGWPeriodConfig& info) override;
-  int write_period_config(const DoutPrefixProvider* dpp,
-                          optional_yield y, bool exclusive,
-                          std::string_view realm_id,
-                          const RGWPeriodConfig& info) override;
-
- private:
-  std::unique_ptr<SQLiteImpl> impl;
-}; // SQLiteConfigStore
-
-
-auto create_sqlite_store(const DoutPrefixProvider* dpp, const std::string& uri)
-  -> std::unique_ptr<config::SQLiteConfigStore>;
-
-} // namespace rgw::dbstore::config
diff --git a/src/rgw/store/dbstore/config/sqlite_schema.h b/src/rgw/store/dbstore/config/sqlite_schema.h
deleted file mode 100644 (file)
index c8a8fce..0000000
+++ /dev/null
@@ -1,299 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <initializer_list>
-
-namespace rgw::dbstore::config::schema {
-
-struct Migration {
-  // human-readable description to help with debugging migration errors
-  const char* description = nullptr;
-  // series of sql statements to apply the schema migration
-  const char* up = nullptr;
-  // series of sql statements to undo the schema migration
-  const char* down = nullptr;
-};
-
-static constexpr std::initializer_list<Migration> migrations {{
-    .description = "create the initial ConfigStore tables",
-    .up = R"(
-CREATE TABLE IF NOT EXISTS Realms (
-  ID TEXT PRIMARY KEY NOT NULL,
-  Name TEXT UNIQUE NOT NULL,
-  CurrentPeriod TEXT,
-  Epoch INTEGER DEFAULT 0,
-  VersionNumber INTEGER,
-  VersionTag TEXT
-);
-CREATE TABLE IF NOT EXISTS Periods (
-  ID TEXT NOT NULL,
-  Epoch INTEGER DEFAULT 0,
-  RealmID TEXT NOT NULL REFERENCES Realms (ID),
-  Data TEXT NOT NULL,
-  PRIMARY KEY (ID, Epoch)
-);
-CREATE TABLE IF NOT EXISTS PeriodConfigs (
-  RealmID TEXT PRIMARY KEY NOT NULL REFERENCES Realms (ID),
-  Data TEXT NOT NULL
-);
-CREATE TABLE IF NOT EXISTS ZoneGroups (
-  ID TEXT PRIMARY KEY NOT NULL,
-  Name TEXT UNIQUE NOT NULL,
-  RealmID TEXT NOT NULL REFERENCES Realms (ID),
-  Data TEXT NOT NULL,
-  VersionNumber INTEGER,
-  VersionTag TEXT
-);
-CREATE TABLE IF NOT EXISTS Zones (
-  ID TEXT PRIMARY KEY NOT NULL,
-  Name TEXT UNIQUE NOT NULL,
-  RealmID TEXT NOT NULL REFERENCES Realms (ID),
-  Data TEXT NOT NULL,
-  VersionNumber INTEGER,
-  VersionTag TEXT
-);
-CREATE TABLE IF NOT EXISTS DefaultRealms (
-  ID TEXT,
-  Empty TEXT PRIMARY KEY
-);
-CREATE TABLE IF NOT EXISTS DefaultZoneGroups (
-  ID TEXT,
-  RealmID TEXT PRIMARY KEY REFERENCES Realms (ID)
-);
-CREATE TABLE IF NOT EXISTS DefaultZones (
-  ID TEXT,
-  RealmID TEXT PRIMARY KEY REFERENCES Realms (ID)
-);
-)",
-    .down = R"(
-DROP TABLE IF EXISTS Realms;
-DROP TABLE IF EXISTS Periods;
-DROP TABLE IF EXISTS PeriodConfigs;
-DROP TABLE IF EXISTS ZoneGroups;
-DROP TABLE IF EXISTS Zones;
-DROP TABLE IF EXISTS DefaultRealms;
-DROP TABLE IF EXISTS DefaultZoneGroups;
-DROP TABLE IF EXISTS DefaultZones;
-)"
-  }
-};
-
-
-// DefaultRealms
-
-static constexpr const char* default_realm_insert1 =
-"INSERT INTO DefaultRealms (ID, Empty) VALUES ({}, '')";
-
-static constexpr const char* default_realm_upsert1 =
-R"(INSERT INTO DefaultRealms (ID, Empty) VALUES ({0}, '')
-ON CONFLICT(Empty) DO UPDATE SET ID = {0})";
-
-static constexpr const char* default_realm_select0 =
-"SELECT ID FROM DefaultRealms LIMIT 1";
-
-static constexpr const char* default_realm_delete0 =
-"DELETE FROM DefaultRealms";
-
-
-// Realms
-
-static constexpr const char* realm_update5 =
-"UPDATE Realms SET CurrentPeriod = {1}, Epoch = {2}, VersionNumber = {3} + 1 \
-WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}";
-
-static constexpr const char* realm_rename4 =
-"UPDATE Realms SET Name = {1}, VersionNumber = {2} + 1 \
-WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}";
-
-static constexpr const char* realm_delete3 =
-"DELETE FROM Realms WHERE ID = {} AND VersionNumber = {} AND VersionTag = {}";
-
-static constexpr const char* realm_insert4 =
-"INSERT INTO Realms (ID, Name, VersionNumber, VersionTag) \
-VALUES ({}, {}, {}, {})";
-
-static constexpr const char* realm_upsert4 =
-"INSERT INTO Realms (ID, Name, VersionNumber, VersionTag) \
-VALUES ({0}, {1}, {2}, {3}) \
-ON CONFLICT(ID) DO UPDATE SET Name = {1}, \
-VersionNumber = {2}, VersionTag = {3}";
-
-static constexpr const char* realm_select_id1 =
-"SELECT * FROM Realms WHERE ID = {} LIMIT 1";
-
-static constexpr const char* realm_select_name1 =
-"SELECT * FROM Realms WHERE Name = {} LIMIT 1";
-
-static constexpr const char* realm_select_default0 =
-"SELECT r.* FROM Realms r \
-INNER JOIN DefaultRealms d \
-ON d.ID = r.ID LIMIT 1";
-
-static constexpr const char* realm_select_names2 =
-"SELECT Name FROM Realms WHERE Name > {} \
-ORDER BY Name ASC LIMIT {}";
-
-
-// Periods
-
-static constexpr const char* period_insert4 =
-"INSERT INTO Periods (ID, Epoch, RealmID, Data) \
-VALUES ({}, {}, {}, {})";
-
-static constexpr const char* period_upsert4 =
-"INSERT INTO Periods (ID, Epoch, RealmID, Data) \
-VALUES ({0}, {1}, {2}, {3}) \
-ON CONFLICT DO UPDATE SET RealmID = {2}, Data = {3}";
-
-static constexpr const char* period_select_epoch2 =
-"SELECT * FROM Periods WHERE ID = {} AND Epoch = {} LIMIT 1";
-
-static constexpr const char* period_select_latest1 =
-"SELECT * FROM Periods WHERE ID = {} ORDER BY Epoch DESC LIMIT 1";
-
-static constexpr const char* period_delete1 =
-"DELETE FROM Periods WHERE ID = {}";
-
-static constexpr const char* period_select_ids2 =
-"SELECT ID FROM Periods WHERE ID > {} ORDER BY ID ASC LIMIT {}";
-
-
-// DefaultZoneGroups
-
-static constexpr const char* default_zonegroup_insert2 =
-"INSERT INTO DefaultZoneGroups (RealmID, ID) VALUES ({}, {})";
-
-static constexpr const char* default_zonegroup_upsert2 =
-"INSERT INTO DefaultZoneGroups (RealmID, ID) \
-VALUES ({0}, {1}) \
-ON CONFLICT(RealmID) DO UPDATE SET ID = {1}";
-
-static constexpr const char* default_zonegroup_select1 =
-"SELECT ID FROM DefaultZoneGroups WHERE RealmID = {}";
-
-static constexpr const char* default_zonegroup_delete1 =
-"DELETE FROM DefaultZoneGroups WHERE RealmID = {}";
-
-
-// ZoneGroups
-
-static constexpr const char* zonegroup_update5 =
-"UPDATE ZoneGroups SET RealmID = {1}, Data = {2}, VersionNumber = {3} + 1 \
-WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}";
-
-static constexpr const char* zonegroup_rename4 =
-"UPDATE ZoneGroups SET Name = {1}, VersionNumber = {2} + 1 \
-WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}";
-
-static constexpr const char* zonegroup_delete3 =
-"DELETE FROM ZoneGroups WHERE ID = {} \
-AND VersionNumber = {} AND VersionTag = {}";
-
-static constexpr const char* zonegroup_insert6 =
-"INSERT INTO ZoneGroups (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
-VALUES ({}, {}, {}, {}, {}, {})";
-
-static constexpr const char* zonegroup_upsert6 =
-"INSERT INTO ZoneGroups (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
-VALUES ({0}, {1}, {2}, {3}, {4}, {5}) \
-ON CONFLICT (ID) DO UPDATE SET Name = {1}, RealmID = {2}, \
-Data = {3}, VersionNumber = {4}, VersionTag = {5}";
-
-static constexpr const char* zonegroup_select_id1 =
-"SELECT * FROM ZoneGroups WHERE ID = {} LIMIT 1";
-
-static constexpr const char* zonegroup_select_name1 =
-"SELECT * FROM ZoneGroups WHERE Name = {} LIMIT 1";
-
-static constexpr const char* zonegroup_select_default0 =
-"SELECT z.* FROM ZoneGroups z \
-INNER JOIN DefaultZoneGroups d \
-ON d.ID = z.ID LIMIT 1";
-
-static constexpr const char* zonegroup_select_names2 =
-"SELECT Name FROM ZoneGroups WHERE Name > {} \
-ORDER BY Name ASC LIMIT {}";
-
-
-// DefaultZones
-
-static constexpr const char* default_zone_insert2 =
-"INSERT INTO DefaultZones (RealmID, ID) VALUES ({}, {})";
-
-static constexpr const char* default_zone_upsert2 =
-"INSERT INTO DefaultZones (RealmID, ID) VALUES ({0}, {1}) \
-ON CONFLICT(RealmID) DO UPDATE SET ID = {1}";
-
-static constexpr const char* default_zone_select1 =
-"SELECT ID FROM DefaultZones WHERE RealmID = {}";
-
-static constexpr const char* default_zone_delete1 =
-"DELETE FROM DefaultZones WHERE RealmID = {}";
-
-
-// Zones
-
-static constexpr const char* zone_update5 =
-"UPDATE Zones SET RealmID = {1}, Data = {2}, VersionNumber = {3} + 1 \
-WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}";
-
-static constexpr const char* zone_rename4 =
-"UPDATE Zones SET Name = {1}, VersionNumber = {2} + 1 \
-WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}";
-
-static constexpr const char* zone_delete3 =
-"DELETE FROM Zones WHERE ID = {} AND VersionNumber = {} AND VersionTag = {}";
-
-static constexpr const char* zone_insert6 =
-"INSERT INTO Zones (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
-VALUES ({}, {}, {}, {}, {}, {})";
-
-static constexpr const char* zone_upsert6 =
-"INSERT INTO Zones (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
-VALUES ({0}, {1}, {2}, {3}, {4}, {5}) \
-ON CONFLICT (ID) DO UPDATE SET Name = {1}, RealmID = {2}, \
-Data = {3}, VersionNumber = {4}, VersionTag = {5}";
-
-static constexpr const char* zone_select_id1 =
-"SELECT * FROM Zones WHERE ID = {} LIMIT 1";
-
-static constexpr const char* zone_select_name1 =
-"SELECT * FROM Zones WHERE Name = {} LIMIT 1";
-
-static constexpr const char* zone_select_default0 =
-"SELECT z.* FROM Zones z \
-INNER JOIN DefaultZones d \
-ON d.ID = z.ID LIMIT 1";
-
-static constexpr const char* zone_select_names2 =
-"SELECT Name FROM Zones WHERE Name > {} \
-ORDER BY Name ASC LIMIT {}";
-
-
-// PeriodConfigs
-
-static constexpr const char* period_config_insert2 =
-"INSERT INTO PeriodConfigs (RealmID, Data) VALUES ({}, {})";
-
-static constexpr const char* period_config_upsert2 =
-"INSERT INTO PeriodConfigs (RealmID, Data) VALUES ({0}, {1}) \
-ON CONFLICT (RealmID) DO UPDATE SET Data = {1}";
-
-static constexpr const char* period_config_select1 =
-"SELECT Data FROM PeriodConfigs WHERE RealmID = {} LIMIT 1";
-
-} // namespace rgw::dbstore::config::schema
diff --git a/src/rgw/store/dbstore/config/store.cc b/src/rgw/store/dbstore/config/store.cc
deleted file mode 100644 (file)
index 66f7471..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include <stdexcept>
-
-#undef FMT_HEADER_ONLY
-#define FMT_HEADER_ONLY 1
-#include <fmt/format.h>
-
-#include "store.h"
-#ifdef SQLITE_ENABLED
-#include "sqlite.h"
-#endif
-
-namespace rgw::dbstore {
-
-auto create_config_store(const DoutPrefixProvider* dpp, const std::string& uri)
-  -> std::unique_ptr<sal::ConfigStore>
-{
-#ifdef SQLITE_ENABLED
-  if (uri.starts_with("file:")) {
-    return config::create_sqlite_store(dpp, uri);
-  }
-#endif
-  throw std::runtime_error(fmt::format("unrecognized URI {}", uri));
-}
-
-} // namespace rgw::dbstore
diff --git a/src/rgw/store/dbstore/config/store.h b/src/rgw/store/dbstore/config/store.h
deleted file mode 100644 (file)
index 553d9f7..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <memory>
-#include "rgw_sal_config.h"
-
-namespace rgw::dbstore {
-
-// ConfigStore factory
-auto create_config_store(const DoutPrefixProvider* dpp, const std::string& uri)
-  -> std::unique_ptr<sal::ConfigStore>;
-
-} // namespace rgw::dbstore
diff --git a/src/rgw/store/dbstore/dbstore_main.cc b/src/rgw/store/dbstore/dbstore_main.cc
deleted file mode 100644 (file)
index 08724d8..0000000
+++ /dev/null
@@ -1,201 +0,0 @@
-#include <stdio.h>
-#include <sqlite3.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
-
-#include "dbstore_mgr.h"
-#include <dbstore.h>
-#include <dbstore_log.h>
-
-using namespace std;
-using namespace rgw::store;
-using DB = rgw::store::DB;
-
-struct thr_args {
-  DB *dbs;
-  int thr_id;
-};
-
-void* process(void *arg)
-{
-  struct thr_args *t_args = (struct thr_args*)arg;
-
-  DB *db = t_args->dbs;
-  int thr_id = t_args->thr_id;
-  int ret = -1;
-
-  cout<<"Entered thread:"<<thr_id<<"\n";
-
-  string user1 = "User1";
-  string bucketa = "rgw";
-  string objecta1 = "bugfixing";
-  string objecta2 = "zipper";
-  string bucketb = "gluster";
-  string objectb1 = "bugfixing";
-  string objectb2 = "delegations";
-
-  string user2 = "User2";
-  string bucketc = "qe";
-  string objectc1 = "rhhi";
-  string objectc2 = "cns";
-
-  DBOpParams params = {};
-  const DoutPrefixProvider *dpp = db->get_def_dpp();
-
-  db->InitializeParams(dpp, &params);
-
-  params.op.user.uinfo.display_name = user1;
-  params.op.user.uinfo.user_id.tenant = "tenant";
-  params.op.user.uinfo.user_id.id = user1;
-  params.op.user.uinfo.suspended = 123;
-  params.op.user.uinfo.max_buckets = 456;
-  params.op.user.uinfo.assumed_role_arn = "role";
-  params.op.user.uinfo.placement_tags.push_back("tags1");
-  params.op.user.uinfo.placement_tags.push_back("tags2");
-
-  RGWAccessKey k1("id1", "key1");
-  RGWAccessKey k2("id2", "key2");
-  params.op.user.uinfo.access_keys.insert(make_pair("key1", k1));
-  params.op.user.uinfo.access_keys.insert(make_pair("key2", k2));
-
-  ret = db->ProcessOp(dpp, "InsertUser", &params);
-  cout << "InsertUser return value: " <<  ret << "\n";
-
-  DBOpParams params2 = {};
-  params.op.user.uinfo.user_id.tenant = "tenant2";
-
-  db->InitializeParams(dpp, &params2);
-  params2.op.user.uinfo.display_name = user1;
-  ret = db->ProcessOp(dpp, "GetUser", &params2);
-
-  cout << "GetUser return value: " <<  ret << "\n";
-
-  cout << "tenant: " << params2.op.user.uinfo.user_id.tenant << "\n";
-  cout << "suspended: " << (int)params2.op.user.uinfo.suspended << "\n";
-  cout << "assumed_role_arn: " << params2.op.user.uinfo.assumed_role_arn << "\n";
-
-  list<string>::iterator it = params2.op.user.uinfo.placement_tags.begin();
-
-  while (it != params2.op.user.uinfo.placement_tags.end()) {
-    cout << "list = " << *it << "\n";
-    it++;
-  }
-
-  map<string, RGWAccessKey>::iterator it2 = params2.op.user.uinfo.access_keys.begin();
-
-  while (it2 != params2.op.user.uinfo.access_keys.end()) {
-    cout << "keys = " << it2->first << "\n";
-    RGWAccessKey k = it2->second;
-    cout << "id = " << k.id << ", keys = " << k.key << "\n";
-    it2++;
-  }
-
-  params.op.bucket.info.bucket.name = bucketa;
-  db->ProcessOp(dpp, "InsertBucket", &params);
-
-  params.op.user.uinfo.display_name = user2;
-  params.op.user.uinfo.user_id.id = user2;
-  db->ProcessOp(dpp, "InsertUser", &params);
-
-  params.op.bucket.info.bucket.name = bucketb;
-  db->ProcessOp(dpp, "InsertBucket", &params);
-
-  db->ProcessOp(dpp, "GetUser", &params);
-  db->ProcessOp(dpp, "GetBucket", &params);
-
-  db->ListAllUsers(dpp, &params);
-  db->ListAllBuckets(dpp, &params);
-
-  params.op.bucket.info.bucket.name = bucketb;
-
-  db->ProcessOp(dpp, "RemoveBucket", &params);
-
-  params.op.user.uinfo.user_id.id = user2;
-  db->ProcessOp(dpp, "RemoveUser", &params);
-
-  db->ListAllUsers(dpp, &params);
-  db->ListAllBuckets(dpp, &params);
-  cout<<"Exiting thread:"<<thr_id<<"\n";
-
-  return 0;
-}
-
-int main(int argc, char *argv[])
-{
-  string tenant = "Redhat";
-  string logfile = "rgw_dbstore_bin.log";
-  int loglevel = 20;
-
-  DBStoreManager *dbsm;
-  DB *dbs;
-  int rc = 0, tnum = 0;
-  void *res;
-
-  pthread_attr_t attr;
-  int num_thr = 2;
-  pthread_t threads[num_thr];
-  struct thr_args t_args[num_thr];
-
-
-  cout << "loglevel  " << loglevel << "\n";
-  // format: ./dbstore-bin logfile loglevel
-  if (argc == 3) {
-       logfile = argv[1];
-       loglevel = (atoi)(argv[2]);
-       cout << "loglevel set to " << loglevel << "\n";
-  }
-
-  vector<const char*> args;
-  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
-                CODE_ENVIRONMENT_DAEMON, CINIT_FLAG_NO_MON_CONFIG, 1);
-  dbsm = new DBStoreManager(cct.get(), logfile, loglevel);
-  dbs = dbsm->getDB(tenant, true);
-
-  cout<<"No. of threads being created = "<<num_thr<<"\n";
-
-  /* Initialize thread creation attributes */
-  rc = pthread_attr_init(&attr);
-
-  if (rc != 0) {
-    cout<<" error in pthread_attr_init \n";
-    goto out;
-  }
-
-  for (tnum = 0; tnum < num_thr; tnum++) {
-    t_args[tnum].dbs = dbs;
-    t_args[tnum].thr_id = tnum;
-    rc = pthread_create((pthread_t*)&threads[tnum], &attr, &process,
-        &t_args[tnum]);
-    if (rc != 0) {
-      cout<<" error in pthread_create \n";
-      goto out;
-    }
-
-    cout<<"Created thread (thread-id:"<<tnum<<")\n";
-  }
-
-  /* Destroy the thread attributes object, since it is no
-     longer needed */
-
-  rc = pthread_attr_destroy(&attr);
-  if (rc != 0) {
-    cout<<"error in pthread_attr_destroy \n";
-  }
-
-  /* Now join with each thread, and display its returned value */
-
-  for (tnum = 0; tnum < num_thr; tnum++) {
-    rc = pthread_join(threads[tnum], &res);
-    if (rc != 0) {
-      cout<<"error in pthread_join \n";
-    } else {
-      cout<<"Joined with thread "<<tnum<<"\n";
-    }
-  }
-
-out:
-  dbsm->destroyAllHandles();
-
-  return 0;
-}
diff --git a/src/rgw/store/dbstore/dbstore_mgr.cc b/src/rgw/store/dbstore/dbstore_mgr.cc
deleted file mode 100644 (file)
index 6835f52..0000000
+++ /dev/null
@@ -1,140 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "dbstore_mgr.h"
-#include "common/dbstore_log.h"
-
-#include <filesystem>
-
-static constexpr auto dout_subsys = ceph_subsys_rgw;
-
-using namespace std;
-
-
-/* Given a tenant, find and return the DBStore handle.
- * If not found and 'create' set to true, create one
- * and return
- */
-DB *DBStoreManager::getDB (string tenant, bool create)
-{
-  map<string, DB*>::iterator iter;
-  DB *dbs = nullptr;
-  pair<map<string, DB*>::iterator,bool> ret;
-
-  if (tenant.empty())
-    return default_db;
-
-  if (DBStoreHandles.empty())
-    goto not_found;
-
-  iter = DBStoreHandles.find(tenant);
-
-  if (iter != DBStoreHandles.end())
-    return iter->second;
-
-not_found:
-  if (!create)
-    return nullptr;
-
-  dbs = createDB(tenant);
-
-  return dbs;
-}
-
-/* Create DBStore instance */
-DB *DBStoreManager::createDB(std::string tenant) {
-  DB *dbs = nullptr;
-  pair<map<string, DB*>::iterator,bool> ret;
-  const auto& db_path = g_conf().get_val<std::string>("dbstore_db_dir");
-  const auto& db_name = g_conf().get_val<std::string>("dbstore_db_name_prefix") + "-" + tenant;
-
-  auto db_full_path = std::filesystem::path(db_path) / db_name;
-  ldout(cct, 0) << "DB initialization full db_path("<<db_full_path<<")" << dendl;
-
-  /* Create the handle */
-#ifdef SQLITE_ENABLED
-  dbs = new SQLiteDB(db_full_path.string(), cct);
-#else
-  dbs = new DB(db_full_path.string(), cct);
-#endif
-
-  /* API is DB::Initialize(string logfile, int loglevel);
-   * If none provided, by default write in to dbstore.log file
-   * created in current working directory with loglevel L_EVENT.
-   * XXX: need to align these logs to ceph location
-   */
-  if (dbs->Initialize("", -1) < 0) {
-    ldout(cct, 0) << "DB initialization failed for tenant("<<tenant<<")" << dendl;
-
-    delete dbs;
-    return nullptr;
-  }
-
-  /* XXX: Do we need lock to protect this map?
-  */
-  ret = DBStoreHandles.insert(pair<string, DB*>(tenant, dbs));
-
-  /*
-   * Its safe to check for already existing entry (just
-   * incase other thread raced and created the entry)
-   */
-  if (ret.second == false) {
-    /* Entry already created by another thread */
-    delete dbs;
-
-    dbs = ret.first->second;
-  }
-
-  return dbs;
-}
-
-void DBStoreManager::deleteDB(string tenant) {
-  map<string, DB*>::iterator iter;
-  DB *dbs = nullptr;
-
-  if (tenant.empty() || DBStoreHandles.empty())
-    return;
-
-  /* XXX: Check if we need to perform this operation under a lock */
-  iter = DBStoreHandles.find(tenant);
-
-  if (iter == DBStoreHandles.end())
-    return;
-
-  dbs = iter->second;
-
-  DBStoreHandles.erase(iter);
-  dbs->Destroy(dbs->get_def_dpp());
-  delete dbs;
-
-  return;
-}
-
-void DBStoreManager::deleteDB(DB *dbs) {
-  if (!dbs)
-    return;
-
-  (void)deleteDB(dbs->getDBname());
-}
-
-
-void DBStoreManager::destroyAllHandles(){
-  map<string, DB*>::iterator iter;
-  DB *dbs = nullptr;
-
-  if (DBStoreHandles.empty())
-    return;
-
-  for (iter = DBStoreHandles.begin(); iter != DBStoreHandles.end();
-      ++iter) {
-    dbs = iter->second;
-    dbs->Destroy(dbs->get_def_dpp());
-    delete dbs;
-  }
-
-  DBStoreHandles.clear();
-
-  return;
-}
-
-
diff --git a/src/rgw/store/dbstore/dbstore_mgr.h b/src/rgw/store/dbstore/dbstore_mgr.h
deleted file mode 100644 (file)
index 77fc3aa..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#pragma once
-
-#include <map>
-#include <cerrno>
-#include <cstdlib>
-#include <string>
-#include <cstdio>
-#include <iostream>
-#include <vector>
-
-#include "common/ceph_context.h"
-#include "common/dbstore.h"
-#include "sqlite/sqliteDB.h"
-
-using namespace rgw::store;
-using DB = rgw::store::DB;
-
-/* XXX: Should be a dbstore config option */
-const static std::string default_tenant = "default_ns";
-
-class DBStoreManager {
-private:
-  std::map<std::string, DB*> DBStoreHandles;
-  DB *default_db = nullptr;
-  CephContext *cct;
-
-public:
-  DBStoreManager(CephContext *_cct): DBStoreHandles() {
-    cct = _cct;
-       default_db = createDB(default_tenant);
-  };
-  DBStoreManager(CephContext *_cct, std::string logfile, int loglevel): DBStoreHandles() {
-    /* No ceph context. Create one with log args provided */
-    cct = _cct;
-    cct->_log->set_log_file(logfile);
-    cct->_log->reopen_log_file();
-    cct->_conf->subsys.set_log_level(ceph_subsys_rgw, loglevel);
-    default_db = createDB(default_tenant);
-  };
-  ~DBStoreManager() { destroyAllHandles(); };
-
-  /* XXX: TBD based on testing
-   * 1)  Lock to protect DBStoreHandles map.
-   * 2) Refcount of each DBStore to protect from
-   * being deleted while using it.
-   */
-  DB* getDB () { return default_db; };
-  DB* getDB (std::string tenant, bool create);
-  DB* createDB (std::string tenant);
-  void deleteDB (std::string tenant);
-  void deleteDB (DB* db);
-  void destroyAllHandles();
-};
diff --git a/src/rgw/store/dbstore/sqlite/CMakeLists.txt b/src/rgw/store/dbstore/sqlite/CMakeLists.txt
deleted file mode 100644 (file)
index 909765e..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-cmake_minimum_required(VERSION 3.14.0)
-project(sqlite_db)
-
-find_package(SQLite3 REQUIRED)
-
-set(sqlite_db_srcs
-    sqliteDB.h
-    sqliteDB.cc)
-
-include_directories(${CMAKE_INCLUDE_DIR})
-
-set(SQLITE_COMPILE_FLAGS "-DSQLITE_THREADSAFE=1")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SQLITE_COMPILE_FLAGS}")
-
-add_library(sqlite_db STATIC ${sqlite_db_srcs})
-target_link_libraries(sqlite_db sqlite3 dbstore_lib rgw_common)
diff --git a/src/rgw/store/dbstore/sqlite/connection.cc b/src/rgw/store/dbstore/sqlite/connection.cc
deleted file mode 100644 (file)
index 143a3a0..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "common/dout.h"
-#include "connection.h"
-#include "error.h"
-
-namespace rgw::dbstore::sqlite {
-
-db_ptr open_database(const char* filename, int flags)
-{
-  sqlite3* db = nullptr;
-  const int result = ::sqlite3_open_v2(filename, &db, flags, nullptr);
-  if (result != SQLITE_OK) {
-    throw std::system_error(result, sqlite::error_category());
-  }
-  // request extended result codes
-  (void) ::sqlite3_extended_result_codes(db, 1);
-  return db_ptr{db};
-}
-
-} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/store/dbstore/sqlite/connection.h b/src/rgw/store/dbstore/sqlite/connection.h
deleted file mode 100644 (file)
index f5cd77d..0000000
+++ /dev/null
@@ -1,66 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <memory>
-#include <sqlite3.h>
-
-#undef FMT_HEADER_ONLY
-#define FMT_HEADER_ONLY 1
-#include <fmt/format.h>
-
-#include "sqlite/statement.h"
-
-class DoutPrefixProvider;
-
-namespace rgw::dbstore::sqlite {
-
-// owning sqlite3 pointer
-struct db_deleter {
-  void operator()(sqlite3* p) const { ::sqlite3_close(p); }
-};
-using db_ptr = std::unique_ptr<sqlite3, db_deleter>;
-
-
-// open the database file or throw on error
-db_ptr open_database(const char* filename, int flags);
-
-
-struct Connection {
-  db_ptr db;
-  // map of statements, prepared on first use
-  std::map<std::string_view, stmt_ptr> statements;
-
-  explicit Connection(db_ptr db) : db(std::move(db)) {}
-};
-
-// sqlite connection factory for ConnectionPool
-class ConnectionFactory {
-  std::string uri;
-  int flags;
- public:
-  ConnectionFactory(std::string uri, int flags)
-      : uri(std::move(uri)), flags(flags) {}
-
-  auto operator()(const DoutPrefixProvider* dpp)
-    -> std::unique_ptr<Connection>
-  {
-    auto db = open_database(uri.c_str(), flags);
-    return std::make_unique<Connection>(std::move(db));
-  }
-};
-
-} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/store/dbstore/sqlite/error.cc b/src/rgw/store/dbstore/sqlite/error.cc
deleted file mode 100644 (file)
index 5fe9eb0..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "error.h"
-
-namespace rgw::dbstore::sqlite {
-
-const std::error_category& error_category()
-{
-  struct category : std::error_category {
-    const char* name() const noexcept override {
-      return "dbstore:sqlite";
-    }
-    std::string message(int ev) const override {
-      return ::sqlite3_errstr(ev);
-    }
-    std::error_condition default_error_condition(int code) const noexcept override {
-      return {code & 0xFF, category()};
-    }
-  };
-  static category instance;
-  return instance;
-}
-
-} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/store/dbstore/sqlite/error.h b/src/rgw/store/dbstore/sqlite/error.h
deleted file mode 100644 (file)
index 15396d8..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <system_error>
-#include <sqlite3.h>
-
-namespace rgw::dbstore::sqlite {
-
-// error category for sqlite extended result codes:
-//   https://www.sqlite.org/rescode.html
-const std::error_category& error_category();
-
-
-// sqlite exception type that carries the extended error code and message
-class error : public std::runtime_error {
-  std::error_code ec;
- public:
-  error(const char* errmsg, std::error_code ec)
-      : runtime_error(errmsg), ec(ec) {}
-  error(sqlite3* db, std::error_code ec) : error(::sqlite3_errmsg(db), ec) {}
-  error(sqlite3* db, int result) : error(db, {result, error_category()}) {}
-  error(sqlite3* db) : error(db, ::sqlite3_extended_errcode(db)) {}
-  std::error_code code() const { return ec; }
-};
-
-
-// sqlite error conditions for primary and extended result codes
-//
-// 'primary' error_conditions will match 'primary' error_codes as well as any
-// 'extended' error_codes whose lowest 8 bits match that primary code. for
-// example, the error_condition for SQLITE_CONSTRAINT will match the error_codes
-// SQLITE_CONSTRAINT and SQLITE_CONSTRAINT_*
-enum class errc {
-  // primary result codes
-  ok = SQLITE_OK,
-  busy = SQLITE_BUSY,
-  constraint = SQLITE_CONSTRAINT,
-  row = SQLITE_ROW,
-  done = SQLITE_DONE,
-
-  // extended result codes
-  primary_key_constraint = SQLITE_CONSTRAINT_PRIMARYKEY,
-  foreign_key_constraint = SQLITE_CONSTRAINT_FOREIGNKEY,
-  unique_constraint = SQLITE_CONSTRAINT_UNIQUE,
-
-  // ..add conditions as needed
-};
-
-inline std::error_code make_error_code(errc e)
-{
-  return {static_cast<int>(e), error_category()};
-}
-
-inline std::error_condition make_error_condition(errc e)
-{
-  return {static_cast<int>(e), error_category()};
-}
-
-} // namespace rgw::dbstore::sqlite
-
-namespace std {
-
-// enable implicit conversions from sqlite::errc to std::error_condition
-template<> struct is_error_condition_enum<
-    rgw::dbstore::sqlite::errc> : public true_type {};
-
-} // namespace std
diff --git a/src/rgw/store/dbstore/sqlite/sqliteDB.cc b/src/rgw/store/dbstore/sqlite/sqliteDB.cc
deleted file mode 100644 (file)
index b0ced45..0000000
+++ /dev/null
@@ -1,3001 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "sqliteDB.h"
-
-using namespace std;
-
-#define SQL_PREPARE(dpp, params, sdb, stmt, ret, Op)   \
-  do {                                                 \
-    string schema;                                     \
-    schema = Schema(params);                   \
-    sqlite3_prepare_v2 (*sdb, schema.c_str(),  \
-        -1, &stmt , NULL);             \
-    if (!stmt) {                                       \
-      ldpp_dout(dpp, 0) <<"failed to prepare statement " \
-      <<"for Op("<<Op<<"); Errmsg -"\
-      <<sqlite3_errmsg(*sdb)<< dendl;\
-      ret = -1;                                \
-      goto out;                                \
-    }                                          \
-    ldpp_dout(dpp, 20)<<"Successfully Prepared stmt for Op("<<Op       \
-    <<") schema("<<schema<<") stmt("<<stmt<<")"<< dendl;       \
-    ret = 0;                                   \
-  } while(0);
-
-#define SQL_BIND_INDEX(dpp, stmt, index, str, sdb)     \
-  do {                                         \
-    index = sqlite3_bind_parameter_index(stmt, str);     \
-    \
-    if (index <=0)  {                               \
-      ldpp_dout(dpp, 0) <<"failed to fetch bind parameter"\
-      " index for str("<<str<<") in "   \
-      <<"stmt("<<stmt<<"); Errmsg -"    \
-      <<sqlite3_errmsg(*sdb)<< dendl;       \
-      rc = -1;                              \
-      goto out;                                     \
-    }                                               \
-    ldpp_dout(dpp, 20)<<"Bind parameter index for str("  \
-    <<str<<") in stmt("<<stmt<<") is "  \
-    <<index<< dendl;                        \
-  }while(0);
-
-#define SQL_BIND_TEXT(dpp, stmt, index, str, sdb)                      \
-  do {                                                         \
-    rc = sqlite3_bind_text(stmt, index, str, -1, SQLITE_TRANSIENT);    \
-    if (rc != SQLITE_OK) {                                             \
-      ldpp_dout(dpp, 0)<<"sqlite bind text failed for index("          \
-      <<index<<"), str("<<str<<") in stmt("    \
-      <<stmt<<"); Errmsg - "<<sqlite3_errmsg(*sdb) \
-      << dendl;                                \
-      rc = -1;                                 \
-      goto out;                                        \
-    }                                                  \
-    ldpp_dout(dpp, 20)<<"Bind parameter text for index("  \
-    <<index<<") in stmt("<<stmt<<") is "  \
-    <<str<< dendl;                          \
-  }while(0);
-
-#define SQL_BIND_INT(dpp, stmt, index, num, sdb)                       \
-  do {                                                         \
-    rc = sqlite3_bind_int(stmt, index, num);           \
-    \
-    if (rc != SQLITE_OK) {                                     \
-      ldpp_dout(dpp, 0)<<"sqlite bind int failed for index("           \
-      <<index<<"), num("<<num<<") in stmt("    \
-      <<stmt<<"); Errmsg - "<<sqlite3_errmsg(*sdb) \
-      << dendl;                                \
-      rc = -1;                                 \
-      goto out;                                        \
-    }                                                  \
-    ldpp_dout(dpp, 20)<<"Bind parameter int for index("  \
-    <<index<<") in stmt("<<stmt<<") is "  \
-    <<num<< dendl;                          \
-  }while(0);
-
-#define SQL_BIND_BLOB(dpp, stmt, index, blob, size, sdb)               \
-  do {                                                         \
-    rc = sqlite3_bind_blob(stmt, index, blob, size, SQLITE_TRANSIENT);  \
-    \
-    if (rc != SQLITE_OK) {                                     \
-      ldpp_dout(dpp, 0)<<"sqlite bind blob failed for index("          \
-      <<index<<"), blob("<<blob<<") in stmt("          \
-      <<stmt<<"); Errmsg - "<<sqlite3_errmsg(*sdb) \
-      << dendl;                                \
-      rc = -1;                                 \
-      goto out;                                        \
-    }                                                  \
-  }while(0);
-
-#define SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, param, sdb)            \
-  do {                                                         \
-    bufferlist b;                                              \
-    encode(param, b);                                  \
-    SQL_BIND_BLOB(dpp, stmt, index, b.c_str(), b.length(), sdb); \
-  }while(0);
-
-#define SQL_READ_BLOB(dpp, stmt, index, void_ptr, len)         \
-  do {                                                         \
-    void_ptr = NULL;                                   \
-    void_ptr = (void *)sqlite3_column_blob(stmt, index);       \
-    len = sqlite3_column_bytes(stmt, index);           \
-    \
-    if (!void_ptr || len == 0) {                               \
-      ldpp_dout(dpp, 20)<<"Null value for blob index("  \
-      <<index<<") in stmt("<<stmt<<") "<< dendl;   \
-    }                                                  \
-  }while(0);
-
-#define SQL_DECODE_BLOB_PARAM(dpp, stmt, index, param, sdb)            \
-  do {                                                         \
-    bufferlist b;                                              \
-    void *blob;                                                \
-    int blob_len = 0;                                  \
-    \
-    SQL_READ_BLOB(dpp, stmt, index, blob, blob_len);           \
-    \
-    b.append(reinterpret_cast<char *>(blob), blob_len);        \
-    \
-    decode(param, b);                                  \
-  }while(0);
-
-#define SQL_EXECUTE(dpp, params, stmt, cbk, args...) \
-  do{                                          \
-    const std::lock_guard<std::mutex> lk(((DBOp*)(this))->mtx); \
-    if (!stmt) {                               \
-      ret = Prepare(dpp, params);              \
-    }                                  \
-    \
-    if (!stmt) {                               \
-      ldpp_dout(dpp, 0) <<"No prepared statement "<< dendl;    \
-      goto out;                        \
-    }                                  \
-    \
-    ret = Bind(dpp, params);                   \
-    if (ret) {                         \
-      ldpp_dout(dpp, 0) <<"Bind parameters failed for stmt(" <<stmt<<") "<< dendl;             \
-      goto out;                        \
-    }                                  \
-    \
-    ret = Step(dpp, params->op, stmt, cbk);            \
-    \
-    Reset(dpp, stmt);                          \
-    \
-    if (ret) {                         \
-      ldpp_dout(dpp, 0) <<"Execution failed for stmt(" <<stmt<<")"<< dendl;            \
-      goto out;                        \
-    }                                  \
-  }while(0);
-
-int SQLiteDB::InitPrepareParams(const DoutPrefixProvider *dpp,
-                                DBOpPrepareParams &p_params,
-                                DBOpParams* params)
-{
-  std::string bucket;
-
-  if (!params)
-    return -1;
-
-  if (params->user_table.empty()) {
-    params->user_table = getUserTable();
-  }
-  if (params->user_table.empty()) {
-    params->user_table = getUserTable();
-  }
-  if (params->bucket_table.empty()) {
-    params->bucket_table = getBucketTable();
-  }
-  if (params->quota_table.empty()) {
-    params->quota_table = getQuotaTable();
-  }
-  if (params->lc_entry_table.empty()) {
-    params->lc_entry_table = getLCEntryTable();
-  }
-  if (params->lc_head_table.empty()) {
-    params->lc_head_table = getLCHeadTable();
-  }
-
-  p_params.user_table = params->user_table;
-  p_params.bucket_table = params->bucket_table;
-  p_params.quota_table = params->quota_table;
-  p_params.lc_entry_table = params->lc_entry_table;
-  p_params.lc_head_table = params->lc_head_table;
-
-  p_params.op.query_str = params->op.query_str;
-
-  bucket = params->op.bucket.info.bucket.name;
-
-  if (!bucket.empty()) {
-    if (params->object_table.empty()) {
-      params->object_table = getObjectTable(bucket);
-    }
-    if (params->objectdata_table.empty()) {
-      params->objectdata_table = getObjectDataTable(bucket);
-    }
-    if (params->object_view.empty()) {
-      params->object_view = getObjectView(bucket);
-    }
-    if (params->object_trigger.empty()) {
-      params->object_trigger = getObjectTrigger(bucket);
-    }
-    p_params.object_table = params->object_table;
-    p_params.objectdata_table = params->objectdata_table;
-    p_params.object_view = params->object_view;
-  }
-
-  return 0;
-}
-
-static int list_callback(void *None, int argc, char **argv, char **aname)
-{
-  int i;
-  for(i=0; i < argc; i++) {
-    string arg = argv[i] ? argv[i] : "NULL";
-    cout<<aname[i]<<" = "<<arg<<"\n";
-  }
-  return 0;
-}
-
-enum GetUser {
-  UserID = 0,
-  Tenant,
-  NS,
-  DisplayName,
-  UserEmail,
-  AccessKeysID,
-  AccessKeysSecret,
-  AccessKeys,
-  SwiftKeys,
-  SubUsers,
-  Suspended,
-  MaxBuckets,
-  OpMask,
-  UserCaps,
-  Admin,
-  System,
-  PlacementName,
-  PlacementStorageClass,
-  PlacementTags,
-  BucketQuota,
-  TempURLKeys,
-  UserQuota,
-  TYPE,
-  MfaIDs,
-  AssumedRoleARN,
-  UserAttrs,
-  UserVersion,
-  UserVersionTag,
-};
-
-enum GetBucket {
-  BucketName = 0,
-  Bucket_Tenant, //Tenant
-  Marker,
-  BucketID,
-  Size,
-  SizeRounded,
-  CreationTime,
-  Count,
-  Bucket_PlacementName,
-  Bucket_PlacementStorageClass,
-  OwnerID,
-  Flags,
-  Zonegroup,
-  HasInstanceObj,
-  Quota,
-  RequesterPays,
-  HasWebsite,
-  WebsiteConf,
-  SwiftVersioning,
-  SwiftVerLocation,
-  MdsearchConfig,
-  NewBucketInstanceID,
-  ObjectLock,
-  SyncPolicyInfoGroups,
-  BucketAttrs,
-  BucketVersion,
-  BucketVersionTag,
-  Mtime,
-  Bucket_User_NS
-};
-
-enum GetObject {
-  ObjName,
-  ObjInstance,
-  ObjNS,
-  ObjBucketName,
-  ACLs,
-  IndexVer,
-  Tag,
-  ObjFlags,
-  VersionedEpoch,
-  ObjCategory,
-  Etag,
-  Owner,
-  OwnerDisplayName,
-  StorageClass,
-  Appendable,
-  ContentType,
-  IndexHashSource,
-  ObjSize,
-  AccountedSize,
-  ObjMtime,
-  Epoch,
-  ObjTag,
-  TailTag,
-  WriteTag,
-  FakeTag,
-  ShadowObj,
-  HasData,
-  IsVersioned,
-  VersionNum,
-  PGVer,
-  ZoneShortID,
-  ObjVersion,
-  ObjVersionTag,
-  ObjAttrs,
-  HeadSize,
-  MaxHeadSize,
-  ObjID,
-  TailInstance,
-  HeadPlacementRuleName,
-  HeadPlacementRuleStorageClass,
-  TailPlacementRuleName,
-  TailPlacementStorageClass,
-  ManifestPartObjs,
-  ManifestPartRules,
-  Omap,
-  IsMultipart,
-  MPPartsList,
-  HeadData,
-  Versions
-};
-
-enum GetObjectData {
-  ObjDataName,
-  ObjDataInstance,
-  ObjDataNS,
-  ObjDataBucketName,
-  ObjDataID,
-  MultipartPartStr,
-  PartNum,
-  Offset,
-  ObjDataSize,
-  ObjDataMtime,
-  ObjData
-};
-
-enum GetLCEntry {
-  LCEntryIndex,
-  LCEntryBucketName,
-  LCEntryStartTime,
-  LCEntryStatus
-};
-
-enum GetLCHead {
-  LCHeadIndex,
-  LCHeadMarker,
-  LCHeadStartDate
-};
-
-static int list_user(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
-  if (!stmt)
-    return -1;
-
-  op.user.uinfo.user_id.tenant = (const char*)sqlite3_column_text(stmt, Tenant);
-  op.user.uinfo.user_id.id = (const char*)sqlite3_column_text(stmt, UserID);
-  op.user.uinfo.user_id.ns = (const char*)sqlite3_column_text(stmt, NS);
-  op.user.uinfo.display_name = (const char*)sqlite3_column_text(stmt, DisplayName); // user_name
-  op.user.uinfo.user_email = (const char*)sqlite3_column_text(stmt, UserEmail);
-
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, SwiftKeys, op.user.uinfo.swift_keys, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, SubUsers, op.user.uinfo.subusers, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, AccessKeys, op.user.uinfo.access_keys, sdb);
-
-  op.user.uinfo.suspended = sqlite3_column_int(stmt, Suspended);
-  op.user.uinfo.max_buckets = sqlite3_column_int(stmt, MaxBuckets);
-  op.user.uinfo.op_mask = sqlite3_column_int(stmt, OpMask);
-
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, UserCaps, op.user.uinfo.caps, sdb);
-
-  op.user.uinfo.admin = sqlite3_column_int(stmt, Admin);
-  op.user.uinfo.system = sqlite3_column_int(stmt, System);
-
-  op.user.uinfo.default_placement.name = (const char*)sqlite3_column_text(stmt, PlacementName);
-
-  op.user.uinfo.default_placement.storage_class = (const char*)sqlite3_column_text(stmt, PlacementStorageClass);
-
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, PlacementTags, op.user.uinfo.placement_tags, sdb);
-
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, BucketQuota, op.user.uinfo.quota.bucket_quota, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, TempURLKeys, op.user.uinfo.temp_url_keys, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, UserQuota, op.user.uinfo.quota.user_quota, sdb);
-
-  op.user.uinfo.type = sqlite3_column_int(stmt, TYPE);
-
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, MfaIDs, op.user.uinfo.mfa_ids, sdb);
-
-  op.user.uinfo.assumed_role_arn = (const char*)sqlite3_column_text(stmt, AssumedRoleARN);
-
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, UserAttrs, op.user.user_attrs, sdb);
-  op.user.user_version.ver = sqlite3_column_int(stmt, UserVersion);
-  op.user.user_version.tag = (const char*)sqlite3_column_text(stmt, UserVersionTag);
-
-  return 0;
-}
-
-static int list_bucket(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
-  if (!stmt)
-    return -1;
-
-  op.bucket.ent.bucket.name = (const char*)sqlite3_column_text(stmt, BucketName);
-  op.bucket.ent.bucket.tenant = (const char*)sqlite3_column_text(stmt, Bucket_Tenant);
-  op.bucket.ent.bucket.marker = (const char*)sqlite3_column_text(stmt, Marker);
-  op.bucket.ent.bucket.bucket_id = (const char*)sqlite3_column_text(stmt, BucketID);
-  op.bucket.ent.size = sqlite3_column_int(stmt, Size);
-  op.bucket.ent.size_rounded = sqlite3_column_int(stmt, SizeRounded);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, CreationTime, op.bucket.ent.creation_time, sdb);
-  op.bucket.ent.count = sqlite3_column_int(stmt, Count);
-  op.bucket.ent.placement_rule.name = (const char*)sqlite3_column_text(stmt, Bucket_PlacementName);
-  op.bucket.ent.placement_rule.storage_class = (const char*)sqlite3_column_text(stmt, Bucket_PlacementStorageClass);
-
-  op.bucket.info.bucket = op.bucket.ent.bucket;
-  op.bucket.info.placement_rule = op.bucket.ent.placement_rule;
-  op.bucket.info.creation_time = op.bucket.ent.creation_time;
-
-  op.bucket.info.owner.id = (const char*)sqlite3_column_text(stmt, OwnerID);
-  op.bucket.info.owner.tenant = op.bucket.ent.bucket.tenant;
-
-  if (op.name == "GetBucket") {
-    op.bucket.info.owner.ns = (const char*)sqlite3_column_text(stmt, Bucket_User_NS);
-  }
-
-  op.bucket.info.flags = sqlite3_column_int(stmt, Flags);
-  op.bucket.info.zonegroup = (const char*)sqlite3_column_text(stmt, Zonegroup);
-  op.bucket.info.has_instance_obj = sqlite3_column_int(stmt, HasInstanceObj);
-
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, Quota, op.bucket.info.quota, sdb);
-  op.bucket.info.requester_pays = sqlite3_column_int(stmt, RequesterPays);
-  op.bucket.info.has_website = sqlite3_column_int(stmt, HasWebsite);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, WebsiteConf, op.bucket.info.website_conf, sdb);
-  op.bucket.info.swift_versioning = sqlite3_column_int(stmt, SwiftVersioning);
-  op.bucket.info.swift_ver_location = (const char*)sqlite3_column_text(stmt, SwiftVerLocation);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, MdsearchConfig, op.bucket.info.mdsearch_config, sdb);
-  op.bucket.info.new_bucket_instance_id = (const char*)sqlite3_column_text(stmt, NewBucketInstanceID);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjectLock, op.bucket.info.obj_lock, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, SyncPolicyInfoGroups, op.bucket.info.sync_policy, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, BucketAttrs, op.bucket.bucket_attrs, sdb);
-  op.bucket.bucket_version.ver = sqlite3_column_int(stmt, BucketVersion);
-  op.bucket.bucket_version.tag = (const char*)sqlite3_column_text(stmt, BucketVersionTag);
-
-  /* Read bucket version into info.objv_tracker.read_ver. No need
-   * to set write_ver as its not used anywhere. Still keeping its
-   * value same as read_ver */
-  op.bucket.info.objv_tracker.read_version = op.bucket.bucket_version;
-  op.bucket.info.objv_tracker.write_version = op.bucket.bucket_version;
-
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, Mtime, op.bucket.mtime, sdb);
-
-  op.bucket.list_entries.push_back(op.bucket.ent);
-
-  return 0;
-}
-
-static int list_object(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
-  if (!stmt)
-    return -1;
-
-  //cout<<sqlite3_column_text(stmt, 0)<<", ";
-  //cout<<sqlite3_column_text(stmt, 1) << "\n";
-
-  op.obj.state.exists = true;
-  op.obj.state.obj.key.name = (const char*)sqlite3_column_text(stmt, ObjName);
-  op.bucket.info.bucket.name = (const char*)sqlite3_column_text(stmt, ObjBucketName);
-  op.obj.state.obj.key.instance = (const char*)sqlite3_column_text(stmt, ObjInstance);
-  op.obj.state.obj.key.ns = (const char*)sqlite3_column_text(stmt, ObjNS);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, ACLs, op.obj.acls, sdb);
-  op.obj.index_ver = sqlite3_column_int(stmt, IndexVer);
-  op.obj.tag = (const char*)sqlite3_column_text(stmt, Tag);
-  op.obj.flags = sqlite3_column_int(stmt, ObjFlags); 
-  op.obj.versioned_epoch = sqlite3_column_int(stmt, VersionedEpoch);
-  op.obj.category = (RGWObjCategory)sqlite3_column_int(stmt, ObjCategory); 
-  op.obj.etag = (const char*)sqlite3_column_text(stmt, Etag);
-  op.obj.owner = (const char*)sqlite3_column_text(stmt, Owner);
-  op.obj.owner_display_name = (const char*)sqlite3_column_text(stmt, OwnerDisplayName);
-  op.obj.storage_class = (const char*)sqlite3_column_text(stmt, StorageClass);
-  op.obj.appendable = sqlite3_column_int(stmt, Appendable); 
-  op.obj.content_type = (const char*)sqlite3_column_text(stmt, ContentType);
-  op.obj.state.obj.index_hash_source = (const char*)sqlite3_column_text(stmt, IndexHashSource);
-  op.obj.state.size = sqlite3_column_int(stmt, ObjSize); 
-  op.obj.state.accounted_size = sqlite3_column_int(stmt, AccountedSize); 
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjMtime, op.obj.state.mtime, sdb);
-  op.obj.state.epoch = sqlite3_column_int(stmt, Epoch);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjTag, op.obj.state.obj_tag, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, TailTag, op.obj.state.tail_tag, sdb);
-  op.obj.state.write_tag = (const char*)sqlite3_column_text(stmt, WriteTag);
-  op.obj.state.fake_tag = sqlite3_column_int(stmt, FakeTag);
-  op.obj.state.shadow_obj = (const char*)sqlite3_column_text(stmt, ShadowObj);
-  op.obj.state.has_data = sqlite3_column_int(stmt, HasData); 
-  op.obj.is_versioned = sqlite3_column_int(stmt, IsVersioned); 
-  op.obj.version_num = sqlite3_column_int(stmt, VersionNum); 
-  op.obj.state.pg_ver = sqlite3_column_int(stmt, PGVer); 
-  op.obj.state.zone_short_id = sqlite3_column_int(stmt, ZoneShortID); 
-  op.obj.state.objv_tracker.read_version.ver = sqlite3_column_int(stmt, ObjVersion); 
-  op.obj.state.objv_tracker.read_version.tag = (const char*)sqlite3_column_text(stmt, ObjVersionTag);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjAttrs, op.obj.state.attrset, sdb);
-  op.obj.head_size = sqlite3_column_int(stmt, HeadSize); 
-  op.obj.max_head_size = sqlite3_column_int(stmt, MaxHeadSize); 
-  op.obj.obj_id = (const char*)sqlite3_column_text(stmt, ObjID);
-  op.obj.tail_instance = (const char*)sqlite3_column_text(stmt, TailInstance);
-  op.obj.head_placement_rule.name = (const char*)sqlite3_column_text(stmt, HeadPlacementRuleName);
-  op.obj.head_placement_rule.storage_class = (const char*)sqlite3_column_text(stmt, HeadPlacementRuleStorageClass);
-  op.obj.tail_placement.placement_rule.name = (const char*)sqlite3_column_text(stmt, TailPlacementRuleName);
-  op.obj.tail_placement.placement_rule.storage_class = (const char*)sqlite3_column_text(stmt, TailPlacementStorageClass);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, ManifestPartObjs, op.obj.objs, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, ManifestPartRules, op.obj.rules, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, Omap, op.obj.omap, sdb);
-  op.obj.is_multipart = sqlite3_column_int(stmt, IsMultipart);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, MPPartsList, op.obj.mp_parts, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, HeadData, op.obj.head_data, sdb);
-  op.obj.state.data = op.obj.head_data;
-
-  rgw_bucket_dir_entry dent;
-  dent.key.name = op.obj.state.obj.key.name;
-  dent.key.instance = op.obj.state.obj.key.instance;
-  dent.tag = op.obj.tag;
-  dent.flags = op.obj.flags;
-  dent.versioned_epoch = op.obj.versioned_epoch;
-  dent.index_ver = op.obj.index_ver;
-  dent.exists = true;
-  dent.meta.category = op.obj.category;
-  dent.meta.size = op.obj.state.size;
-  dent.meta.accounted_size = op.obj.state.accounted_size;
-  dent.meta.mtime = op.obj.state.mtime;
-  dent.meta.etag = op.obj.etag;
-  dent.meta.owner = op.obj.owner;
-  dent.meta.owner_display_name = op.obj.owner_display_name;
-  dent.meta.content_type = op.obj.content_type;
-  dent.meta.storage_class = op.obj.storage_class;
-  dent.meta.appendable = op.obj.appendable;
-
-  op.obj.list_entries.push_back(dent);
-  return 0;
-}
-
-static int get_objectdata(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
-  if (!stmt)
-    return -1;
-
-  op.obj.state.obj.key.name = (const char*)sqlite3_column_text(stmt, ObjName);
-  op.bucket.info.bucket.name = (const char*)sqlite3_column_text(stmt, ObjBucketName);
-  op.obj.state.obj.key.instance = (const char*)sqlite3_column_text(stmt, ObjInstance);
-  op.obj.state.obj.key.ns = (const char*)sqlite3_column_text(stmt, ObjNS);
-  op.obj.obj_id = (const char*)sqlite3_column_text(stmt, ObjDataID);
-  op.obj_data.part_num = sqlite3_column_int(stmt, PartNum);
-  op.obj_data.offset = sqlite3_column_int(stmt, Offset);
-  op.obj_data.size = sqlite3_column_int(stmt, ObjDataSize);
-  op.obj_data.multipart_part_str = (const char*)sqlite3_column_text(stmt, MultipartPartStr);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjDataMtime, op.obj.state.mtime, sdb);
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjData, op.obj_data.data, sdb);
-
-  return 0;
-}
-
-static int list_lc_entry(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
-  if (!stmt)
-    return -1;
-
-  op.lc_entry.index = (const char*)sqlite3_column_text(stmt, LCEntryIndex);
-  op.lc_entry.entry.set_bucket((const char*)sqlite3_column_text(stmt, LCEntryBucketName));
-  op.lc_entry.entry.set_start_time(sqlite3_column_int(stmt, LCEntryStartTime));
-  op.lc_entry.entry.set_status(sqlite3_column_int(stmt, LCEntryStatus));
-  op.lc_entry.list_entries.push_back(op.lc_entry.entry);
-
-  return 0;
-}
-
-static int list_lc_head(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
-  if (!stmt)
-    return -1;
-
-  int64_t start_date;
-
-  op.lc_head.index = (const char*)sqlite3_column_text(stmt, LCHeadIndex);
-  op.lc_head.head.set_marker((const char*)sqlite3_column_text(stmt, LCHeadMarker));
-  SQL_DECODE_BLOB_PARAM(dpp, stmt, LCHeadStartDate, start_date, sdb);
-  op.lc_head.head.get_start_date() = start_date;
-
-  return 0;
-}
-
-int SQLiteDB::InitializeDBOps(const DoutPrefixProvider *dpp)
-{
-  (void)createTables(dpp);
-  dbops.InsertUser = make_shared<SQLInsertUser>(&this->db, this->getDBname(), cct);
-  dbops.RemoveUser = make_shared<SQLRemoveUser>(&this->db, this->getDBname(), cct);
-  dbops.GetUser = make_shared<SQLGetUser>(&this->db, this->getDBname(), cct);
-  dbops.InsertBucket = make_shared<SQLInsertBucket>(&this->db, this->getDBname(), cct);
-  dbops.UpdateBucket = make_shared<SQLUpdateBucket>(&this->db, this->getDBname(), cct);
-  dbops.RemoveBucket = make_shared<SQLRemoveBucket>(&this->db, this->getDBname(), cct);
-  dbops.GetBucket = make_shared<SQLGetBucket>(&this->db, this->getDBname(), cct);
-  dbops.ListUserBuckets = make_shared<SQLListUserBuckets>(&this->db, this->getDBname(), cct);
-  dbops.InsertLCEntry = make_shared<SQLInsertLCEntry>(&this->db, this->getDBname(), cct);
-  dbops.RemoveLCEntry = make_shared<SQLRemoveLCEntry>(&this->db, this->getDBname(), cct);
-  dbops.GetLCEntry = make_shared<SQLGetLCEntry>(&this->db, this->getDBname(), cct);
-  dbops.ListLCEntries = make_shared<SQLListLCEntries>(&this->db, this->getDBname(), cct);
-  dbops.InsertLCHead = make_shared<SQLInsertLCHead>(&this->db, this->getDBname(), cct);
-  dbops.RemoveLCHead = make_shared<SQLRemoveLCHead>(&this->db, this->getDBname(), cct);
-  dbops.GetLCHead = make_shared<SQLGetLCHead>(&this->db, this->getDBname(), cct);
-
-  return 0;
-}
-
-void *SQLiteDB::openDB(const DoutPrefixProvider *dpp)
-{
-  string dbname;
-  int rc = 0;
-
-  dbname = getDBfile();
-  if (dbname.empty()) {
-    ldpp_dout(dpp, 0)<<"dbname is NULL" << dendl;
-    goto out;
-  }
-
-  rc = sqlite3_open_v2(dbname.c_str(), (sqlite3**)&db,
-      SQLITE_OPEN_READWRITE |
-      SQLITE_OPEN_CREATE |
-      SQLITE_OPEN_FULLMUTEX,
-      NULL);
-
-  if (rc) {
-    ldpp_dout(dpp, 0) <<"Cant open "<<dbname<<"; Errmsg - "\
-      <<sqlite3_errmsg((sqlite3*)db) <<  dendl;
-  } else {
-    ldpp_dout(dpp, 0) <<"Opened database("<<dbname<<") successfully" <<  dendl;
-  }
-
-  exec(dpp, "PRAGMA foreign_keys=ON", NULL);
-
-out:
-  return db;
-}
-
-int SQLiteDB::closeDB(const DoutPrefixProvider *dpp)
-{
-  if (db)
-    sqlite3_close((sqlite3 *)db);
-
-  db = NULL;
-
-  return 0;
-}
-
-int SQLiteDB::Reset(const DoutPrefixProvider *dpp, sqlite3_stmt *stmt)
-{
-  int ret = -1;
-
-  if (!stmt) {
-    return -1;
-  }
-  sqlite3_clear_bindings(stmt);
-  ret = sqlite3_reset(stmt);
-
-  return ret;
-}
-
-int SQLiteDB::Step(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt,
-    int (*cbk)(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt))
-{
-  int ret = -1;
-
-  if (!stmt) {
-    return -1;
-  }
-
-again:
-  ret = sqlite3_step(stmt);
-
-  if ((ret != SQLITE_DONE) && (ret != SQLITE_ROW)) {
-    ldpp_dout(dpp, 0)<<"sqlite step failed for stmt("<<stmt \
-      <<"); Errmsg - "<<sqlite3_errmsg((sqlite3*)db) << dendl;
-    return -1;
-  } else if (ret == SQLITE_ROW) {
-    if (cbk) {
-      (*cbk)(dpp, op, stmt);
-    } else {
-    }
-    goto again;
-  }
-
-  ldpp_dout(dpp, 20)<<"sqlite step successfully executed for stmt(" \
-    <<stmt<<")  ret = " << ret << dendl;
-
-  return 0;
-}
-
-int SQLiteDB::exec(const DoutPrefixProvider *dpp, const char *schema,
-    int (*callback)(void*,int,char**,char**))
-{
-  int ret = -1;
-  char *errmsg = NULL;
-
-  if (!db)
-    goto out;
-
-  ret = sqlite3_exec((sqlite3*)db, schema, callback, 0, &errmsg);
-  if (ret != SQLITE_OK) {
-    ldpp_dout(dpp, 0) <<"sqlite exec failed for schema("<<schema \
-      <<"); Errmsg - "<<errmsg <<  dendl;
-    sqlite3_free(errmsg);
-    goto out;
-  }
-  ret = 0;
-  ldpp_dout(dpp, 10) <<"sqlite exec successfully processed for schema(" \
-    <<schema<<")" <<  dendl;
-out:
-  return ret;
-}
-
-int SQLiteDB::createTables(const DoutPrefixProvider *dpp)
-{
-  int ret = -1;
-  int cu = 0, cb = 0, cq = 0;
-  DBOpParams params = {};
-
-  params.user_table = getUserTable();
-  params.bucket_table = getBucketTable();
-
-  if ((cu = createUserTable(dpp, &params)))
-    goto out;
-
-  if ((cb = createBucketTable(dpp, &params)))
-    goto out;
-
-  if ((cq = createQuotaTable(dpp, &params)))
-    goto out;
-
-  ret = 0;
-out:
-  if (ret) {
-    if (cu)
-      DeleteUserTable(dpp, &params);
-    if (cb)
-      DeleteBucketTable(dpp, &params);
-    ldpp_dout(dpp, 0)<<"Creation of tables failed" << dendl;
-  }
-
-  return ret;
-}
-
-int SQLiteDB::createUserTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = CreateTableSchema("User", params);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"CreateUserTable failed" << dendl;
-
-  ldpp_dout(dpp, 20)<<"CreateUserTable suceeded" << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::createBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = CreateTableSchema("Bucket", params);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"CreateBucketTable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"CreateBucketTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::createObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = CreateTableSchema("Object", params);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"CreateObjectTable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"CreateObjectTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::createObjectTableTrigger(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = CreateTableSchema("ObjectTrigger", params);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"CreateObjectTableTrigger failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"CreateObjectTableTrigger suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::createObjectView(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = CreateTableSchema("ObjectView", params);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"CreateObjectView failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"CreateObjectView suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::createQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = CreateTableSchema("Quota", params);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"CreateQuotaTable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"CreateQuotaTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::createObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = CreateTableSchema("ObjectData", params);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"CreateObjectDataTable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"CreateObjectDataTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::createLCTables(const DoutPrefixProvider *dpp)
-{
-  int ret = -1;
-  string schema;
-  DBOpParams params = {};
-
-  params.lc_entry_table = getLCEntryTable();
-  params.lc_head_table = getLCHeadTable();
-  params.bucket_table = getBucketTable();
-
-  schema = CreateTableSchema("LCEntry", &params);
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"CreateLCEntryTable failed" << dendl;
-    return ret;
-  }
-  ldpp_dout(dpp, 20)<<"CreateLCEntryTable suceeded" << dendl;
-
-  schema = CreateTableSchema("LCHead", &params);
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret) {
-    ldpp_dout(dpp, 0)<<"CreateLCHeadTable failed" << dendl;
-    (void)DeleteLCEntryTable(dpp, &params);
-  }
-  ldpp_dout(dpp, 20)<<"CreateLCHeadTable suceeded" << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::DeleteUserTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = DeleteTableSchema(params->user_table);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"DeleteUserTable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"DeleteUserTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::DeleteBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = DeleteTableSchema(params->bucket_table);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"DeletebucketTable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"DeletebucketTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::DeleteObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = DeleteTableSchema(params->object_table);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"DeleteObjectTable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"DeleteObjectTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::DeleteObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = DeleteTableSchema(params->objectdata_table);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"DeleteObjectDataTable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"DeleteObjectDataTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::DeleteQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = DeleteTableSchema(params->quota_table);
-
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"DeleteQuotaTable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"DeleteQuotaTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::DeleteLCEntryTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = DeleteTableSchema(params->lc_entry_table);
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"DeleteLCEntryTable failed " << dendl;
-  ldpp_dout(dpp, 20)<<"DeleteLCEntryTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::DeleteLCHeadTable(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = DeleteTableSchema(params->lc_head_table);
-  ret = exec(dpp, schema.c_str(), NULL);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"DeleteLCHeadTable failed " << dendl;
-  ldpp_dout(dpp, 20)<<"DeleteLCHeadTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = ListTableSchema(params->user_table);
-  ret = exec(dpp, schema.c_str(), &list_callback);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"GetUsertable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"GetUserTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-
-  schema = ListTableSchema(params->bucket_table);
-
-  ret = exec(dpp, schema.c_str(), &list_callback);
-  if (ret)
-    ldpp_dout(dpp, 0)<<"Listbuckettable failed " << dendl;
-
-  ldpp_dout(dpp, 20)<<"ListbucketTable suceeded " << dendl;
-
-  return ret;
-}
-
-int SQLiteDB::ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params)
-{
-  int ret = -1;
-  string schema;
-  map<string, class ObjectOp*>::iterator iter;
-  map<string, class ObjectOp*> objectmap;
-  string bucket;
-
-  objectmap = getObjectMap();
-
-  if (objectmap.empty())
-    ldpp_dout(dpp, 20)<<"objectmap empty " << dendl;
-
-  for (iter = objectmap.begin(); iter != objectmap.end(); ++iter) {
-    bucket = iter->first;
-    params->object_table = getObjectTable(bucket);
-    schema = ListTableSchema(params->object_table);
-
-    ret = exec(dpp, schema.c_str(), &list_callback);
-    if (ret)
-      ldpp_dout(dpp, 0)<<"ListObjecttable failed " << dendl;
-
-    ldpp_dout(dpp, 20)<<"ListObjectTable suceeded " << dendl;
-  }
-
-  return ret;
-}
-
-int SQLObjectOp::InitializeObjectOps(string db_name, const DoutPrefixProvider *dpp)
-{
-  PutObject = make_shared<SQLPutObject>(sdb, db_name, cct);
-  DeleteObject = make_shared<SQLDeleteObject>(sdb, db_name, cct);
-  GetObject = make_shared<SQLGetObject>(sdb, db_name, cct);
-  UpdateObject = make_shared<SQLUpdateObject>(sdb, db_name, cct);
-  ListBucketObjects = make_shared<SQLListBucketObjects>(sdb, db_name, cct);
-  ListVersionedObjects = make_shared<SQLListVersionedObjects>(sdb, db_name, cct);
-  PutObjectData = make_shared<SQLPutObjectData>(sdb, db_name, cct);
-  UpdateObjectData = make_shared<SQLUpdateObjectData>(sdb, db_name, cct);
-  GetObjectData = make_shared<SQLGetObjectData>(sdb, db_name, cct);
-  DeleteObjectData = make_shared<SQLDeleteObjectData>(sdb, db_name, cct);
-  DeleteStaleObjectData = make_shared<SQLDeleteStaleObjectData>(sdb, db_name, cct);
-
-  return 0;
-}
-
-int SQLInsertUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLInsertUser - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertUser");
-out:
-  return ret;
-}
-
-int SQLInsertUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.tenant, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.tenant.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.ns, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.ns.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.display_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.display_name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_email, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_email.c_str(), sdb);
-
-  if (!params->op.user.uinfo.access_keys.empty()) {
-    string access_key;
-    string key;
-    map<string, RGWAccessKey>::const_iterator it =
-      params->op.user.uinfo.access_keys.begin();
-    const RGWAccessKey& k = it->second;
-    access_key = k.id;
-    key = k.key;
-
-    SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys_id, sdb);
-    SQL_BIND_TEXT(dpp, stmt, index, access_key.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys_secret, sdb);
-    SQL_BIND_TEXT(dpp, stmt, index, key.c_str(), sdb);
-
-  }
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.access_keys, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.swift_keys, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.swift_keys, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.subusers, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.subusers, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.suspended, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.suspended, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.max_buckets, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.max_buckets, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.op_mask, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.op_mask, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_caps, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.caps, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.admin, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.admin, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.system, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.system, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.default_placement.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_storage_class, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.default_placement.storage_class.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_tags, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.placement_tags, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.bucket_quota, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.quota.bucket_quota, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.temp_url_keys, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.temp_url_keys, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_quota, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.quota.user_quota, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.type, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.type, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.mfa_ids, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.mfa_ids, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.assumed_role_arn, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.assumed_role_arn.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_attrs, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.user_attrs, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_ver, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.user.user_version.ver, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_ver_tag, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.user_version.tag.c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLInsertUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLRemoveUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLRemoveUser - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveUser");
-out:
-  return ret;
-}
-
-int SQLRemoveUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLRemoveUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLGetUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLGetUser - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  if (params->op.query_str == "email") { 
-    SQL_PREPARE(dpp, p_params, sdb, email_stmt, ret, "PrepareGetUser");
-  } else if (params->op.query_str == "access_key") { 
-    SQL_PREPARE(dpp, p_params, sdb, ak_stmt, ret, "PrepareGetUser");
-  } else if (params->op.query_str == "user_id") { 
-    SQL_PREPARE(dpp, p_params, sdb, userid_stmt, ret, "PrepareGetUser");
-  } else { // by default by userid
-    SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetUser");
-  }
-out:
-  return ret;
-}
-
-int SQLGetUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (params->op.query_str == "email") { 
-    SQL_BIND_INDEX(dpp, email_stmt, index, p_params.op.user.user_email, sdb);
-    SQL_BIND_TEXT(dpp, email_stmt, index, params->op.user.uinfo.user_email.c_str(), sdb);
-  } else if (params->op.query_str == "access_key") { 
-    if (!params->op.user.uinfo.access_keys.empty()) {
-      string access_key;
-      map<string, RGWAccessKey>::const_iterator it =
-        params->op.user.uinfo.access_keys.begin();
-      const RGWAccessKey& k = it->second;
-      access_key = k.id;
-
-      SQL_BIND_INDEX(dpp, ak_stmt, index, p_params.op.user.access_keys_id, sdb);
-      SQL_BIND_TEXT(dpp, ak_stmt, index, access_key.c_str(), sdb);
-    }
-  } else if (params->op.query_str == "user_id") { 
-    SQL_BIND_INDEX(dpp, userid_stmt, index, p_params.op.user.user_id, sdb);
-    SQL_BIND_TEXT(dpp, userid_stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
-  } else { // by default by userid
-    SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
-    SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
-  }
-
-out:
-  return rc;
-}
-
-int SQLGetUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  if (params->op.query_str == "email") { 
-    SQL_EXECUTE(dpp, params, email_stmt, list_user);
-  } else if (params->op.query_str == "access_key") { 
-    SQL_EXECUTE(dpp, params, ak_stmt, list_user);
-  } else if (params->op.query_str == "user_id") { 
-    SQL_EXECUTE(dpp, params, userid_stmt, list_user);
-  } else { // by default by userid
-    SQL_EXECUTE(dpp, params, stmt, list_user);
-  }
-
-out:
-  return ret;
-}
-
-int SQLInsertBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLInsertBucket - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertBucket");
-
-out:
-  return ret;
-}
-
-int SQLInsertBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  // user_id here is copied as OwnerID in the bucket table.
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.tenant, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.tenant.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.marker, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.marker.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_id, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.bucket_id.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.size, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.size, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.size_rounded, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.size_rounded, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.creation_time, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.creation_time, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.count, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.count, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.placement_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.placement_rule.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.placement_storage_class, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.placement_rule.storage_class.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.flags, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.flags, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.zonegroup, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.zonegroup.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.has_instance_obj, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.has_instance_obj, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.quota, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.quota, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.requester_pays, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.requester_pays, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.has_website, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.has_website, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.website_conf, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.website_conf, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.swift_versioning, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.swift_versioning, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.swift_ver_location, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.swift_ver_location.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.mdsearch_config, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.mdsearch_config, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.new_bucket_instance_id, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.new_bucket_instance_id.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.obj_lock, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.obj_lock, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.sync_policy_info_groups, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.sync_policy, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_attrs, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.bucket_attrs, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_ver, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.bucket_version.ver, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_ver_tag, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.bucket_version.tag.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.mtime, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.mtime, sdb);
-
-out:
-  return rc;
-}
-
-int SQLInsertBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  class SQLObjectOp *ObPtr = NULL;
-  string bucket_name = params->op.bucket.info.bucket.name;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  ObPtr = new SQLObjectOp(sdb, ctx());
-
-  objectmapInsert(dpp, bucket_name, ObPtr);
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-
-  /* Once Bucket is inserted created corresponding object(&data) tables
-   */
-  InitPrepareParams(dpp, p_params, params);
-
-  (void)createObjectTable(dpp, params);
-  (void)createObjectDataTable(dpp, params);
-  (void)createObjectTableTrigger(dpp, params);
-out:
-  return ret;
-}
-
-int SQLUpdateBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  if (params->op.query_str == "attrs") { 
-    SQL_PREPARE(dpp, p_params, sdb, attrs_stmt, ret, "PrepareUpdateBucket");
-  } else if (params->op.query_str == "owner") { 
-    SQL_PREPARE(dpp, p_params, sdb, owner_stmt, ret, "PrepareUpdateBucket");
-  } else if (params->op.query_str == "info") { 
-    SQL_PREPARE(dpp, p_params, sdb, info_stmt, ret, "PrepareUpdateBucket");
-  } else {
-    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" <<
-      params->op.query_str << "" << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int SQLUpdateBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-  sqlite3_stmt** stmt = NULL; // Prepared statement
-
-  /* All below fields for attrs */
-  if (params->op.query_str == "attrs") { 
-    stmt = &attrs_stmt;
-  } else if (params->op.query_str == "owner") { 
-    stmt = &owner_stmt;
-  } else if (params->op.query_str == "info") { 
-    stmt = &info_stmt;
-  } else {
-    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" <<
-      params->op.query_str << "" << dendl;
-    goto out;
-  }
-
-  if (params->op.query_str == "attrs") { 
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_attrs, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.bucket_attrs, sdb);
-  } else if (params->op.query_str == "owner") { 
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.creation_time, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.creation_time, sdb);
-  } else if (params->op.query_str == "info") { 
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.tenant, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.tenant.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.marker, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.marker.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_id, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.bucket_id.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.creation_time, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.creation_time, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.count, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.ent.count, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.placement_name, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.placement_rule.name.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.placement_storage_class, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.placement_rule.storage_class.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.flags, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.flags, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.zonegroup, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.zonegroup.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.has_instance_obj, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.has_instance_obj, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.quota, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.quota, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.requester_pays, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.requester_pays, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.has_website, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.has_website, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.website_conf, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.website_conf, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.swift_versioning, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.swift_versioning, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.swift_ver_location, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.swift_ver_location.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.mdsearch_config, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.mdsearch_config, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.new_bucket_instance_id, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.new_bucket_instance_id.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.obj_lock, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.obj_lock, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.sync_policy_info_groups, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.sync_policy, sdb);
-  }
-
-  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.user.user_id, sdb);
-  SQL_BIND_TEXT(dpp, *stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_ver, sdb);
-  SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.bucket_version.ver, sdb);
-
-  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.mtime, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.mtime, sdb);
-
-out:
-  return rc;
-}
-
-int SQLUpdateBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  sqlite3_stmt** stmt = NULL; // Prepared statement
-
-  if (params->op.query_str == "attrs") { 
-    stmt = &attrs_stmt;
-  } else if (params->op.query_str == "owner") { 
-    stmt = &owner_stmt;
-  } else if (params->op.query_str == "info") { 
-    stmt = &info_stmt;
-  } else {
-    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" <<
-      params->op.query_str << "" << dendl;
-    goto out;
-  }
-
-  SQL_EXECUTE(dpp, params, *stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLRemoveBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLRemoveBucket - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveBucket");
-
-out:
-  return ret;
-}
-
-int SQLRemoveBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLRemoveBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  objectmapDelete(dpp, params->op.bucket.info.bucket.name);
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLGetBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLGetBucket - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetBucket");
-
-out:
-  return ret;
-}
-
-int SQLGetBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLGetBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  class SQLObjectOp *ObPtr = NULL;
-
-  params->op.name = "GetBucket";
-
-  ObPtr = new SQLObjectOp(sdb, ctx());
-
-  /* For the case when the  server restarts, need to reinsert objectmap*/
-  objectmapInsert(dpp, params->op.bucket.info.bucket.name, ObPtr);
-  SQL_EXECUTE(dpp, params, stmt, list_bucket);
-out:
-  return ret;
-}
-
-int SQLListUserBuckets::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLListUserBuckets - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  if (params->op.query_str == "all") { 
-    SQL_PREPARE(dpp, p_params, sdb, all_stmt, ret, "PrepareListUserBuckets");
-  }else {
-    SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListUserBuckets");
-  }
-
-out:
-  return ret;
-}
-
-int SQLListUserBuckets::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-  sqlite3_stmt** pstmt = NULL; // Prepared statement
-
-  if (params->op.query_str == "all") { 
-    pstmt = &all_stmt;
-  } else { 
-    pstmt = &stmt;
-  }
-
-  if (params->op.query_str != "all") { 
-    SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.user.user_id, sdb);
-    SQL_BIND_TEXT(dpp, *pstmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
-  }
-
-  SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.bucket.min_marker, sdb);
-  SQL_BIND_TEXT(dpp, *pstmt, index, params->op.bucket.min_marker.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.list_max_count, sdb);
-  SQL_BIND_INT(dpp, *pstmt, index, params->op.list_max_count, sdb);
-
-out:
-  return rc;
-}
-
-int SQLListUserBuckets::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  if (params->op.query_str == "all") { 
-    SQL_EXECUTE(dpp, params, all_stmt, list_bucket);
-  } else {
-    SQL_EXECUTE(dpp, params, stmt, list_bucket);
-  }
-out:
-  return ret;
-}
-
-int SQLPutObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLPutObject - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PreparePutObject");
-
-out:
-  return ret;
-}
-
-int SQLPutObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  int VersionNum = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (params->op.obj.state.obj.key.instance.empty()) {
-    params->op.obj.state.obj.key.instance = "null";
-  }
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_ns, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.acls, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.acls, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.index_ver, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.index_ver, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tag, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tag.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.flags, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.flags, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.versioned_epoch, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.versioned_epoch, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_category, sdb);
-  SQL_BIND_INT(dpp, stmt, index, (uint8_t)(params->op.obj.category), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.etag, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.etag.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.owner, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.owner.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.owner_display_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.owner_display_name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.storage_class, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.storage_class.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.appendable, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.appendable, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.content_type, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.content_type.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.index_hash_source, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.index_hash_source.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_size, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.size, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.accounted_size, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.accounted_size, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.epoch, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.epoch, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_tag, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.obj_tag, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_tag, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.tail_tag, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.write_tag, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.write_tag.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.fake_tag, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.fake_tag, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.shadow_obj, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.shadow_obj.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.has_data, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.has_data, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.is_versioned, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.is_versioned, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.version_num, sdb);
-  SQL_BIND_INT(dpp, stmt, index, VersionNum, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.pg_ver, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.pg_ver, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.zone_short_id, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.zone_short_id, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_version, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.objv_tracker.read_version.ver, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_version_tag, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.objv_tracker.read_version.tag.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_attrs, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.attrset, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_size, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.head_size, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.max_head_size, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.max_head_size, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_instance, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_instance.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_placement_rule_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.head_placement_rule.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_placement_storage_class, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.head_placement_rule.storage_class.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_placement_rule_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_placement.placement_rule.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_placement_storage_class, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_placement.placement_rule.storage_class.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.manifest_part_objs, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.objs, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.manifest_part_rules, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.rules, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.omap, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.omap, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.is_multipart, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj.is_multipart, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mp_parts, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.mp_parts, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_data, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.head_data, sdb);
-
-out:
-  return rc;
-}
-
-int SQLPutObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLDeleteObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLDeleteObject - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteObject");
-
-out:
-  return ret;
-}
-
-int SQLDeleteObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (params->op.obj.state.obj.key.instance.empty()) {
-    params->op.obj.state.obj.key.instance = "null";
-  }
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
-out:
-  return rc;
-}
-
-int SQLDeleteObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLGetObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLGetObject - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetObject");
-
-out:
-  return ret;
-}
-
-int SQLGetObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (params->op.obj.state.obj.key.instance.empty()) {
-    params->op.obj.state.obj.key.instance = "null";
-  }
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLGetObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, list_object);
-out:
-  return ret;
-}
-
-int SQLUpdateObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-  struct DBOpParams copy = *params;
-  string bucket_name;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLUpdateObject - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  if (params->op.query_str == "omap") {
-    SQL_PREPARE(dpp, p_params, sdb, omap_stmt, ret, "PrepareUpdateObject");
-  } else if (params->op.query_str == "attrs") {
-    SQL_PREPARE(dpp, p_params, sdb, attrs_stmt, ret, "PrepareUpdateObject");
-  } else if (params->op.query_str == "meta") {
-    SQL_PREPARE(dpp, p_params, sdb, meta_stmt, ret, "PrepareUpdateObject");
-  } else if (params->op.query_str == "mp") {
-    SQL_PREPARE(dpp, p_params, sdb, mp_stmt, ret, "PrepareUpdateObject");
-  } else {
-    ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" <<
-      params->op.query_str << dendl;
-    goto out;
-  }
-
-out:
-  return ret;
-}
-
-int SQLUpdateObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-  sqlite3_stmt** stmt = NULL; // Prepared statement
-
-  /* All below fields for attrs */
-  if (params->op.query_str == "omap") { 
-    stmt = &omap_stmt;
-  } else if (params->op.query_str == "attrs") { 
-    stmt = &attrs_stmt;
-  } else if (params->op.query_str == "meta") { 
-    stmt = &meta_stmt;
-  } else if (params->op.query_str == "mp") { 
-    stmt = &mp_stmt;
-  } else {
-    ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" <<
-      params->op.query_str << dendl;
-    goto out;
-  }
-
-  if (params->op.obj.state.obj.key.instance.empty()) {
-    params->op.obj.state.obj.key.instance = "null";
-  }
-
-  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_name, sdb);
-  SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_instance, sdb);
-  SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mtime, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.mtime, sdb);
-
-  if (params->op.query_str == "omap") { 
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.omap, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.omap, sdb);
-  }
-  if (params->op.query_str == "attrs") { 
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_attrs, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.attrset, sdb);
-  }
-  if (params->op.query_str == "mp") { 
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mp_parts, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.mp_parts, sdb);
-  }
-  if (params->op.query_str == "meta") { 
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_ns, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.acls, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.acls, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.index_ver, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.index_ver, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tag, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tag.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.flags, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.flags, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.versioned_epoch, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.versioned_epoch, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_category, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, (uint8_t)(params->op.obj.category), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.etag, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.etag.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.owner, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.owner.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.owner_display_name, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.owner_display_name.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.storage_class, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.storage_class.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.appendable, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.appendable, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.content_type, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.content_type.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.index_hash_source, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.index_hash_source.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_size, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.size, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.accounted_size, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.accounted_size, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.epoch, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.epoch, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_tag, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.obj_tag, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_tag, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.tail_tag, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.write_tag, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.write_tag.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.fake_tag, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.fake_tag, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.shadow_obj, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.shadow_obj.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.has_data, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.has_data, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.is_versioned, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.is_versioned, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.version_num, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.version_num, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.pg_ver, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.pg_ver, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.zone_short_id, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.zone_short_id, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_version, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.objv_tracker.read_version.ver, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_version_tag, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.objv_tracker.read_version.tag.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_attrs, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.attrset, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_size, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.head_size, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.max_head_size, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.max_head_size, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_id, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.obj_id.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_instance, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_instance.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_placement_rule_name, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.head_placement_rule.name.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_placement_storage_class, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.head_placement_rule.storage_class.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_placement_rule_name, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_placement.placement_rule.name.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_placement_storage_class, sdb);
-    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_placement.placement_rule.storage_class.c_str(), sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.manifest_part_objs, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.objs, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.manifest_part_rules, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.rules, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.omap, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.omap, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.is_multipart, sdb);
-    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.is_multipart, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mp_parts, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.mp_parts, sdb);
-
-    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_data, sdb);
-    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.head_data, sdb);
-  }
-
-out:
-  return rc;
-}
-
-int SQLUpdateObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  sqlite3_stmt** stmt = NULL; // Prepared statement
-
-  if (params->op.query_str == "omap") { 
-    stmt = &omap_stmt;
-  } else if (params->op.query_str == "attrs") { 
-    stmt = &attrs_stmt;
-  } else if (params->op.query_str == "meta") { 
-    stmt = &meta_stmt;
-  } else if (params->op.query_str == "mp") { 
-    stmt = &mp_stmt;
-  } else {
-    ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" <<
-      params->op.query_str << dendl;
-    goto out;
-  }
-
-  SQL_EXECUTE(dpp, params, *stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLListBucketObjects::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLListBucketObjects - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListBucketObjects");
-
-out:
-  return ret;
-}
-
-int SQLListBucketObjects::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (params->op.obj.state.obj.key.instance.empty()) {
-    params->op.obj.state.obj.key.instance = "null";
-  }
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.min_marker, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.min_marker.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.prefix, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.prefix.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb);
-
-out:
-  return rc;
-}
-
-int SQLListBucketObjects::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, list_object);
-out:
-  return ret;
-}
-
-int SQLListVersionedObjects::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLListVersionedObjects - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListVersionedObjects");
-
-out:
-  return ret;
-}
-
-int SQLListVersionedObjects::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (params->op.obj.state.obj.key.instance.empty()) {
-    params->op.obj.state.obj.key.instance = "null";
-  }
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb);
-
-out:
-  return rc;
-}
-
-int SQLListVersionedObjects::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, list_object);
-out:
-  return ret;
-}
-
-int SQLPutObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLPutObjectData - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PreparePutObjectData");
-
-out:
-  return ret;
-}
-
-int SQLPutObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (params->op.obj.state.obj.key.instance.empty()) {
-    params->op.obj.state.obj.key.instance = "null";
-  }
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
-
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
-
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_ns, sdb);
-
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.part_num, sdb);
-
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.part_num, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.offset, sdb);
-
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.offset, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.data, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj_data.data, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.size, sdb);
-
-  SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.size, sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.multipart_part_str, sdb);
-
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj_data.multipart_part_str.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
-
-out:
-  return rc;
-}
-
-int SQLPutObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLUpdateObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLUpdateObjectData - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareUpdateObjectData");
-
-out:
-  return ret;
-}
-
-int SQLUpdateObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (params->op.obj.state.obj.key.instance.empty()) {
-    params->op.obj.state.obj.key.instance = "null";
-  }
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
-
-out:
-  return rc;
-}
-
-int SQLUpdateObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLGetObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLGetObjectData - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetObjectData");
-
-out:
-  return ret;
-}
-
-int SQLGetObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (params->op.obj.state.obj.key.instance.empty()) {
-    params->op.obj.state.obj.key.instance = "null";
-  }
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLGetObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, get_objectdata);
-out:
-  return ret;
-}
-
-int SQLDeleteObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLDeleteObjectData - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteObjectData");
-
-out:
-  return ret;
-}
-
-int SQLDeleteObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (params->op.obj.state.obj.key.instance.empty()) {
-    params->op.obj.state.obj.key.instance = "null";
-  }
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLDeleteObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLDeleteStaleObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLDeleteStaleObjectData - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteStaleObjectData");
-
-out:
-  return ret;
-}
-
-int SQLDeleteStaleObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
-
-out:
-  return rc;
-}
-
-int SQLDeleteStaleObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLInsertLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLInsertLCEntry - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertLCEntry");
-
-out:
-  return ret;
-}
-
-int SQLInsertLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.status, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_status(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.start_time, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_start_time(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLInsertLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLRemoveLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLRemoveLCEntry - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveLCEntry");
-
-out:
-  return ret;
-}
-
-int SQLRemoveLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLRemoveLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLGetLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  sqlite3_stmt** pstmt = NULL; // Prepared statement
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLGetLCEntry - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  if (params->op.query_str == "get_next_entry") {
-    pstmt = &next_stmt;
-  } else {
-    pstmt = &stmt;
-  }
-  SQL_PREPARE(dpp, p_params, sdb, *pstmt, ret, "PrepareGetLCEntry");
-
-out:
-  return ret;
-}
-
-int SQLGetLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-  sqlite3_stmt** pstmt = NULL; // Prepared statement
-
-  if (params->op.query_str == "get_next_entry") {
-    pstmt = &next_stmt;
-  } else {
-    pstmt = &stmt;
-  }
-  SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.lc_entry.index, sdb);
-  SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.index.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.lc_entry.bucket_name, sdb);
-  SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLGetLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  sqlite3_stmt** pstmt = NULL; // Prepared statement
-
-  if (params->op.query_str == "get_next_entry") {
-    pstmt = &next_stmt;
-  } else {
-    pstmt = &stmt;
-  }
-
-  SQL_EXECUTE(dpp, params, *pstmt, list_lc_entry);
-out:
-  return ret;
-}
-
-int SQLListLCEntries::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLListLCEntries - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListLCEntries");
-
-out:
-  return ret;
-}
-
-int SQLListLCEntries::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.min_marker, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.min_marker.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb);
-  SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb);
-
-out:
-  return rc;
-}
-
-int SQLListLCEntries::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, list_lc_entry);
-out:
-  return ret;
-}
-
-int SQLInsertLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLInsertLCHead - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertLCHead");
-
-out:
-  return ret;
-}
-
-int SQLInsertLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.marker, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.head.get_marker().c_str(), sdb);
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.start_date, sdb);
-  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, static_cast<int64_t>(params->op.lc_head.head.start_date), sdb);
-
-out:
-  return rc;
-}
-
-int SQLInsertLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLRemoveLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLRemoveLCHead - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveLCHead");
-
-out:
-  return ret;
-}
-
-int SQLRemoveLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLRemoveLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  SQL_EXECUTE(dpp, params, stmt, NULL);
-out:
-  return ret;
-}
-
-int SQLGetLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  if (!*sdb) {
-    ldpp_dout(dpp, 0)<<"In SQLGetLCHead - no db" << dendl;
-    goto out;
-  }
-
-  InitPrepareParams(dpp, p_params, params);
-
-  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetLCHead");
-
-out:
-  return ret;
-}
-
-int SQLGetLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int index = -1;
-  int rc = 0;
-  struct DBOpPrepareParams p_params = PrepareParams;
-
-  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
-  SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
-
-out:
-  return rc;
-}
-
-int SQLGetLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
-{
-  int ret = -1;
-
-  // clear the params before fetching the entry
-  params->op.lc_head.head = {};
-  SQL_EXECUTE(dpp, params, stmt, list_lc_head);
-out:
-  return ret;
-}
diff --git a/src/rgw/store/dbstore/sqlite/sqliteDB.h b/src/rgw/store/dbstore/sqlite/sqliteDB.h
deleted file mode 100644 (file)
index 038b24f..0000000
+++ /dev/null
@@ -1,554 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#ifndef SQLITE_DB_H
-#define SQLITE_DB_H
-
-#include <errno.h>
-#include <stdlib.h>
-#include <string>
-#include <sqlite3.h>
-#include "rgw/store/dbstore/common/dbstore.h"
-
-using namespace rgw::store;
-
-class SQLiteDB : public DB, virtual public DBOp {
-  private:
-    sqlite3_mutex *mutex = NULL;
-
-  protected:
-    CephContext *cct;
-
-  public:
-    sqlite3_stmt *stmt = NULL;
-    DBOpPrepareParams PrepareParams;
-
-    SQLiteDB(sqlite3 *dbi, std::string db_name, CephContext *_cct) : DB(db_name, _cct), cct(_cct) {
-      db = (void*)dbi;
-    }
-    SQLiteDB(std::string db_name, CephContext *_cct) : DB(db_name, _cct), cct(_cct) {
-    }
-    ~SQLiteDB() {}
-
-    uint64_t get_blob_limit() override { return SQLITE_LIMIT_LENGTH; }
-    void *openDB(const DoutPrefixProvider *dpp) override;
-    int closeDB(const DoutPrefixProvider *dpp) override;
-    int InitializeDBOps(const DoutPrefixProvider *dpp) override;
-
-    int InitPrepareParams(const DoutPrefixProvider *dpp, DBOpPrepareParams &p_params,
-                          DBOpParams* params) override;
-
-    int exec(const DoutPrefixProvider *dpp, const char *schema,
-        int (*callback)(void*,int,char**,char**));
-    int Step(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt,
-        int (*cbk)(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt));
-    int Reset(const DoutPrefixProvider *dpp, sqlite3_stmt *stmt);
-    /* default value matches with sqliteDB style */
-
-    int createTables(const DoutPrefixProvider *dpp) override;
-    int createBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int createUserTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int createObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int createObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int createObjectView(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int createObjectTableTrigger(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int createQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    void populate_object_params(const DoutPrefixProvider *dpp,
-                                struct DBOpPrepareParams& p_params,
-                                struct DBOpParams* params, bool data);
-
-    int createLCTables(const DoutPrefixProvider *dpp) override;
-
-    int DeleteBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int DeleteUserTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int DeleteObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int DeleteObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int DeleteQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int DeleteLCEntryTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int DeleteLCHeadTable(const DoutPrefixProvider *dpp, DBOpParams *params);
-
-    int ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params) override;
-    int ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params) override;
-    int ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params) override;
-};
-
-class SQLObjectOp : public ObjectOp {
-  private:
-    sqlite3 **sdb = NULL;
-    CephContext *cct;
-
-  public:
-    SQLObjectOp(sqlite3 **sdbi, CephContext *_cct) : sdb(sdbi), cct(_cct) {};
-    ~SQLObjectOp() {}
-
-    int InitializeObjectOps(std::string db_name, const DoutPrefixProvider *dpp);
-};
-
-class SQLInsertUser : public SQLiteDB, public InsertUserOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLInsertUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLInsertUser() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLRemoveUser : public SQLiteDB, public RemoveUserOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLRemoveUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLRemoveUser() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLGetUser : public SQLiteDB, public GetUserOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-    sqlite3_stmt *email_stmt = NULL; // Prepared statement to query by useremail
-    sqlite3_stmt *ak_stmt = NULL; // Prepared statement to query by access_key_id
-    sqlite3_stmt *userid_stmt = NULL; // Prepared statement to query by user_id
-
-  public:
-    SQLGetUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLGetUser() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-      if (email_stmt)
-        sqlite3_finalize(email_stmt);
-      if (ak_stmt)
-        sqlite3_finalize(ak_stmt);
-      if (userid_stmt)
-        sqlite3_finalize(userid_stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLInsertBucket : public SQLiteDB, public InsertBucketOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLInsertBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLInsertBucket() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLUpdateBucket : public SQLiteDB, public UpdateBucketOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *info_stmt = NULL; // Prepared statement
-    sqlite3_stmt *attrs_stmt = NULL; // Prepared statement
-    sqlite3_stmt *owner_stmt = NULL; // Prepared statement
-
-  public:
-    SQLUpdateBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLUpdateBucket() {
-      if (info_stmt)
-        sqlite3_finalize(info_stmt);
-      if (attrs_stmt)
-        sqlite3_finalize(attrs_stmt);
-      if (owner_stmt)
-        sqlite3_finalize(owner_stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLRemoveBucket : public SQLiteDB, public RemoveBucketOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLRemoveBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLRemoveBucket() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLGetBucket : public SQLiteDB, public GetBucketOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLGetBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLGetBucket() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLListUserBuckets : public SQLiteDB, public ListUserBucketsOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-    sqlite3_stmt *all_stmt = NULL; // Prepared statement
-
-  public:
-    SQLListUserBuckets(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLListUserBuckets() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-      if (all_stmt)
-        sqlite3_finalize(all_stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLPutObject : public SQLiteDB, public PutObjectOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLPutObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLPutObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLPutObject() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLDeleteObject : public SQLiteDB, public DeleteObjectOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLDeleteObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLDeleteObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLDeleteObject() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLGetObject : public SQLiteDB, public GetObjectOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLGetObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLGetObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLGetObject() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLUpdateObject : public SQLiteDB, public UpdateObjectOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *omap_stmt = NULL; // Prepared statement
-    sqlite3_stmt *attrs_stmt = NULL; // Prepared statement
-    sqlite3_stmt *meta_stmt = NULL; // Prepared statement
-    sqlite3_stmt *mp_stmt = NULL; // Prepared statement
-
-  public:
-    SQLUpdateObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLUpdateObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLUpdateObject() {
-      if (omap_stmt)
-        sqlite3_finalize(omap_stmt);
-      if (attrs_stmt)
-        sqlite3_finalize(attrs_stmt);
-      if (meta_stmt)
-        sqlite3_finalize(meta_stmt);
-    }
-
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLListBucketObjects : public SQLiteDB, public ListBucketObjectsOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLListBucketObjects(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLListBucketObjects(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLListBucketObjects() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLListVersionedObjects : public SQLiteDB, public ListVersionedObjectsOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLListVersionedObjects(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLListVersionedObjects(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLListVersionedObjects() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLPutObjectData : public SQLiteDB, public PutObjectDataOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLPutObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLPutObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLPutObjectData() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLUpdateObjectData : public SQLiteDB, public UpdateObjectDataOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLUpdateObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLUpdateObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLUpdateObjectData() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLGetObjectData : public SQLiteDB, public GetObjectDataOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLGetObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLGetObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLGetObjectData() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLDeleteObjectData : public SQLiteDB, public DeleteObjectDataOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLDeleteObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLDeleteObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLDeleteObjectData() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLDeleteStaleObjectData : public SQLiteDB, public DeleteStaleObjectDataOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLDeleteStaleObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    SQLDeleteStaleObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
-
-    ~SQLDeleteStaleObjectData() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLInsertLCEntry : public SQLiteDB, public InsertLCEntryOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLInsertLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLInsertLCEntry() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLRemoveLCEntry : public SQLiteDB, public RemoveLCEntryOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLRemoveLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLRemoveLCEntry() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLGetLCEntry : public SQLiteDB, public GetLCEntryOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-    sqlite3_stmt *next_stmt = NULL; // Prepared statement
-
-  public:
-    SQLGetLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLGetLCEntry() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-      if (next_stmt)
-        sqlite3_finalize(next_stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLListLCEntries : public SQLiteDB, public ListLCEntriesOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLListLCEntries(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLListLCEntries() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLInsertLCHead : public SQLiteDB, public InsertLCHeadOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLInsertLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLInsertLCHead() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLRemoveLCHead : public SQLiteDB, public RemoveLCHeadOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLRemoveLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLRemoveLCHead() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-class SQLGetLCHead : public SQLiteDB, public GetLCHeadOp {
-  private:
-    sqlite3 **sdb = NULL;
-    sqlite3_stmt *stmt = NULL; // Prepared statement
-
-  public:
-    SQLGetLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
-    ~SQLGetLCHead() {
-      if (stmt)
-        sqlite3_finalize(stmt);
-    }
-    int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
-    int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
-};
-
-#endif
diff --git a/src/rgw/store/dbstore/sqlite/statement.cc b/src/rgw/store/dbstore/sqlite/statement.cc
deleted file mode 100644 (file)
index dcf7dba..0000000
+++ /dev/null
@@ -1,196 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "common/dout.h"
-#include "error.h"
-#include "statement.h"
-
-#define dout_subsys ceph_subsys_rgw_dbstore
-
-namespace rgw::dbstore::sqlite {
-
-// owning pointer to arbitrary memory allocated and returned by sqlite3
-struct sqlite_deleter {
-  template <typename T>
-  void operator()(T* p) { ::sqlite3_free(p); }
-};
-template <typename T>
-using sqlite_ptr = std::unique_ptr<T, sqlite_deleter>;
-
-
-stmt_ptr prepare_statement(const DoutPrefixProvider* dpp,
-                           sqlite3* db, std::string_view sql)
-{
-  sqlite3_stmt* stmt = nullptr;
-  int result = ::sqlite3_prepare_v2(db, sql.data(), sql.size(), &stmt, nullptr);
-  auto ec = std::error_code{result, sqlite::error_category()};
-  if (ec != sqlite::errc::ok) {
-    const char* errmsg = ::sqlite3_errmsg(db);
-    ldpp_dout(dpp, 1) << "preparation failed: " << errmsg
-        << " (" << ec << ")\nstatement: " << sql << dendl;
-    throw sqlite::error(errmsg, ec);
-  }
-  return stmt_ptr{stmt};
-}
-
-static int bind_index(const DoutPrefixProvider* dpp,
-                      const stmt_binding& stmt, const char* name)
-{
-  const int index = ::sqlite3_bind_parameter_index(stmt.get(), name);
-  if (index <= 0) {
-    ldpp_dout(dpp, 1) << "binding failed on parameter name="
-        << name << dendl;
-    sqlite3* db = ::sqlite3_db_handle(stmt.get());
-    throw sqlite::error(db);
-  }
-  return index;
-}
-
-void bind_text(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
-               const char* name, std::string_view value)
-{
-  const int index = bind_index(dpp, stmt, name);
-
-  int result = ::sqlite3_bind_text(stmt.get(), index, value.data(),
-                                   value.size(), SQLITE_STATIC);
-  auto ec = std::error_code{result, sqlite::error_category()};
-  if (ec != sqlite::errc::ok) {
-    ldpp_dout(dpp, 1) << "binding failed on parameter name="
-        << name << " value=" << value << dendl;
-    sqlite3* db = ::sqlite3_db_handle(stmt.get());
-    throw sqlite::error(db, ec);
-  }
-}
-
-void bind_int(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
-              const char* name, int value)
-{
-  const int index = bind_index(dpp, stmt, name);
-
-  int result = ::sqlite3_bind_int(stmt.get(), index, value);
-  auto ec = std::error_code{result, sqlite::error_category()};
-  if (ec != sqlite::errc::ok) {
-    ldpp_dout(dpp, 1) << "binding failed on parameter name="
-        << name << " value=" << value << dendl;
-    sqlite3* db = ::sqlite3_db_handle(stmt.get());
-    throw sqlite::error(db, ec);
-  }
-}
-
-void eval0(const DoutPrefixProvider* dpp, const stmt_execution& stmt)
-{
-  sqlite_ptr<char> sql;
-  if (dpp->get_cct()->_conf->subsys.should_gather<dout_subsys, 20>()) {
-    sql.reset(::sqlite3_expanded_sql(stmt.get()));
-  }
-
-  const int result = ::sqlite3_step(stmt.get());
-  auto ec = std::error_code{result, sqlite::error_category()};
-  sqlite3* db = ::sqlite3_db_handle(stmt.get());
-
-  if (ec != sqlite::errc::done) {
-    const char* errmsg = ::sqlite3_errmsg(db);
-    ldpp_dout(dpp, 20) << "evaluation failed: " << errmsg
-        << " (" << ec << ")\nstatement: " << sql.get() << dendl;
-    throw sqlite::error(errmsg, ec);
-  }
-  ldpp_dout(dpp, 20) << "evaluation succeeded: " << sql.get() << dendl;
-}
-
-void eval1(const DoutPrefixProvider* dpp, const stmt_execution& stmt)
-{
-  sqlite_ptr<char> sql;
-  if (dpp->get_cct()->_conf->subsys.should_gather<dout_subsys, 20>()) {
-    sql.reset(::sqlite3_expanded_sql(stmt.get()));
-  }
-
-  const int result = ::sqlite3_step(stmt.get());
-  auto ec = std::error_code{result, sqlite::error_category()};
-  if (ec != sqlite::errc::row) {
-    sqlite3* db = ::sqlite3_db_handle(stmt.get());
-    const char* errmsg = ::sqlite3_errmsg(db);
-    ldpp_dout(dpp, 1) << "evaluation failed: " << errmsg << " (" << ec
-        << ")\nstatement: " << sql.get() << dendl;
-    throw sqlite::error(errmsg, ec);
-  }
-  ldpp_dout(dpp, 20) << "evaluation succeeded: " << sql.get() << dendl;
-}
-
-int column_int(const stmt_execution& stmt, int column)
-{
-  return ::sqlite3_column_int(stmt.get(), column);
-}
-
-std::string column_text(const stmt_execution& stmt, int column)
-{
-  const unsigned char* text = ::sqlite3_column_text(stmt.get(), column);
-  // may be NULL
-  if (text) {
-    const std::size_t size = ::sqlite3_column_bytes(stmt.get(), column);
-    return {reinterpret_cast<const char*>(text), size};
-  } else {
-    return {};
-  }
-}
-
-auto read_text_rows(const DoutPrefixProvider* dpp,
-                    const stmt_execution& stmt,
-                    std::span<std::string> entries)
-  -> std::span<std::string>
-{
-  sqlite_ptr<char> sql;
-  if (dpp->get_cct()->_conf->subsys.should_gather<dout_subsys, 20>()) {
-    sql.reset(::sqlite3_expanded_sql(stmt.get()));
-  }
-
-  std::size_t count = 0;
-  while (count < entries.size()) {
-    const int result = ::sqlite3_step(stmt.get());
-    auto ec = std::error_code{result, sqlite::error_category()};
-    if (ec == sqlite::errc::done) {
-      break;
-    }
-    if (ec != sqlite::errc::row) {
-      sqlite3* db = ::sqlite3_db_handle(stmt.get());
-      const char* errmsg = ::sqlite3_errmsg(db);
-      ldpp_dout(dpp, 1) << "evaluation failed: " << errmsg << " (" << ec
-          << ")\nstatement: " << sql.get() << dendl;
-      throw sqlite::error(errmsg, ec);
-    }
-    entries[count] = column_text(stmt, 0);
-    ++count;
-  }
-  ldpp_dout(dpp, 20) << "statement evaluation produced " << count
-      << " results: " << sql.get() << dendl;
-
-  return entries.first(count);
-}
-
-void execute(const DoutPrefixProvider* dpp, sqlite3* db, const char* query,
-             sqlite3_callback callback, void* arg)
-{
-  char* errmsg = nullptr;
-  const int result = ::sqlite3_exec(db, query, callback, arg, &errmsg);
-  auto ec = std::error_code{result, sqlite::error_category()};
-  auto ptr = sqlite_ptr<char>{errmsg}; // free on destruction
-  if (ec != sqlite::errc::ok) {
-    ldpp_dout(dpp, 1) << "query execution failed: " << errmsg << " (" << ec
-        << ")\nquery: " << query << dendl;
-    throw sqlite::error(errmsg, ec);
-  }
-  ldpp_dout(dpp, 20) << "query execution succeeded: " << query << dendl;
-}
-
-} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/store/dbstore/sqlite/statement.h b/src/rgw/store/dbstore/sqlite/statement.h
deleted file mode 100644 (file)
index 98b4acf..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <memory>
-#include <span>
-#include <string>
-
-#include <sqlite3.h>
-
-class DoutPrefixProvider;
-
-namespace rgw::dbstore::sqlite {
-
-// owning sqlite3_stmt pointer
-struct stmt_deleter {
-  void operator()(sqlite3_stmt* p) const { ::sqlite3_finalize(p); }
-};
-using stmt_ptr = std::unique_ptr<sqlite3_stmt, stmt_deleter>;
-
-// non-owning sqlite3_stmt pointer that clears binding state on destruction
-struct stmt_binding_deleter {
-  void operator()(sqlite3_stmt* p) const { ::sqlite3_clear_bindings(p); }
-};
-using stmt_binding = std::unique_ptr<sqlite3_stmt, stmt_binding_deleter>;
-
-// non-owning sqlite3_stmt pointer that clears execution state on destruction
-struct stmt_execution_deleter {
-  void operator()(sqlite3_stmt* p) const { ::sqlite3_reset(p); }
-};
-using stmt_execution = std::unique_ptr<sqlite3_stmt, stmt_execution_deleter>;
-
-
-// prepare the sql statement or throw on error
-stmt_ptr prepare_statement(const DoutPrefixProvider* dpp,
-                           sqlite3* db, std::string_view sql);
-
-// bind an input string for the given parameter name
-void bind_text(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
-               const char* name, std::string_view value);
-
-// bind an input integer for the given parameter name
-void bind_int(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
-              const char* name, int value);
-
-// evaluate a prepared statement, expecting no result rows
-void eval0(const DoutPrefixProvider* dpp, const stmt_execution& stmt);
-
-// evaluate a prepared statement, expecting a single result row
-void eval1(const DoutPrefixProvider* dpp, const stmt_execution& stmt);
-
-// return the given column as an integer
-int column_int(const stmt_execution& stmt, int column);
-
-// return the given column as text, or an empty string on NULL
-std::string column_text(const stmt_execution& stmt, int column);
-
-// read the text column from each result row into the given entries, and return
-// the sub-span of entries that contain results
-auto read_text_rows(const DoutPrefixProvider* dpp,
-                    const stmt_execution& stmt,
-                    std::span<std::string> entries)
-  -> std::span<std::string>;
-
-// execute a raw query without preparing a statement. the optional callback
-// can be used to read results
-void execute(const DoutPrefixProvider* dpp, sqlite3* db, const char* query,
-             sqlite3_callback callback, void* arg);
-
-} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/store/dbstore/tests/CMakeLists.txt b/src/rgw/store/dbstore/tests/CMakeLists.txt
deleted file mode 100644 (file)
index 4e60dcf..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-cmake_minimum_required(VERSION 3.14.0)
-project(dbstore-tests)
-
-set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} gtest)
-
-set(dbstore_tests_srcs
-    dbstore_tests.cc)
-
-include_directories(${CMAKE_INCLUDE_DIR})
-
-add_executable(unittest_dbstore_tests ${dbstore_tests_srcs})
-target_link_libraries(unittest_dbstore_tests ${CMAKE_LINK_LIBRARIES})
-add_ceph_unittest(unittest_dbstore_tests)
-
-add_executable(unittest_dbstore_mgr_tests dbstore_mgr_tests.cc)
-target_link_libraries(unittest_dbstore_mgr_tests dbstore gtest_main)
-add_ceph_unittest(unittest_dbstore_mgr_tests)
diff --git a/src/rgw/store/dbstore/tests/dbstore_mgr_tests.cc b/src/rgw/store/dbstore/tests/dbstore_mgr_tests.cc
deleted file mode 100644 (file)
index 4f58f47..0000000
+++ /dev/null
@@ -1,157 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "common/ceph_context.h"
-#include "rgw/store/dbstore/dbstore_mgr.h"
-
-#include <filesystem>
-#include <gtest/gtest.h>
-#include <memory>
-
-using namespace rgw;
-namespace fs = std::filesystem;
-const static std::string TEST_DIR = "rgw_dbstore_tests";
-
-bool endsWith(const std::string &mainStr, const std::string &toMatch)
-{
-    if(mainStr.size() >= toMatch.size() &&
-            mainStr.compare(mainStr.size() - toMatch.size(), toMatch.size(), toMatch) == 0)
-            return true;
-        else
-            return false;
-}
-
-class TestDBStoreManager : public ::testing::Test {
-protected:
-  void SetUp() override {
-    ctx_ = std::make_shared<CephContext>(CEPH_ENTITY_TYPE_CLIENT);
-    g_ceph_context = ctx_.get();
-    fs::current_path(fs::temp_directory_path());
-    fs::create_directory(TEST_DIR);
-  }
-
-  void TearDown() override {
-    fs::current_path(fs::temp_directory_path());
-    fs::remove_all(TEST_DIR);
-  }
-
-  std::string getTestDir() const {
-    auto test_dir = fs::temp_directory_path() / TEST_DIR;
-    return test_dir.string();
-  }
-
-  fs::path getDBFullPath(const std::string & base_dir,
-                         const std::string & tenant) const {
-    auto db_path = ctx_->_conf.get_val<std::string>("dbstore_db_dir");
-    const auto& db_name = ctx_->_conf.get_val<std::string>("dbstore_db_name_prefix") + "-" + tenant + ".db";
-
-    auto db_full_path = std::filesystem::path(db_path) / db_name;
-    auto db_full_path_test = fs::path(base_dir) / db_full_path;
-    return db_full_path_test;
-  }
-
-  std::string getDBTenant(const std::string & base_dir,
-                          const std::string & tenant) const {
-    auto db_name = ctx_->_conf.get_val<std::string>("dbstore_db_name_prefix");
-    db_name += "-" + tenant;
-    auto db_full_path = fs::path(base_dir) /  db_name;
-    return db_full_path.string();
-  }
-
-  std::string getDBTenant(const std::string & tenant = default_tenant) const {
-    return getDBTenant(getTestDir(), tenant);
-  }
-
-  fs::path getDBFullPath(const std::string & tenant) const {
-    return getDBFullPath(getTestDir(), tenant);
-  }
-
-  fs::path getLogFilePath(const std::string & log_file) {
-    return fs::temp_directory_path() / log_file;
-  }
-
-  std::shared_ptr<CephContext> getContext() const {
-    return ctx_;
-  }
-
- private:
-    std::shared_ptr<CephContext> ctx_;
-};
-
-TEST_F(TestDBStoreManager, BasicInstantiateUsingDBDir) {
-  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
-
-  EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant)));
-  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
-  EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant)));
-}
-
-TEST_F(TestDBStoreManager, DBNamePrefix) {
-  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
-  std::string prefix = "testprefix";
-  getContext()->_conf.set_val("dbstore_db_name_prefix", prefix);
-
-  EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant)));
-  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
-  EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant)));
-
-  // check that the database name contains the given prefix
-  std::string expected_db_name = prefix + "-" + default_tenant + ".db";
-  EXPECT_TRUE(endsWith(getDBFullPath(default_tenant), expected_db_name));
-}
-
-TEST_F(TestDBStoreManager, BasicInstantiateSecondConstructor) {
-  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
-
-  EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant)));
-  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get(), getLogFilePath("test.log").string(), 10);
-  EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant)));
-}
-
-TEST_F(TestDBStoreManager, TestDBName) {
-  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
-
-  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
-  auto db = dbstore_mgr->getDB(default_tenant, false);
-  ASSERT_NE(nullptr, db);
-  EXPECT_EQ(getDBTenant(), db->getDBname());
-}
-
-
-TEST_F(TestDBStoreManager, TestDBNameDefaultDB) {
-  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
-
-  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
-  // passing an empty tenant should return the default_db
-  auto db = dbstore_mgr->getDB("", false);
-  ASSERT_NE(nullptr, db);
-  EXPECT_EQ(getDBTenant(), db->getDBname());
-}
-
-TEST_F(TestDBStoreManager, TestDBBadTenant) {
-  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
-
-  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
-  auto db = dbstore_mgr->getDB("does-not-exist", false);
-  ASSERT_EQ(nullptr, db);
-}
-
-TEST_F(TestDBStoreManager, TestGetNewDB) {
-  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
-
-  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
-
-  auto new_tenant_path = "new_tenant";
-  auto db = dbstore_mgr->getDB(new_tenant_path, true);
-  ASSERT_NE(nullptr, db);
-  EXPECT_EQ(getDBTenant(new_tenant_path), db->getDBname());
-}
-
-TEST_F(TestDBStoreManager, TestDelete) {
-  getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
-
-  auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
-  dbstore_mgr->deleteDB(default_tenant);
-  auto db = dbstore_mgr->getDB(default_tenant, false);
-  ASSERT_EQ(nullptr, db);
-}
diff --git a/src/rgw/store/dbstore/tests/dbstore_tests.cc b/src/rgw/store/dbstore/tests/dbstore_tests.cc
deleted file mode 100644 (file)
index e87002f..0000000
+++ /dev/null
@@ -1,1424 +0,0 @@
-#include "gtest/gtest.h"
-#include <iostream>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <dbstore.h>
-#include <sqliteDB.h>
-#include "rgw_common.h"
-
-using namespace std;
-using DB = rgw::store::DB;
-
-vector<const char*> args;
-
-namespace gtest {
-  class Environment* env;
-
-  class Environment : public ::testing::Environment {
-    public:
-      Environment(): tenant("default_ns"), db(nullptr),
-      db_type("SQLite"), ret(-1) {}
-
-      Environment(string tenantname, string db_typename): 
-        tenant(tenantname), db(nullptr),
-        db_type(db_typename), ret(-1) {}
-
-      virtual ~Environment() {}
-
-      void SetUp() override {
-        cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
-            CODE_ENVIRONMENT_DAEMON,
-            CINIT_FLAG_NO_DEFAULT_CONFIG_FILE | CINIT_FLAG_NO_MON_CONFIG | CINIT_FLAG_NO_DAEMON_ACTIONS);
-        if (!db_type.compare("SQLite")) {
-          db = new SQLiteDB(tenant, cct.get());
-          ASSERT_TRUE(db != nullptr);
-          ret = db->Initialize(logfile, loglevel);
-          ASSERT_GE(ret, 0);
-        }
-      }
-
-      void TearDown() override {
-        if (!db)
-          return;
-        db->Destroy(db->get_def_dpp());
-        delete db;
-      }
-
-      string tenant;
-      DB *db;
-      string db_type;
-      int ret;
-      string logfile = "rgw_dbstore_tests.log";
-      int loglevel = 30;
-      boost::intrusive_ptr<CephContext> cct;
-  };
-}
-
-ceph::real_time bucket_mtime = real_clock::now();
-string marker1;
-
-class DBGetDataCB : public RGWGetDataCB {
-  public:
-    bufferlist data_bl;
-    off_t data_ofs, data_len;
-
-    int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) {
-      data_bl = bl;
-      data_ofs = bl_ofs;
-      data_len = bl_len;
-      return 0;
-    }
-};
-
-namespace {
-
-  class DBStoreTest : public ::testing::Test {
-    protected:
-      int ret;
-      DB *db = nullptr;
-      string user1 = "user1";
-      string user_id1 = "user_id1";
-      string bucket1 = "bucket1";
-      string object1 = "object1";
-      string data = "Hello World";
-      DBOpParams GlobalParams = {};
-      const DoutPrefixProvider *dpp;
-
-      DBStoreTest() {}
-      void SetUp() {
-        db = gtest::env->db;
-        ASSERT_TRUE(db != nullptr);
-        dpp = db->get_def_dpp();
-        ASSERT_TRUE(dpp != nullptr);
-
-        GlobalParams.op.user.uinfo.display_name = user1;
-        GlobalParams.op.user.uinfo.user_id.id = user_id1;
-        GlobalParams.op.bucket.info.bucket.name = bucket1;
-        GlobalParams.op.obj.state.obj.bucket = GlobalParams.op.bucket.info.bucket;
-        GlobalParams.op.obj.state.obj.key.name = object1;
-        GlobalParams.op.obj.state.obj.key.instance = "inst1";
-        GlobalParams.op.obj.obj_id = "obj_id1";
-        GlobalParams.op.obj_data.part_num = 0;
-
-        /* As of now InitializeParams doesnt do anything
-         * special based on fop. Hence its okay to do
-         * global initialization once.
-         */
-        ret = db->InitializeParams(dpp, &GlobalParams);
-        ASSERT_EQ(ret, 0);
-      }
-
-      void TearDown() {
-      }
-
-      int write_object(const DoutPrefixProvider *dpp, DBOpParams params) {
-        DB::Object op_target(db, params.op.bucket.info,
-                             params.op.obj.state.obj);
-        DB::Object::Write write_op(&op_target);
-        map<string, bufferlist> setattrs;
-        ret = write_op.prepare(dpp);
-        if (ret)
-          return ret;
-
-        write_op.meta.mtime = &bucket_mtime;
-        write_op.meta.category = RGWObjCategory::Main;
-        write_op.meta.owner = params.op.user.uinfo.user_id;
-
-        bufferlist b1 = params.op.obj.head_data;
-        write_op.meta.data = &b1;
-
-        bufferlist b2;
-        encode("ACL", b2);
-        setattrs[RGW_ATTR_ACL] = b2;
-
-        ret = write_op.write_meta(0, params.op.obj.state.size, b1.length()+1, setattrs);
-        return ret;
-      }
-  };
-}
-
-TEST_F(DBStoreTest, InsertUser) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  params.op.user.uinfo.user_id.tenant = "tenant";
-  params.op.user.uinfo.user_email = "user1@dbstore.com";
-  params.op.user.uinfo.suspended = 123;
-  params.op.user.uinfo.max_buckets = 456;
-  params.op.user.uinfo.assumed_role_arn = "role";
-  params.op.user.uinfo.placement_tags.push_back("tags");
-  RGWAccessKey k1("id1", "key1");
-  RGWAccessKey k2("id2", "key2");
-  params.op.user.uinfo.access_keys["id1"] = k1;
-  params.op.user.uinfo.access_keys["id2"] = k2;
-  params.op.user.user_version.ver = 1;    
-  params.op.user.user_version.tag = "UserTAG";    
-
-  ret = db->ProcessOp(dpp, "InsertUser", &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, GetUser) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  ret = db->ProcessOp(dpp, "GetUser", &params);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(params.op.user.uinfo.user_id.tenant, "tenant");
-  ASSERT_EQ(params.op.user.uinfo.user_email, "user1@dbstore.com");
-  ASSERT_EQ(params.op.user.uinfo.user_id.id, "user_id1");
-  ASSERT_EQ(params.op.user.uinfo.suspended, 123);
-  ASSERT_EQ(params.op.user.uinfo.max_buckets, 456);
-  ASSERT_EQ(params.op.user.uinfo.assumed_role_arn, "role");
-  ASSERT_EQ(params.op.user.uinfo.placement_tags.back(), "tags");
-  RGWAccessKey k;
-  map<string, RGWAccessKey>::iterator it2 = params.op.user.uinfo.access_keys.begin();
-  k = it2->second;
-  ASSERT_EQ(k.id, "id1");
-  ASSERT_EQ(k.key, "key1");
-  it2++;
-  k = it2->second;
-  ASSERT_EQ(k.id, "id2");
-  ASSERT_EQ(k.key, "key2");
-
-}
-
-TEST_F(DBStoreTest, GetUserQuery) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  params.op.query_str = "email";
-  params.op.user.uinfo.user_email = "user1@dbstore.com";
-
-  ret = db->ProcessOp(dpp, "GetUser", &params);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(params.op.user.uinfo.user_id.tenant, "tenant");
-  ASSERT_EQ(params.op.user.uinfo.user_email, "user1@dbstore.com");
-  ASSERT_EQ(params.op.user.uinfo.user_id.id, "user_id1");
-  ASSERT_EQ(params.op.user.uinfo.suspended, 123);
-  ASSERT_EQ(params.op.user.uinfo.max_buckets, 456);
-  ASSERT_EQ(params.op.user.uinfo.assumed_role_arn, "role");
-  ASSERT_EQ(params.op.user.uinfo.placement_tags.back(), "tags");
-  RGWAccessKey k;
-  map<string, RGWAccessKey>::iterator it2 = params.op.user.uinfo.access_keys.begin();
-  k = it2->second;
-  ASSERT_EQ(k.id, "id1");
-  ASSERT_EQ(k.key, "key1");
-  it2++;
-  k = it2->second;
-  ASSERT_EQ(k.id, "id2");
-  ASSERT_EQ(k.key, "key2");
-
-}
-
-TEST_F(DBStoreTest, GetUserQueryByEmail) {
-  int ret = -1;
-  RGWUserInfo uinfo;
-  string email = "user1@dbstore.com";
-  map<std::string, bufferlist> attrs;
-  RGWObjVersionTracker objv;
-
-  ret = db->get_user(dpp, "email", email, uinfo, &attrs, &objv);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(uinfo.user_id.tenant, "tenant");
-  ASSERT_EQ(uinfo.user_email, "user1@dbstore.com");
-  ASSERT_EQ(uinfo.user_id.id, "user_id1");
-  ASSERT_EQ(uinfo.suspended, 123);
-  ASSERT_EQ(uinfo.max_buckets, 456);
-  ASSERT_EQ(uinfo.assumed_role_arn, "role");
-  ASSERT_EQ(uinfo.placement_tags.back(), "tags");
-  RGWAccessKey k;
-  map<string, RGWAccessKey>::iterator it2 = uinfo.access_keys.begin();
-  k = it2->second;
-  ASSERT_EQ(k.id, "id1");
-  ASSERT_EQ(k.key, "key1");
-  it2++;
-  k = it2->second;
-  ASSERT_EQ(k.id, "id2");
-  ASSERT_EQ(k.key, "key2");
-  ASSERT_EQ(objv.read_version.ver, 1);
-}
-
-TEST_F(DBStoreTest, GetUserQueryByAccessKey) {
-  int ret = -1;
-  RGWUserInfo uinfo;
-  string key = "id1";
-
-  ret = db->get_user(dpp, "access_key", key, uinfo, nullptr, nullptr);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(uinfo.user_id.tenant, "tenant");
-  ASSERT_EQ(uinfo.user_email, "user1@dbstore.com");
-  ASSERT_EQ(uinfo.user_id.id, "user_id1");
-  ASSERT_EQ(uinfo.suspended, 123);
-  ASSERT_EQ(uinfo.max_buckets, 456);
-  ASSERT_EQ(uinfo.assumed_role_arn, "role");
-  ASSERT_EQ(uinfo.placement_tags.back(), "tags");
-  RGWAccessKey k;
-  map<string, RGWAccessKey>::iterator it2 = uinfo.access_keys.begin();
-  k = it2->second;
-  ASSERT_EQ(k.id, "id1");
-  ASSERT_EQ(k.key, "key1");
-  it2++;
-  k = it2->second;
-  ASSERT_EQ(k.id, "id2");
-  ASSERT_EQ(k.key, "key2");
-}
-
-TEST_F(DBStoreTest, StoreUser) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  RGWUserInfo uinfo, old_uinfo;
-  map<std::string, bufferlist> attrs;
-  RGWObjVersionTracker objv_tracker;
-
-  bufferlist attr1, attr2;
-  encode("attrs1", attr1);
-  attrs["attr1"] = attr1;
-  encode("attrs2", attr2);
-  attrs["attr2"] = attr2;
-
-  uinfo.user_id.id = "user_id2";
-  uinfo.user_id.tenant = "tenant";
-  uinfo.user_email = "user2@dbstore.com";
-  uinfo.suspended = 123;
-  uinfo.max_buckets = 456;
-  uinfo.assumed_role_arn = "role";
-  uinfo.placement_tags.push_back("tags");
-  RGWAccessKey k1("id1", "key1");
-  RGWAccessKey k2("id2", "key2");
-  uinfo.access_keys["id1"] = k1;
-  uinfo.access_keys["id2"] = k2;
-
-  /* non exclusive create..should create new one */
-  ret = db->store_user(dpp, uinfo, false, &attrs, &objv_tracker, &old_uinfo);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(old_uinfo.user_email, "");
-  ASSERT_EQ(objv_tracker.read_version.ver, 1);
-  ASSERT_EQ(objv_tracker.read_version.tag, "UserTAG");
-
-  /* invalid version number */
-  objv_tracker.read_version.ver = 4;
-  ret = db->store_user(dpp, uinfo, true, &attrs, &objv_tracker, &old_uinfo);
-  ASSERT_EQ(ret, -125); /* returns ECANCELED */
-  ASSERT_EQ(old_uinfo.user_id.id, uinfo.user_id.id);
-  ASSERT_EQ(old_uinfo.user_email, uinfo.user_email);
-
-  /* exclusive create..should not create new one */
-  uinfo.user_email = "user2_new@dbstore.com";
-  objv_tracker.read_version.ver = 1;
-  ret = db->store_user(dpp, uinfo, true, &attrs, &objv_tracker, &old_uinfo);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(old_uinfo.user_email, "user2@dbstore.com");
-  ASSERT_EQ(objv_tracker.read_version.ver, 1);
-
-  ret = db->store_user(dpp, uinfo, false, &attrs, &objv_tracker, &old_uinfo);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(old_uinfo.user_email, "user2@dbstore.com");
-  ASSERT_EQ(objv_tracker.read_version.ver, 2);
-  ASSERT_EQ(objv_tracker.read_version.tag, "UserTAG");
-}
-
-TEST_F(DBStoreTest, GetUserQueryByUserID) {
-  int ret = -1;
-  RGWUserInfo uinfo;
-  map<std::string, bufferlist> attrs;
-  RGWObjVersionTracker objv;
-
-  uinfo.user_id.tenant = "tenant";
-  uinfo.user_id.id = "user_id2";
-
-  ret = db->get_user(dpp, "user_id", "user_id2", uinfo, &attrs, &objv);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(uinfo.user_id.tenant, "tenant");
-  ASSERT_EQ(uinfo.user_email, "user2_new@dbstore.com");
-  ASSERT_EQ(uinfo.user_id.id, "user_id2");
-  ASSERT_EQ(uinfo.suspended, 123);
-  ASSERT_EQ(uinfo.max_buckets, 456);
-  ASSERT_EQ(uinfo.assumed_role_arn, "role");
-  ASSERT_EQ(uinfo.placement_tags.back(), "tags");
-  RGWAccessKey k;
-  map<string, RGWAccessKey>::iterator it = uinfo.access_keys.begin();
-  k = it->second;
-  ASSERT_EQ(k.id, "id1");
-  ASSERT_EQ(k.key, "key1");
-  it++;
-  k = it->second;
-  ASSERT_EQ(k.id, "id2");
-  ASSERT_EQ(k.key, "key2");
-
-  ASSERT_EQ(objv.read_version.ver, 2);
-
-  bufferlist k1, k2;
-  string attr;
-  map<std::string, bufferlist>::iterator it2 = attrs.begin();
-  k1 = it2->second;
-  decode(attr, k1);
-  ASSERT_EQ(attr, "attrs1");
-  it2++;
-  k2 = it2->second;
-  decode(attr, k2);
-  ASSERT_EQ(attr, "attrs2");
-}
-
-TEST_F(DBStoreTest, ListAllUsers) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  ret = db->ListAllUsers(dpp, &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, InsertBucket) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  params.op.bucket.info.bucket.name = "bucket1";
-  params.op.bucket.info.bucket.tenant = "tenant";
-  params.op.bucket.info.bucket.marker = "marker1";
-
-  params.op.bucket.ent.size = 1024;
-
-  params.op.bucket.info.has_instance_obj = false;
-  params.op.bucket.bucket_version.ver = 1;
-  params.op.bucket.bucket_version.tag = "read_tag";
-
-  params.op.bucket.mtime = bucket_mtime;
-
-  ret = db->ProcessOp(dpp, "InsertBucket", &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, UpdateBucketAttrs) {
-  int ret = -1;
-  RGWBucketInfo info;
-  map<std::string, bufferlist> attrs;
-  RGWObjVersionTracker objv;
-
-  bufferlist aclbl, aclbl2;
-  encode("attrs1", aclbl);
-  attrs["attr1"] = aclbl;
-  encode("attrs2", aclbl2);
-  attrs["attr2"] = aclbl2;
-
-  info.bucket.name = "bucket1";
-
-  /* invalid version number */
-  objv.read_version.ver = 4;
-  ret = db->update_bucket(dpp, "attrs", info, false, nullptr, &attrs, &bucket_mtime, &objv);
-  ASSERT_EQ(ret, -125); /* returns ECANCELED */
-
-  /* right version number */
-  objv.read_version.ver = 1;
-  ret = db->update_bucket(dpp, "attrs", info, false, nullptr, &attrs, &bucket_mtime, &objv);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(objv.read_version.ver, 2);
-}
-
-TEST_F(DBStoreTest, UpdateBucketInfo) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  RGWBucketInfo info;
-
-  params.op.bucket.info.bucket.name = "bucket1";
-
-  ret = db->ProcessOp(dpp, "GetBucket", &params);
-  ASSERT_EQ(ret, 0);
-
-  info = params.op.bucket.info;
-
-  info.bucket.marker = "marker2";
-  ret = db->update_bucket(dpp, "info", info, false, nullptr, nullptr, &bucket_mtime, nullptr);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(info.objv_tracker.read_version.ver, 3);
-}
-
-TEST_F(DBStoreTest, GetBucket) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  params.op.bucket.info.bucket.name = "bucket1";
-  ret = db->ProcessOp(dpp, "GetBucket", &params);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(params.op.bucket.info.bucket.name, "bucket1");
-  ASSERT_EQ(params.op.bucket.info.bucket.tenant, "tenant");
-  ASSERT_EQ(params.op.bucket.info.bucket.marker, "marker2");
-  ASSERT_EQ(params.op.bucket.ent.size, 1024);
-  ASSERT_EQ(params.op.bucket.ent.bucket.name, "bucket1");
-  ASSERT_EQ(params.op.bucket.ent.bucket.tenant, "tenant");
-  ASSERT_EQ(params.op.bucket.info.has_instance_obj, false);
-  ASSERT_EQ(params.op.bucket.info.objv_tracker.read_version.ver, 3);
-  ASSERT_EQ(params.op.bucket.info.objv_tracker.read_version.tag, "read_tag");
-  ASSERT_EQ(params.op.bucket.mtime, bucket_mtime);
-  ASSERT_EQ(params.op.bucket.info.owner.id, "user_id1");
-  bufferlist k, k2;
-  string acl;
-  map<std::string, bufferlist>::iterator it2 = params.op.bucket.bucket_attrs.begin();
-  k = it2->second;
-  decode(acl, k);
-  ASSERT_EQ(acl, "attrs1");
-  it2++;
-  k2 = it2->second;
-  decode(acl, k2);
-  ASSERT_EQ(acl, "attrs2");
-}
-
-TEST_F(DBStoreTest, CreateBucket) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  RGWBucketInfo info;
-  RGWUserInfo owner;
-  rgw_bucket bucket;
-  obj_version objv;
-  rgw_placement_rule rule;
-  map<std::string, bufferlist> attrs;
-
-  owner.user_id.id = "user_id1";
-  bucket.name = "bucket1";
-  bucket.tenant = "tenant";
-
-  objv.ver = 2;
-  objv.tag = "write_tag";
-
-  rule.name = "rule1";
-  rule.storage_class = "sc1";
-
-  ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
-      attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
-      null_yield, false);
-  ASSERT_EQ(ret, 0);
-  bucket.name = "bucket2";
-  ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
-      attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
-      null_yield, false);
-  ASSERT_EQ(ret, 0);
-  bucket.name = "bucket3";
-  ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
-      attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
-      null_yield, false);
-  ASSERT_EQ(ret, 0);
-  bucket.name = "bucket4";
-  ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
-      attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
-      null_yield, false);
-  ASSERT_EQ(ret, 0);
-  bucket.name = "bucket5";
-  ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
-      attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
-      null_yield, false);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, GetBucketQueryByName) {
-  int ret = -1;
-  RGWBucketInfo binfo;
-  binfo.bucket.name = "bucket2";
-  rgw::sal::Attrs attrs;
-  ceph::real_time mtime;
-  obj_version objv;
-
-  ret = db->get_bucket_info(dpp, "name", "", binfo, &attrs, &mtime, &objv);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(binfo.bucket.name, "bucket2");
-  ASSERT_EQ(binfo.bucket.tenant, "tenant");
-  ASSERT_EQ(binfo.owner.id, "user_id1");
-  ASSERT_EQ(binfo.objv_tracker.read_version.ver, 2);
-  ASSERT_EQ(binfo.objv_tracker.read_version.tag, "write_tag");
-  ASSERT_EQ(binfo.zonegroup, "zid");
-  ASSERT_EQ(binfo.creation_time, bucket_mtime);
-  ASSERT_EQ(binfo.placement_rule.name, "rule1");
-  ASSERT_EQ(binfo.placement_rule.storage_class, "sc1");
-  ASSERT_EQ(objv.ver, 2);
-  ASSERT_EQ(objv.tag, "write_tag");
-
-  marker1 = binfo.bucket.marker;
-}
-
-TEST_F(DBStoreTest, ListUserBuckets) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  rgw_user owner;
-  int max = 2;
-  bool need_stats = true;
-  bool is_truncated = false;
-  RGWUserBuckets ulist;
-
-  owner.id = "user_id1";
-
-  marker1 = "";
-  do {
-    is_truncated = false;
-    ret = db->list_buckets(dpp, "", owner, marker1, "", max, need_stats, &ulist,
-          &is_truncated);
-    ASSERT_EQ(ret, 0);
-
-    cout << "marker1 :" << marker1 << "\n";
-
-    cout << "is_truncated :" << is_truncated << "\n";
-
-    for (const auto& ent: ulist.get_buckets()) {
-      RGWBucketEnt e = ent.second;
-      cout << "###################### \n";
-      cout << "ent.bucket.id : " << e.bucket.name << "\n";
-      cout << "ent.bucket.marker : " << e.bucket.marker << "\n";
-      cout << "ent.bucket.bucket_id : " << e.bucket.bucket_id << "\n";
-      cout << "ent.size : " << e.size << "\n";
-      cout << "ent.rule.name : " << e.placement_rule.name << "\n";
-
-      marker1 = e.bucket.name;
-    }
-    ulist.clear();
-  } while(is_truncated);
-}
-
-TEST_F(DBStoreTest, BucketChown) {
-  int ret = -1;
-  RGWBucketInfo info;
-  rgw_user user;
-  user.id = "user_id2";
-
-  info.bucket.name = "bucket5";
-
-  ret = db->update_bucket(dpp, "owner", info, false, &user, nullptr, &bucket_mtime, nullptr);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(info.objv_tracker.read_version.ver, 3);
-}
-
-TEST_F(DBStoreTest, ListAllBuckets) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  ret = db->ListAllBuckets(dpp, &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, ListAllBuckets2) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  rgw_user owner;
-  int max = 2;
-  bool need_stats = true;
-  bool is_truncated = false;
-  RGWUserBuckets ulist;
-
-  marker1 = "";
-  do {
-    is_truncated = false;
-    ret = db->list_buckets(dpp, "all", owner, marker1, "", max, need_stats, &ulist,
-          &is_truncated);
-    ASSERT_EQ(ret, 0);
-
-    cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n";
-    cout << "ownerID : " << owner.id << "\n";
-    cout << "marker1 :" << marker1 << "\n";
-
-    cout << "is_truncated :" << is_truncated << "\n";
-
-    for (const auto& ent: ulist.get_buckets()) {
-      RGWBucketEnt e = ent.second;
-      cout << "###################### \n";
-      cout << "ent.bucket.id : " << e.bucket.name << "\n";
-      cout << "ent.bucket.marker : " << e.bucket.marker << "\n";
-      cout << "ent.bucket.bucket_id : " << e.bucket.bucket_id << "\n";
-      cout << "ent.size : " << e.size << "\n";
-      cout << "ent.rule.name : " << e.placement_rule.name << "\n";
-
-      marker1 = e.bucket.name;
-    }
-    ulist.clear();
-  } while(is_truncated);
-}
-
-TEST_F(DBStoreTest, RemoveBucketAPI) {
-  int ret = -1;
-  RGWBucketInfo info;
-
-  info.bucket.name = "bucket5";
-
-  ret = db->remove_bucket(dpp, info);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, RemoveUserAPI) {
-  int ret = -1;
-  RGWUserInfo uinfo;
-  RGWObjVersionTracker objv;
-
-  uinfo.user_id.tenant = "tenant";
-  uinfo.user_id.id = "user_id2";
-
-  /* invalid version number...should fail */
-  objv.read_version.ver = 4;
-  ret = db->remove_user(dpp, uinfo, &objv);
-  ASSERT_EQ(ret, -125);
-
-  objv.read_version.ver = 2;
-  ret = db->remove_user(dpp, uinfo, &objv);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, PutObject) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  params.op.obj.category = RGWObjCategory::Main;
-  params.op.obj.storage_class = "STANDARD";
-  bufferlist b1;
-  encode("HELLO WORLD", b1);
-  cout<<"XXXXXXXXX Insert b1.length " << b1.length() << "\n";
-  params.op.obj.head_data = b1;
-  params.op.obj.state.size = 12;
-  params.op.obj.state.is_olh = false;
-  ret = db->ProcessOp(dpp, "PutObject", &params);
-  ASSERT_EQ(ret, 0);
-
-  /* Insert another objects */
-  params.op.obj.state.obj.key.name = "object2";
-  params.op.obj.state.obj.key.instance = "inst2";
-  ret = db->ProcessOp(dpp, "PutObject", &params);
-  ASSERT_EQ(ret, 0);
-
-  params.op.obj.state.obj.key.name = "object3";
-  params.op.obj.state.obj.key.instance = "inst3";
-  ret = db->ProcessOp(dpp, "PutObject", &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, ListAllObjects) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  ret = db->ListAllObjects(dpp, &params);
-  ASSERT_GE(ret, 0);
-}
-
-TEST_F(DBStoreTest, GetObject) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  ret = db->ProcessOp(dpp, "GetObject", &params);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(params.op.obj.category, RGWObjCategory::Main);
-  ASSERT_EQ(params.op.obj.storage_class, "STANDARD");
-  string data;
-  decode(data, params.op.obj.head_data);
-  ASSERT_EQ(data, "HELLO WORLD");
-  ASSERT_EQ(params.op.obj.state.size, 12);
-  cout << "versionNum :" << params.op.obj.version_num << "\n";
-}
-
-TEST_F(DBStoreTest, GetObjectState) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  RGWObjState* s;
-
-  params.op.obj.state.obj.key.name = "object2";
-  params.op.obj.state.obj.key.instance = "inst2";
-  DB::Object op_target(db, params.op.bucket.info,
-      params.op.obj.state.obj);
-
-  ret = op_target.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
-      false, &s);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(s->size, 12);
-  ASSERT_EQ(s->is_olh, false);
-  cout << "versionNum :" << params.op.obj.version_num << "\n";
-
-  /* Recheck with get_state API */
-  ret = op_target.get_state(dpp, &s, false);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(s->size, 12);
-  ASSERT_EQ(s->is_olh, false);
-  cout << "versionNum :" << params.op.obj.version_num << "\n";
-}
-
-TEST_F(DBStoreTest, ObjAttrs) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  map<string, bufferlist> setattrs;
-  map<string, bufferlist> rmattrs;
-  map<string, bufferlist> readattrs;
-
-  bufferlist b1, b2, b3;
-  encode("ACL", b1);
-  setattrs[RGW_ATTR_ACL] = b1;
-  encode("LC", b2);
-  setattrs[RGW_ATTR_LC] = b2;
-  encode("ETAG", b3);
-  setattrs[RGW_ATTR_ETAG] = b3;
-
-  DB::Object op_target(db, params.op.bucket.info,
-      params.op.obj.state.obj);
-
-  /* Set some attrs */
-  ret = op_target.set_attrs(dpp, setattrs, nullptr);
-  ASSERT_EQ(ret, 0);
-
-  /* read those attrs */
-  DB::Object::Read read_op(&op_target);
-  read_op.params.attrs = &readattrs;
-  ret = read_op.prepare(dpp);
-  ASSERT_EQ(ret, 0);
-
-  string val;
-  decode(val, readattrs[RGW_ATTR_ACL]);
-  ASSERT_EQ(val, "ACL");
-  decode(val, readattrs[RGW_ATTR_LC]);
-  ASSERT_EQ(val, "LC");
-  decode(val, readattrs[RGW_ATTR_ETAG]);
-  ASSERT_EQ(val, "ETAG");
-
-  /* Remove some attrs */
-  rmattrs[RGW_ATTR_ACL] = b1;
-  map<string, bufferlist> empty;
-  ret = op_target.set_attrs(dpp, empty, &rmattrs);
-  ASSERT_EQ(ret, 0);
-
-  /* read those attrs */
-  ret = read_op.prepare(dpp);
-  ASSERT_EQ(ret, 0);
-
-  ASSERT_EQ(readattrs.count(RGW_ATTR_ACL), 0);
-  decode(val, readattrs[RGW_ATTR_LC]);
-  ASSERT_EQ(val, "LC");
-  decode(val, readattrs[RGW_ATTR_ETAG]);
-  ASSERT_EQ(val, "ETAG");
-}
-
-TEST_F(DBStoreTest, WriteObject) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  params.op.obj.state.obj.key.name = "object3";
-  params.op.obj.state.obj.key.instance = "inst3";
-  DB::Object op_target(db, params.op.bucket.info,
-      params.op.obj.state.obj);
-
-  bufferlist b1;
-  encode("HELLO WORLD - Object3", b1);
-  params.op.obj.head_data = b1;
-  params.op.obj.state.size = 22;
-
-  ret = write_object(dpp, params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, ReadObject) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  map<string, bufferlist> readattrs;
-  params.op.obj.state.obj.key.name = "object3";
-  params.op.obj.state.obj.key.instance = "inst3";
-  uint64_t obj_size;
-  DB::Object op_target(db, params.op.bucket.info,
-      params.op.obj.state.obj);
-  DB::Object::Read read_op(&op_target);
-  read_op.params.attrs = &readattrs;
-  read_op.params.obj_size = &obj_size;
-  ret = read_op.prepare(dpp);
-  ASSERT_EQ(ret, 0);
-
-  bufferlist bl;
-  ret = read_op.read(0, 25, bl, dpp);
-  cout<<"XXXXXXXXX Insert bl.length " << bl.length() << "\n";
-  ASSERT_EQ(ret, 25);
-
-  string data;
-  decode(data, bl);
-  ASSERT_EQ(data, "HELLO WORLD - Object3");
-  ASSERT_EQ(obj_size, 22);
-}
-
-TEST_F(DBStoreTest, IterateObject) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  map<string, bufferlist> readattrs;
-  uint64_t obj_size;
-  DBGetDataCB cb;
-
-  DB::Object op_target(db, params.op.bucket.info,
-      params.op.obj.state.obj);
-  DB::Object::Read read_op(&op_target);
-  read_op.params.attrs = &readattrs;
-  read_op.params.obj_size = &obj_size;
-  ret = read_op.prepare(dpp);
-  ASSERT_EQ(ret, 0);
-
-  bufferlist bl;
-  ret = read_op.iterate(dpp, 0, 15, &cb);
-  ASSERT_EQ(ret, 0);
-  string data;
-  decode(data, cb.data_bl);
-  cout << "XXXXXXXXXX iterate data is " << data << ", bl_ofs = " << cb.data_ofs << ", bl_len = " << cb.data_len << "\n";
-  ASSERT_EQ(data, "HELLO WORLD");
-  ASSERT_EQ(cb.data_ofs, 0);
-  ASSERT_EQ(cb.data_len, 15);
-}
-
-TEST_F(DBStoreTest, ListBucketObjects) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  
-  int max = 2;
-  bool is_truncated = false;
-  rgw_obj_key marker1;
-  DB::Bucket target(db, params.op.bucket.info);
-  DB::Bucket::List list_op(&target);
-
-  vector<rgw_bucket_dir_entry> dir_list;
-
-  marker1.name = "";
-  do {
-    is_truncated = false;
-    list_op.params.marker = marker1;
-    ret = list_op.list_objects(dpp, max, &dir_list, nullptr, &is_truncated);
-    ASSERT_EQ(ret, 0);
-
-    cout << "marker1 :" << marker1.name << "\n";
-
-    cout << "is_truncated :" << is_truncated << "\n";
-
-    for (const auto& ent: dir_list) {
-      cls_rgw_obj_key key = ent.key;
-      cout << "###################### \n";
-      cout << "key.name : " << key.name << "\n";
-      cout << "key.instance : " << key.instance << "\n";
-
-      marker1 = list_op.get_next_marker();
-    }
-    dir_list.clear();
-  } while(is_truncated);
-}
-
-TEST_F(DBStoreTest, DeleteObj) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  RGWObjState *s;
-
-  /* delete object2 */
-  params.op.obj.state.obj.key.name = "object2";
-  params.op.obj.state.obj.key.instance = "inst2";
-  DB::Object op_target(db, params.op.bucket.info,
-      params.op.obj.state.obj);
-
-  DB::Object::Delete delete_op(&op_target);
-  ret = delete_op.delete_obj(dpp);
-  ASSERT_EQ(ret, 0);
-
-  /* Should return ENOENT */
-  ret = op_target.get_state(dpp, &s, false);
-  ASSERT_EQ(ret, -2);
-}
-
-TEST_F(DBStoreTest, WriteVersionedObject) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  std::string instances[] = {"inst1", "inst2", "inst3"};
-  bufferlist b1;
-
-  params.op.obj.flags |= rgw_bucket_dir_entry::FLAG_CURRENT;
-  params.op.obj.state.obj.key.name = "object1";
-
-  /* Write versioned objects */
-  DB::Object op_target(db, params.op.bucket.info, params.op.obj.state.obj);
-  DB::Object::Write write_op(&op_target);
-
-  /* Version1 */
-  params.op.obj.state.obj.key.instance = instances[0];
-  encode("HELLO WORLD", b1);
-  params.op.obj.head_data = b1;
-  params.op.obj.state.size = 12;
-  ret = write_object(dpp, params);
-  ASSERT_EQ(ret, 0);
-
-  /* Version2 */
-  params.op.obj.state.obj.key.instance = instances[1];
-  b1.clear();
-  encode("HELLO WORLD ABC", b1);
-  params.op.obj.head_data = b1;
-  params.op.obj.state.size = 16;
-  ret = write_object(dpp, params);
-  ASSERT_EQ(ret, 0);
-
-  /* Version3 */
-  params.op.obj.state.obj.key.instance = instances[2];
-  b1.clear();
-  encode("HELLO WORLD A", b1);
-  params.op.obj.head_data = b1;
-  params.op.obj.state.size = 14;
-  ret = write_object(dpp, params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, ListVersionedObject) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  std::string instances[] = {"inst1", "inst2", "inst3"};
-  int i = 0;
-
-  /* list versioned objects */
-  params.op.obj.state.obj.key.instance.clear();
-  params.op.list_max_count = MAX_VERSIONED_OBJECTS;
-  ret = db->ProcessOp(dpp, "ListVersionedObjects", &params);
-  ASSERT_EQ(ret, 0);
-
-  i = 2;
-  for (auto ent: params.op.obj.list_entries) {
-
-
-    ASSERT_EQ(ent.key.instance, instances[i]);
-    i--;
-  }
-}
-
-TEST_F(DBStoreTest, ReadVersionedObject) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  std::string instances[] = {"inst1", "inst2", "inst3"};
-  std::string data;
-
-  /* read object.. should fetch latest version */
-  RGWObjState* s;
-  params = GlobalParams;
-  params.op.obj.state.obj.key.instance.clear();
-  DB::Object op_target2(db, params.op.bucket.info, params.op.obj.state.obj);
-  ret = op_target2.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
-                                 true, &s);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(s->obj.key.instance, instances[2]);
-  decode(data, s->data);
-  ASSERT_EQ(data, "HELLO WORLD A");
-  ASSERT_EQ(s->size, 14);
-
-  /* read a particular non-current version */
-  params.op.obj.state.obj.key.instance = instances[1];
-  DB::Object op_target3(db, params.op.bucket.info, params.op.obj.state.obj);
-  ret = op_target3.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
-                                 true, &s);
-  ASSERT_EQ(ret, 0);
-  decode(data, s->data);
-  ASSERT_EQ(data, "HELLO WORLD ABC");
-  ASSERT_EQ(s->size, 16);
-}
-
-TEST_F(DBStoreTest, DeleteVersionedObject) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  std::string instances[] = {"inst1", "inst2", "inst3"};
-  std::string data;
-  std::string dm_instance;
-  int i = 0;
-
-  /* Delete object..should create delete marker */
-  params.op.obj.state.obj.key.instance.clear();
-  DB::Object op_target(db, params.op.bucket.info, params.op.obj.state.obj);
-  DB::Object::Delete delete_op(&op_target);
-  delete_op.params.versioning_status |= BUCKET_VERSIONED;
-
-  ret = delete_op.delete_obj(dpp);
-  ASSERT_EQ(ret, 0);
-
-  /* list versioned objects */
-  params = GlobalParams;
-  params.op.obj.state.obj.key.instance.clear();
-  params.op.list_max_count = MAX_VERSIONED_OBJECTS;
-  ret = db->ProcessOp(dpp, "ListVersionedObjects", &params);
-
-  i = 3;
-  for (auto ent: params.op.obj.list_entries) {
-    string is_delete_marker = (ent.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER)? "true" : "false";
-    cout << "ent.name: " << ent.key.name << ". ent.instance: " << ent.key.instance << " is_delete_marker = " << is_delete_marker << "\n";
-
-    if (i == 3) {
-      ASSERT_EQ(is_delete_marker, "true");
-      dm_instance = ent.key.instance;
-    } else {
-      ASSERT_EQ(is_delete_marker, "false");
-      ASSERT_EQ(ent.key.instance, instances[i]);
-    }
-
-    i--;
-  }
-
-  /* read object.. should return -ENOENT */
-  RGWObjState* s;
-  params = GlobalParams;
-  params.op.obj.state.obj.key.instance.clear();
-  DB::Object op_target2(db, params.op.bucket.info, params.op.obj.state.obj);
-  ret = op_target2.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
-                                 true, &s);
-  ASSERT_EQ(ret, -ENOENT);
-
-  /* Delete delete marker..should be able to read object now */ 
-  params.op.obj.state.obj.key.instance = dm_instance;
-  DB::Object op_target3(db, params.op.bucket.info, params.op.obj.state.obj);
-  DB::Object::Delete delete_op2(&op_target3);
-  delete_op2.params.versioning_status |= BUCKET_VERSIONED;
-
-  ret = delete_op2.delete_obj(dpp);
-  ASSERT_EQ(ret, 0);
-
-  /* read object.. should fetch latest version */
-  params = GlobalParams;
-  params.op.obj.state.obj.key.instance.clear();
-  DB::Object op_target4(db, params.op.bucket.info, params.op.obj.state.obj);
-  ret = op_target4.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
-                                 true, &s);
-  ASSERT_EQ(s->obj.key.instance, instances[2]);
-  decode(data, s->data);
-  ASSERT_EQ(data, "HELLO WORLD A");
-  ASSERT_EQ(s->size, 14);
-
-  /* delete latest version using version-id. Next version should get promoted */
-  params.op.obj.state.obj.key.instance = instances[2];
-  DB::Object op_target5(db, params.op.bucket.info, params.op.obj.state.obj);
-  DB::Object::Delete delete_op3(&op_target5);
-  delete_op3.params.versioning_status |= BUCKET_VERSIONED;
-
-  ret = delete_op3.delete_obj(dpp);
-  ASSERT_EQ(ret, 0);
-
-  /* list versioned objects..only two versions should be present
-   * with second version marked as CURRENT */
-  params = GlobalParams;
-  params.op.obj.state.obj.key.instance.clear();
-  params.op.list_max_count = MAX_VERSIONED_OBJECTS;
-  ret = db->ProcessOp(dpp, "ListVersionedObjects", &params);
-
-  i = 1;
-  for (auto ent: params.op.obj.list_entries) {
-
-    if (i == 1) {
-      dm_instance = ent.key.instance;
-    } else {
-      ASSERT_EQ(ent.key.instance, instances[i]);
-    }
-
-    i--;
-  }
-
-}
-
-TEST_F(DBStoreTest, ObjectOmapSetVal) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  DB::Object op_target(db, params.op.bucket.info,
-      params.op.obj.state.obj);
-
-  string val = "part1_val";
-  bufferlist bl;
-  encode(val, bl);
-  ret = op_target.obj_omap_set_val_by_key(dpp, "part1", bl, false);
-  ASSERT_EQ(ret, 0);
-
-  val = "part2_val";
-  bl.clear();
-  encode(val, bl);
-  ret = op_target.obj_omap_set_val_by_key(dpp, "part2", bl, false);
-  ASSERT_EQ(ret, 0);
-
-  val = "part3_val";
-  bl.clear();
-  encode(val, bl);
-  ret = op_target.obj_omap_set_val_by_key(dpp, "part3", bl, false);
-  ASSERT_EQ(ret, 0);
-
-  val = "part4_val";
-  bl.clear();
-  encode(val, bl);
-  ret = op_target.obj_omap_set_val_by_key(dpp, "part4", bl, false);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, ObjectOmapGetValsByKeys) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  std::set<std::string> keys;
-  std::map<std::string, bufferlist> vals;
-
-  DB::Object op_target(db, params.op.bucket.info,
-      params.op.obj.state.obj);
-
-  keys.insert("part2");
-  keys.insert("part4");
-
-  ret = op_target.obj_omap_get_vals_by_keys(dpp, "", keys, &vals);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(vals.size(), 2);
-
-  string val;
-  decode(val, vals["part2"]);
-  ASSERT_EQ(val, "part2_val");
-  decode(val, vals["part4"]);
-  ASSERT_EQ(val, "part4_val");
-}
-
-TEST_F(DBStoreTest, ObjectOmapGetAll) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  std::map<std::string, bufferlist> vals;
-
-  DB::Object op_target(db, params.op.bucket.info,
-      params.op.obj.state.obj);
-
-  ret = op_target.obj_omap_get_all(dpp, &vals);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(vals.size(), 4);
-
-  string val;
-  decode(val, vals["part1"]);
-  ASSERT_EQ(val, "part1_val");
-  decode(val, vals["part2"]);
-  ASSERT_EQ(val, "part2_val");
-  decode(val, vals["part3"]);
-  ASSERT_EQ(val, "part3_val");
-  decode(val, vals["part4"]);
-  ASSERT_EQ(val, "part4_val");
-}
-
-TEST_F(DBStoreTest, ObjectOmapGetVals) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  std::set<std::string> keys;
-  std::map<std::string, bufferlist> vals;
-  bool pmore;
-
-  DB::Object op_target(db, params.op.bucket.info,
-      params.op.obj.state.obj);
-
-  ret = op_target.obj_omap_get_vals(dpp, "part3", 10, &vals, &pmore);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(vals.size(), 2);
-
-  string val;
-  decode(val, vals["part3"]);
-  ASSERT_EQ(val, "part3_val");
-  decode(val, vals["part4"]);
-  ASSERT_EQ(val, "part4_val");
-}
-
-TEST_F(DBStoreTest, PutObjectData) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  params.op.obj_data.part_num = 1;
-  params.op.obj_data.offset = 10;
-  params.op.obj_data.multipart_part_str = "2";
-  bufferlist b1;
-  encode("HELLO WORLD", b1);
-  params.op.obj_data.data = b1;
-  params.op.obj_data.size = 12;
-  params.op.obj.state.mtime = real_clock::now();
-  ret = db->ProcessOp(dpp, "PutObjectData", &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, UpdateObjectData) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  params.op.obj.state.mtime = bucket_mtime;
-  ret = db->ProcessOp(dpp, "UpdateObjectData", &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, GetObjectData) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  params.op.obj.state.obj.key.instance = "inst1";
-  params.op.obj.state.obj.key.name = "object1";
-  ret = db->ProcessOp(dpp, "GetObjectData", &params);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(params.op.obj_data.part_num, 1);
-  ASSERT_EQ(params.op.obj_data.offset, 10);
-  ASSERT_EQ(params.op.obj_data.multipart_part_str, "2");
-  ASSERT_EQ(params.op.obj.state.obj.key.instance, "inst1");
-  ASSERT_EQ(params.op.obj.state.obj.key.name, "object1");
-  ASSERT_EQ(params.op.obj.state.mtime, bucket_mtime);
-  string data;
-  decode(data, params.op.obj_data.data);
-  ASSERT_EQ(data, "HELLO WORLD");
-}
-
-TEST_F(DBStoreTest, DeleteObjectData) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  ret = db->ProcessOp(dpp, "DeleteObjectData", &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, DeleteObject) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  ret = db->ProcessOp(dpp, "DeleteObject", &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, LCTables) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  ret = db->createLCTables(dpp);
-  ASSERT_GE(ret, 0);
-}
-
-TEST_F(DBStoreTest, LCHead) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  std::string index1 = "bucket1";
-  std::string index2 = "bucket2";
-  time_t lc_time = ceph_clock_now();
-  std::unique_ptr<rgw::sal::Lifecycle::LCHead> head;
-  std::string ents[] = {"entry1", "entry2", "entry3"};
-  rgw::sal::StoreLifecycle::StoreLCHead head1(lc_time, 0, ents[0]);
-  rgw::sal::StoreLifecycle::StoreLCHead head2(lc_time, 0, ents[1]);
-  rgw::sal::StoreLifecycle::StoreLCHead head3(lc_time, 0, ents[2]);
-
-  ret = db->put_head(index1, head1);
-  ASSERT_EQ(ret, 0);
-  ret = db->put_head(index2, head2);
-  ASSERT_EQ(ret, 0);
-
-  ret = db->get_head(index1, &head);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(head->get_marker(), "entry1");
-
-  ret = db->get_head(index2, &head);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(head->get_marker(), "entry2");
-
-  // update index1
-  ret = db->put_head(index1, head3);
-  ASSERT_EQ(ret, 0);
-  ret = db->get_head(index1, &head);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(head->get_marker(), "entry3");
-
-}
-TEST_F(DBStoreTest, LCEntry) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-  uint64_t lc_time = ceph_clock_now();
-  std::string index1 = "lcindex1";
-  std::string index2 = "lcindex2";
-  typedef enum {lc_uninitial = 1, lc_complete} status;
-  std::string ents[] = {"bucket1", "bucket2", "bucket3", "bucket4"};
-  std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry;
-  rgw::sal::StoreLifecycle::StoreLCEntry entry1(ents[0], lc_time, lc_uninitial);
-  rgw::sal::StoreLifecycle::StoreLCEntry entry2(ents[1], lc_time, lc_uninitial);
-  rgw::sal::StoreLifecycle::StoreLCEntry entry3(ents[2], lc_time, lc_uninitial);
-  rgw::sal::StoreLifecycle::StoreLCEntry entry4(ents[3], lc_time, lc_uninitial);
-
-  vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> lc_entries;
-
-  ret = db->set_entry(index1, entry1);
-  ASSERT_EQ(ret, 0);
-  ret = db->set_entry(index1, entry2);
-  ASSERT_EQ(ret, 0);
-  ret = db->set_entry(index1, entry3);
-  ASSERT_EQ(ret, 0);
-  ret = db->set_entry(index2, entry4);
-  ASSERT_EQ(ret, 0);
-
-  // get entry index1, entry1
-  ret = db->get_entry(index1, ents[0], &entry); 
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(entry->get_status(), lc_uninitial);
-  ASSERT_EQ(entry->get_start_time(), lc_time);
-
-  // get next entry index1, entry2
-  ret = db->get_next_entry(index1, ents[1], &entry); 
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(entry->get_bucket(), ents[2]);
-  ASSERT_EQ(entry->get_status(), lc_uninitial);
-  ASSERT_EQ(entry->get_start_time(), lc_time);
-
-  // update entry4 to entry5
-  entry4.status = lc_complete;
-  ret = db->set_entry(index2, entry4);
-  ASSERT_EQ(ret, 0);
-  ret = db->get_entry(index2, ents[3], &entry); 
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(entry->get_status(), lc_complete);
-
-  // list entries
-  ret = db->list_entries(index1, "", 5, lc_entries);
-  ASSERT_EQ(ret, 0);
-  for (const auto& ent: lc_entries) {
-    cout << "###################### \n";
-    cout << "lc entry.bucket : " << ent->get_bucket() << "\n";
-    cout << "lc entry.status : " << ent->get_status() << "\n";
-  }
-
-  // remove index1, entry3
-  ret = db->rm_entry(index1, entry3); 
-  ASSERT_EQ(ret, 0);
-
-  // get next entry index1, entry2.. should be null
-  entry.release();
-  ret = db->get_next_entry(index1, ents[1], &entry); 
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(entry.get(), nullptr);
-}
-
-TEST_F(DBStoreTest, RemoveBucket) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  ret = db->ProcessOp(dpp, "RemoveBucket", &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, RemoveUser) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  ret = db->ProcessOp(dpp, "RemoveUser", &params);
-  ASSERT_EQ(ret, 0);
-}
-
-TEST_F(DBStoreTest, InsertTestIDUser) {
-  struct DBOpParams params = GlobalParams;
-  int ret = -1;
-
-  params.op.user.uinfo.user_id.id = "testid";
-  params.op.user.uinfo.display_name = "M. Tester";
-  params.op.user.uinfo.user_id.tenant = "tenant";
-  params.op.user.uinfo.user_email = "tester@ceph.com";
-  RGWAccessKey k1("0555b35654ad1656d804", "h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q==");
-  params.op.user.uinfo.access_keys["0555b35654ad1656d804"] = k1;
-  params.op.user.user_version.ver = 1;    
-  params.op.user.user_version.tag = "UserTAG";    
-
-  ret = db->ProcessOp(dpp, "InsertUser", &params);
-  ASSERT_EQ(ret, 0);
-}
-
-int main(int argc, char **argv)
-{
-  int ret = -1;
-  string c_logfile = "rgw_dbstore_tests.log";
-  int c_loglevel = 20;
-
-  // format: ./dbstore-tests logfile loglevel
-  if (argc == 3) {
-    c_logfile = argv[1];
-    c_loglevel = (atoi)(argv[2]);
-    cout << "logfile:" << c_logfile << ", loglevel set to " << c_loglevel << "\n";
-  }
-
-  ::testing::InitGoogleTest(&argc, argv);
-
-  gtest::env = new gtest::Environment();
-  gtest::env->logfile = c_logfile;
-  gtest::env->loglevel = c_loglevel;
-  ::testing::AddGlobalTestEnvironment(gtest::env);
-
-  ret = RUN_ALL_TESTS();
-
-  return ret;
-}
diff --git a/src/rgw/store/immutable_config/store.cc b/src/rgw/store/immutable_config/store.cc
deleted file mode 100644 (file)
index 8d3e076..0000000
+++ /dev/null
@@ -1,422 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "rgw_zone.h"
-#include "store.h"
-
-namespace rgw::sal {
-
-ImmutableConfigStore::ImmutableConfigStore(const RGWZoneGroup& zonegroup,
-                                           const RGWZoneParams& zone,
-                                           const RGWPeriodConfig& period_config)
-    : zonegroup(zonegroup), zone(zone), period_config(period_config)
-{
-}
-
-// Realm
-int ImmutableConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
-                                                 optional_yield y, bool exclusive,
-                                                 std::string_view realm_id)
-{
-  return -EROFS;
-}
-
-int ImmutableConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
-                                                optional_yield y,
-                                                std::string& realm_id)
-{
-  return -ENOENT;
-}
-
-int ImmutableConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
-                                                  optional_yield y)
-{
-  return -EROFS;
-}
-
-
-int ImmutableConfigStore::create_realm(const DoutPrefixProvider* dpp,
-                                       optional_yield y, bool exclusive,
-                                       const RGWRealm& info,
-                                       std::unique_ptr<RealmWriter>* writer)
-{
-  return -EROFS;
-}
-
-int ImmutableConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
-                                           optional_yield y,
-                                           std::string_view realm_id,
-                                           RGWRealm& info,
-                                           std::unique_ptr<RealmWriter>* writer)
-{
-  return -ENOENT;
-}
-
-int ImmutableConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
-                                             optional_yield y,
-                                             std::string_view realm_name,
-                                             RGWRealm& info,
-                                             std::unique_ptr<RealmWriter>* writer)
-{
-  return -ENOENT;
-}
-
-int ImmutableConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
-                                             optional_yield y,
-                                             RGWRealm& info,
-                                             std::unique_ptr<RealmWriter>* writer)
-{
-  return -ENOENT;
-}
-
-int ImmutableConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
-                                        optional_yield y, std::string_view realm_name,
-                                        std::string& realm_id)
-{
-  return -ENOENT;
-}
-
-int ImmutableConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
-                                                  optional_yield y,
-                                                  const RGWPeriod& period)
-{
-  return -ENOTSUP;
-}
-
-int ImmutableConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
-                                           optional_yield y, const std::string& marker,
-                                           std::span<std::string> entries,
-                                           ListResult<std::string>& result)
-{
-  result.next.clear();
-  result.entries = entries.first(0);
-  return 0;
-}
-
-
-// Period
-int ImmutableConfigStore::create_period(const DoutPrefixProvider* dpp,
-                                        optional_yield y, bool exclusive,
-                                        const RGWPeriod& info)
-{
-  return -EROFS;
-}
-
-int ImmutableConfigStore::read_period(const DoutPrefixProvider* dpp,
-                                      optional_yield y, std::string_view period_id,
-                                      std::optional<uint32_t> epoch, RGWPeriod& info)
-{
-  return -ENOENT;
-}
-
-int ImmutableConfigStore::delete_period(const DoutPrefixProvider* dpp,
-                                        optional_yield y,
-                                        std::string_view period_id)
-{
-  return -EROFS;
-}
-
-int ImmutableConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
-                                          optional_yield y, const std::string& marker,
-                                          std::span<std::string> entries,
-                                          ListResult<std::string>& result)
-{
-  result.next.clear();
-  result.entries = entries.first(0);
-  return 0;
-}
-
-
-// ZoneGroup
-
-class ImmutableZoneGroupWriter : public ZoneGroupWriter {
- public:
-  int write(const DoutPrefixProvider* dpp, optional_yield y,
-            const RGWZoneGroup& info) override
-  {
-    return -EROFS;
-  }
-  int rename(const DoutPrefixProvider* dpp, optional_yield y,
-             RGWZoneGroup& info, std::string_view new_name) override
-  {
-    return -EROFS;
-  }
-  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
-  {
-    return -EROFS;
-  }
-};
-
-int ImmutableConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                                     optional_yield y, bool exclusive,
-                                                     std::string_view realm_id,
-                                                     std::string_view zonegroup_id)
-{
-  return -EROFS;
-}
-
-int ImmutableConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                                    optional_yield y,
-                                                    std::string_view realm_id,
-                                                    std::string& zonegroup_id)
-{
-  if (!realm_id.empty()) {
-    return -ENOENT;
-  }
-  zonegroup_id = zonegroup.id;
-  return 0;
-}
-
-int ImmutableConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                                      optional_yield y,
-                                                      std::string_view realm_id)
-{
-  return -EROFS;
-}
-
-
-int ImmutableConfigStore::create_zonegroup(const DoutPrefixProvider* dpp,
-                                           optional_yield y, bool exclusive,
-                                           const RGWZoneGroup& info,
-                                           std::unique_ptr<ZoneGroupWriter>* writer)
-{
-  return -EROFS;
-}
-
-int ImmutableConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp,
-                                               optional_yield y,
-                                               std::string_view zonegroup_id,
-                                               RGWZoneGroup& info,
-                                               std::unique_ptr<ZoneGroupWriter>* writer)
-{
-  if (zonegroup_id != zonegroup.id) {
-    return -ENOENT;
-  }
-
-  info = zonegroup;
-
-  if (writer) {
-    *writer = std::make_unique<ImmutableZoneGroupWriter>();
-  }
-  return 0;
-}
-int ImmutableConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp,
-                                                 optional_yield y,
-                                                 std::string_view zonegroup_name,
-                                                 RGWZoneGroup& info,
-                                                 std::unique_ptr<ZoneGroupWriter>* writer)
-{
-  if (zonegroup_name != zonegroup.name) {
-    return -ENOENT;
-  }
-
-  info = zonegroup;
-
-  if (writer) {
-    *writer = std::make_unique<ImmutableZoneGroupWriter>();
-  }
-  return 0;
-}
-
-int ImmutableConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp,
-                                                 optional_yield y,
-                                                 std::string_view realm_id,
-                                                 RGWZoneGroup& info,
-                                                 std::unique_ptr<ZoneGroupWriter>* writer)
-{
-  info = zonegroup;
-
-  if (writer) {
-    *writer = std::make_unique<ImmutableZoneGroupWriter>();
-  }
-  return 0;
-}
-
-int ImmutableConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp,
-                                               optional_yield y, const std::string& marker,
-                                               std::span<std::string> entries,
-                                               ListResult<std::string>& result)
-{
-  if (marker < zonegroup.name) {
-    entries[0] = zonegroup.name;
-    result.next = zonegroup.name;
-    result.entries = entries.first(1);
-  } else {
-    result.next.clear();
-    result.entries = entries.first(0);
-  }
-  return 0;
-}
-
-// Zone
-
-class ImmutableZoneWriter : public ZoneWriter {
- public:
-  int write(const DoutPrefixProvider* dpp, optional_yield y,
-            const RGWZoneParams& info) override
-  {
-    return -EROFS;
-  }
-  int rename(const DoutPrefixProvider* dpp, optional_yield y,
-             RGWZoneParams& info, std::string_view new_name) override
-  {
-    return -EROFS;
-  }
-  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
-  {
-    return -EROFS;
-  }
-};
-
-int ImmutableConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp,
-                                                optional_yield y, bool exclusive,
-                                                std::string_view realm_id,
-                                                std::string_view zone_id)
-{
-  return -EROFS;
-}
-
-int ImmutableConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp,
-                                               optional_yield y,
-                                               std::string_view realm_id,
-                                               std::string& zone_id)
-{
-  if (realm_id.empty()) {
-    return -ENOENT;
-  }
-  zone_id = zone.id;
-  return 0;
-}
-
-int ImmutableConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp,
-                                                 optional_yield y,
-                                                 std::string_view realm_id)
-{
-  return -EROFS;
-}
-
-
-int ImmutableConfigStore::create_zone(const DoutPrefixProvider* dpp,
-                                      optional_yield y, bool exclusive,
-                                      const RGWZoneParams& info,
-                                      std::unique_ptr<ZoneWriter>* writer)
-{
-  return -EROFS;
-}
-
-int ImmutableConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp,
-                                          optional_yield y,
-                                          std::string_view zone_id,
-                                          RGWZoneParams& info,
-                                          std::unique_ptr<ZoneWriter>* writer)
-{
-  if (zone_id != zone.id) {
-    return -ENOENT;
-  }
-
-  info = zone;
-
-  if (writer) {
-    *writer = std::make_unique<ImmutableZoneWriter>();
-  }
-  return 0;
-}
-
-int ImmutableConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp,
-                                            optional_yield y,
-                                            std::string_view zone_name,
-                                            RGWZoneParams& info,
-                                            std::unique_ptr<ZoneWriter>* writer)
-{
-  if (zone_name != zone.name) {
-    return -ENOENT;
-  }
-
-  info = zone;
-
-  if (writer) {
-    *writer = std::make_unique<ImmutableZoneWriter>();
-  }
-  return 0;
-}
-
-int ImmutableConfigStore::read_default_zone(const DoutPrefixProvider* dpp,
-                                            optional_yield y,
-                                            std::string_view realm_id,
-                                            RGWZoneParams& info,
-                                            std::unique_ptr<ZoneWriter>* writer)
-{
-  if (!realm_id.empty()) {
-    return -ENOENT;
-  }
-
-  info = zone;
-
-  if (writer) {
-    *writer = std::make_unique<ImmutableZoneWriter>();
-  }
-  return 0;
-}
-
-int ImmutableConfigStore::list_zone_names(const DoutPrefixProvider* dpp,
-                                          optional_yield y, const std::string& marker,
-                                          std::span<std::string> entries,
-                                          ListResult<std::string>& result)
-{
-  if (marker < zone.name) {
-    entries[0] = zone.name;
-    result.next = zone.name;
-    result.entries = entries.first(1);
-  } else {
-    result.next.clear();
-    result.entries = entries.first(0);
-  }
-  return 0;
-}
-
-
-// PeriodConfig
-int ImmutableConfigStore::read_period_config(const DoutPrefixProvider* dpp,
-                                             optional_yield y,
-                                             std::string_view realm_id,
-                                             RGWPeriodConfig& info)
-{
-  if (!realm_id.empty()) {
-    return -ENOENT;
-  }
-
-  info = period_config;
-  return 0;
-}
-
-int ImmutableConfigStore::write_period_config(const DoutPrefixProvider* dpp,
-                                              optional_yield y, bool exclusive,
-                                              std::string_view realm_id,
-                                              const RGWPeriodConfig& info)
-{
-  return -EROFS;
-}
-
-
-/// ImmutableConfigStore factory function
-auto create_immutable_config_store(const DoutPrefixProvider* dpp,
-                                   const RGWZoneGroup& zonegroup,
-                                   const RGWZoneParams& zone,
-                                   const RGWPeriodConfig& period_config)
-  -> std::unique_ptr<ConfigStore>
-{
-  return std::make_unique<ImmutableConfigStore>(zonegroup, zone, period_config);
-}
-
-} // namespace rgw::sal
diff --git a/src/rgw/store/immutable_config/store.h b/src/rgw/store/immutable_config/store.h
deleted file mode 100644 (file)
index 9a1ac5f..0000000
+++ /dev/null
@@ -1,180 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include "rgw_sal_config.h"
-
-namespace rgw::sal {
-
-/// A read-only ConfigStore that serves the given default zonegroup and zone.
-class ImmutableConfigStore : public ConfigStore {
- public:
-  explicit ImmutableConfigStore(const RGWZoneGroup& zonegroup,
-                                const RGWZoneParams& zone,
-                                const RGWPeriodConfig& period_config);
-
-  // Realm
-  virtual int write_default_realm_id(const DoutPrefixProvider* dpp,
-                                     optional_yield y, bool exclusive,
-                                     std::string_view realm_id) override;
-  virtual int read_default_realm_id(const DoutPrefixProvider* dpp,
-                                    optional_yield y,
-                                    std::string& realm_id) override;
-  virtual int delete_default_realm_id(const DoutPrefixProvider* dpp,
-                                      optional_yield y) override;
-
-  virtual int create_realm(const DoutPrefixProvider* dpp,
-                           optional_yield y, bool exclusive,
-                           const RGWRealm& info,
-                           std::unique_ptr<RealmWriter>* writer) override;
-  virtual int read_realm_by_id(const DoutPrefixProvider* dpp,
-                               optional_yield y,
-                               std::string_view realm_id,
-                               RGWRealm& info,
-                               std::unique_ptr<RealmWriter>* writer) override;
-  virtual int read_realm_by_name(const DoutPrefixProvider* dpp,
-                                 optional_yield y,
-                                 std::string_view realm_name,
-                                 RGWRealm& info,
-                                 std::unique_ptr<RealmWriter>* writer) override;
-  virtual int read_default_realm(const DoutPrefixProvider* dpp,
-                                 optional_yield y,
-                                 RGWRealm& info,
-                                 std::unique_ptr<RealmWriter>* writer) override;
-  virtual int read_realm_id(const DoutPrefixProvider* dpp,
-                            optional_yield y, std::string_view realm_name,
-                            std::string& realm_id) override;
-  virtual int realm_notify_new_period(const DoutPrefixProvider* dpp,
-                                      optional_yield y,
-                                      const RGWPeriod& period) override;
-  virtual int list_realm_names(const DoutPrefixProvider* dpp,
-                               optional_yield y, const std::string& marker,
-                               std::span<std::string> entries,
-                               ListResult<std::string>& result) override;
-
-  // Period
-  virtual int create_period(const DoutPrefixProvider* dpp,
-                            optional_yield y, bool exclusive,
-                            const RGWPeriod& info) override;
-  virtual int read_period(const DoutPrefixProvider* dpp,
-                          optional_yield y, std::string_view period_id,
-                          std::optional<uint32_t> epoch, RGWPeriod& info) override;
-  virtual int delete_period(const DoutPrefixProvider* dpp,
-                            optional_yield y,
-                            std::string_view period_id) override;
-  virtual int list_period_ids(const DoutPrefixProvider* dpp,
-                              optional_yield y, const std::string& marker,
-                              std::span<std::string> entries,
-                              ListResult<std::string>& result) override;
-
-  // ZoneGroup
-  virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                         optional_yield y, bool exclusive,
-                                         std::string_view realm_id,
-                                         std::string_view zonegroup_id) override;
-  virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                        optional_yield y,
-                                        std::string_view realm_id,
-                                        std::string& zonegroup_id) override;
-  virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                          optional_yield y,
-                                          std::string_view realm_id) override;
-
-  virtual int create_zonegroup(const DoutPrefixProvider* dpp,
-                               optional_yield y, bool exclusive,
-                               const RGWZoneGroup& info,
-                               std::unique_ptr<ZoneGroupWriter>* writer) override;
-  virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
-                                   optional_yield y,
-                                   std::string_view zonegroup_id,
-                                   RGWZoneGroup& info,
-                                   std::unique_ptr<ZoneGroupWriter>* writer) override;
-  virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
-                                     optional_yield y,
-                                     std::string_view zonegroup_name,
-                                     RGWZoneGroup& info,
-                                     std::unique_ptr<ZoneGroupWriter>* writer) override;
-  virtual int read_default_zonegroup(const DoutPrefixProvider* dpp,
-                                     optional_yield y,
-                                     std::string_view realm_id,
-                                     RGWZoneGroup& info,
-                                     std::unique_ptr<ZoneGroupWriter>* writer) override;
-  virtual int list_zonegroup_names(const DoutPrefixProvider* dpp,
-                                   optional_yield y, const std::string& marker,
-                                   std::span<std::string> entries,
-                                   ListResult<std::string>& result) override;
-
-  // Zone
-  virtual int write_default_zone_id(const DoutPrefixProvider* dpp,
-                                    optional_yield y, bool exclusive,
-                                    std::string_view realm_id,
-                                    std::string_view zone_id) override;
-  virtual int read_default_zone_id(const DoutPrefixProvider* dpp,
-                                   optional_yield y,
-                                   std::string_view realm_id,
-                                   std::string& zone_id) override;
-  virtual int delete_default_zone_id(const DoutPrefixProvider* dpp,
-                                     optional_yield y,
-                                     std::string_view realm_id) override;
-
-  virtual int create_zone(const DoutPrefixProvider* dpp,
-                          optional_yield y, bool exclusive,
-                          const RGWZoneParams& info,
-                          std::unique_ptr<ZoneWriter>* writer) override;
-  virtual int read_zone_by_id(const DoutPrefixProvider* dpp,
-                              optional_yield y,
-                              std::string_view zone_id,
-                              RGWZoneParams& info,
-                              std::unique_ptr<ZoneWriter>* writer) override;
-  virtual int read_zone_by_name(const DoutPrefixProvider* dpp,
-                                optional_yield y,
-                                std::string_view zone_name,
-                                RGWZoneParams& info,
-                                std::unique_ptr<ZoneWriter>* writer) override;
-  virtual int read_default_zone(const DoutPrefixProvider* dpp,
-                                optional_yield y,
-                                std::string_view realm_id,
-                                RGWZoneParams& info,
-                                std::unique_ptr<ZoneWriter>* writer) override;
-  virtual int list_zone_names(const DoutPrefixProvider* dpp,
-                              optional_yield y, const std::string& marker,
-                              std::span<std::string> entries,
-                              ListResult<std::string>& result) override;
-
-  // PeriodConfig
-  virtual int read_period_config(const DoutPrefixProvider* dpp,
-                                 optional_yield y,
-                                 std::string_view realm_id,
-                                 RGWPeriodConfig& info) override;
-  virtual int write_period_config(const DoutPrefixProvider* dpp,
-                                  optional_yield y, bool exclusive,
-                                  std::string_view realm_id,
-                                  const RGWPeriodConfig& info) override;
-
- private:
-  const RGWZoneGroup zonegroup;
-  const RGWZoneParams zone;
-  const RGWPeriodConfig period_config;
-}; // ImmutableConfigStore
-
-
-/// ImmutableConfigStore factory function
-auto create_immutable_config_store(const DoutPrefixProvider* dpp,
-                                   const RGWZoneGroup& zonegroup,
-                                   const RGWZoneParams& zone,
-                                   const RGWPeriodConfig& period_config)
-  -> std::unique_ptr<ConfigStore>;
-
-} // namespace rgw::sal
diff --git a/src/rgw/store/json_config/store.cc b/src/rgw/store/json_config/store.cc
deleted file mode 100644 (file)
index 49837a8..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include <system_error>
-#include "include/buffer.h"
-#include "common/errno.h"
-#include "common/ceph_json.h"
-#include "rgw_zone.h"
-#include "store/immutable_config/store.h"
-#include "store.h"
-
-namespace rgw::sal {
-
-namespace {
-
-struct DecodedConfig {
-  RGWZoneGroup zonegroup;
-  RGWZoneParams zone;
-  RGWPeriodConfig period_config;
-
-  void decode_json(JSONObj *obj)
-  {
-    JSONDecoder::decode_json("zonegroup", zonegroup, obj);
-    JSONDecoder::decode_json("zone", zone, obj);
-    JSONDecoder::decode_json("period_config", period_config, obj);
-  }
-};
-
-static void parse_config(const DoutPrefixProvider* dpp, const char* filename)
-{
-  bufferlist bl;
-  std::string errmsg;
-  int r = bl.read_file(filename, &errmsg);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "failed to read json config file '" << filename
-        << "': " << errmsg << dendl;
-    throw std::system_error(-r, std::system_category());
-  }
-
-  JSONParser p;
-  if (!p.parse(bl.c_str(), bl.length())) {
-    ldpp_dout(dpp, 0) << "failed to parse json config file" << dendl;
-    throw std::system_error(make_error_code(std::errc::invalid_argument));
-  }
-
-  DecodedConfig config;
-  try {
-    decode_json_obj(config, &p);
-  } catch (const JSONDecoder::err& e) {
-    ldpp_dout(dpp, 0) << "failed to decode JSON input: " << e.what() << dendl;
-    throw std::system_error(make_error_code(std::errc::invalid_argument));
-  }
-}
-
-void sanity_check_config(const DoutPrefixProvider* dpp, DecodedConfig& config)
-{
-  if (config.zonegroup.id.empty()) {
-    config.zonegroup.id = "default";
-  }
-  if (config.zonegroup.name.empty()) {
-    config.zonegroup.name = "default";
-  }
-  if (config.zonegroup.api_name.empty()) {
-    config.zonegroup.api_name = config.zonegroup.name;
-  }
-
-  if (config.zone.id.empty()) {
-    config.zone.id = "default";
-  }
-  if (config.zone.name.empty()) {
-    config.zone.name = "default";
-  }
-
-  // add default placement if it doesn't exist
-  rgw_pool pool;
-  RGWZonePlacementInfo placement;
-  placement.storage_classes.set_storage_class(
-      RGW_STORAGE_CLASS_STANDARD, &pool, nullptr);
-  config.zone.placement_pools.emplace("default-placement",
-                                      std::move(placement));
-
-  std::set<rgw_pool> pools;
-  int r = rgw::init_zone_pool_names(dpp, null_yield, pools, config.zone);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "failed to set default zone pool names" << dendl;
-    throw std::system_error(-r, std::system_category());
-  }
-
-  // verify that config.zonegroup only contains config.zone
-  if (config.zonegroup.zones.size() > 1) {
-    ldpp_dout(dpp, 0) << "zonegroup cannot contain multiple zones" << dendl;
-    throw std::system_error(make_error_code(std::errc::invalid_argument));
-  }
-
-  if (config.zonegroup.zones.size() == 1) {
-    auto z = config.zonegroup.zones.begin();
-    if (z->first != config.zone.id) {
-      ldpp_dout(dpp, 0) << "zonegroup contains unknown zone id="
-          << z->first << dendl;
-      throw std::system_error(make_error_code(std::errc::invalid_argument));
-    }
-    if (z->second.id != config.zone.id) {
-      ldpp_dout(dpp, 0) << "zonegroup contains unknown zone id="
-          << z->second.id << dendl;
-      throw std::system_error(make_error_code(std::errc::invalid_argument));
-    }
-    if (z->second.name != config.zone.name) {
-      ldpp_dout(dpp, 0) << "zonegroup contains unknown zone name="
-          << z->second.name << dendl;
-      throw std::system_error(make_error_code(std::errc::invalid_argument));
-    }
-    if (config.zonegroup.master_zone != config.zone.id) {
-      ldpp_dout(dpp, 0) << "zonegroup contains unknown master_zone="
-          << config.zonegroup.master_zone << dendl;
-      throw std::system_error(make_error_code(std::errc::invalid_argument));
-    }
-  } else {
-    // add the zone to the group
-    const bool is_master = true;
-    const bool read_only = false;
-    std::list<std::string> endpoints;
-    std::list<std::string> sync_from;
-    std::list<std::string> sync_from_rm;
-    rgw::zone_features::set enable_features;
-    rgw::zone_features::set disable_features;
-
-    enable_features.insert(rgw::zone_features::supported.begin(),
-                           rgw::zone_features::supported.end());
-
-    int r = rgw::add_zone_to_group(dpp, config.zonegroup, config.zone,
-                                   &is_master, &read_only, endpoints,
-                                   nullptr, nullptr, sync_from, sync_from_rm,
-                                   nullptr, std::nullopt,
-                                   enable_features, disable_features);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "failed to add zone to zonegroup: "
-          << cpp_strerror(r) << dendl;
-      throw std::system_error(-r, std::system_category());
-    }
-
-    config.zonegroup.enabled_features = std::move(enable_features);
-  }
-
-  // insert the default placement target if it doesn't exist
-  auto target = RGWZoneGroupPlacementTarget{.name = "default-placement"};
-  config.zonegroup.placement_targets.emplace(target.name, target);
-  if (config.zonegroup.default_placement.name.empty()) {
-    config.zonegroup.default_placement.name = target.name;
-  }
-}
-
-} // anonymous namespace
-
-auto create_json_config_store(const DoutPrefixProvider* dpp,
-                              const std::string& filename)
-    -> std::unique_ptr<ConfigStore>
-{
-  DecodedConfig config;
-  parse_config(dpp, filename.c_str());
-  sanity_check_config(dpp, config);
-  return create_immutable_config_store(dpp, config.zonegroup, config.zone,
-                                       config.period_config);
-}
-
-} // namespace rgw::sal
diff --git a/src/rgw/store/json_config/store.h b/src/rgw/store/json_config/store.h
deleted file mode 100644 (file)
index 63ddf6f..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include "store/immutable_config/store.h"
-
-namespace rgw::sal {
-
-/// Create an immutable ConfigStore by parsing the zonegroup and zone from the
-/// given json filename.
-auto create_json_config_store(const DoutPrefixProvider* dpp,
-                              const std::string& filename)
-    -> std::unique_ptr<ConfigStore>;
-
-} // namespace rgw::sal
diff --git a/src/rgw/store/rados/cls_fifo_legacy.cc b/src/rgw/store/rados/cls_fifo_legacy.cc
deleted file mode 100644 (file)
index 23b39b9..0000000
+++ /dev/null
@@ -1,2484 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2020 Red Hat <contact@redhat.com>
- * Author: Adam C. Emerson
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#include <cstdint>
-#include <numeric>
-#include <optional>
-#include <string_view>
-
-#undef FMT_HEADER_ONLY
-#define FMT_HEADER_ONLY 1
-#include <fmt/format.h>
-
-#include "include/rados/librados.hpp"
-
-#include "include/buffer.h"
-
-#include "common/async/yield_context.h"
-#include "common/random_string.h"
-
-#include "cls/fifo/cls_fifo_types.h"
-#include "cls/fifo/cls_fifo_ops.h"
-
-#include "cls_fifo_legacy.h"
-
-namespace rgw::cls::fifo {
-namespace cb = ceph::buffer;
-namespace fifo = rados::cls::fifo;
-
-using ceph::from_error_code;
-
-inline constexpr auto MAX_RACE_RETRIES = 10;
-
-void create_meta(lr::ObjectWriteOperation* op,
-                std::string_view id,
-                std::optional<fifo::objv> objv,
-                std::optional<std::string_view> oid_prefix,
-                bool exclusive,
-                std::uint64_t max_part_size,
-                std::uint64_t max_entry_size)
-{
-  fifo::op::create_meta cm;
-
-  cm.id = id;
-  cm.version = objv;
-  cm.oid_prefix = oid_prefix;
-  cm.max_part_size = max_part_size;
-  cm.max_entry_size = max_entry_size;
-  cm.exclusive = exclusive;
-
-  cb::list in;
-  encode(cm, in);
-  op->exec(fifo::op::CLASS, fifo::op::CREATE_META, in);
-}
-
-int get_meta(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
-            std::optional<fifo::objv> objv, fifo::info* info,
-            std::uint32_t* part_header_size,
-            std::uint32_t* part_entry_overhead,
-            uint64_t tid, optional_yield y,
-            bool probe)
-{
-  lr::ObjectReadOperation op;
-  fifo::op::get_meta gm;
-  gm.version = objv;
-  cb::list in;
-  encode(gm, in);
-  cb::list bl;
-
-  op.exec(fifo::op::CLASS, fifo::op::GET_META, in,
-         &bl, nullptr);
-  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
-  if (r >= 0) try {
-      fifo::op::get_meta_reply reply;
-      auto iter = bl.cbegin();
-      decode(reply, iter);
-      if (info) *info = std::move(reply.info);
-      if (part_header_size) *part_header_size = reply.part_header_size;
-      if (part_entry_overhead)
-       *part_entry_overhead = reply.part_entry_overhead;
-    } catch (const cb::error& err) {
-      ldpp_dout(dpp, -1)
-       << __PRETTY_FUNCTION__ << ":" << __LINE__
-       << " decode failed: " << err.what()
-       << " tid=" << tid << dendl;
-      r = from_error_code(err.code());
-    } else if (!(probe && (r == -ENOENT || r == -ENODATA))) {
-    ldpp_dout(dpp, -1)
-      << __PRETTY_FUNCTION__ << ":" << __LINE__
-      << " fifo::op::GET_META failed r=" << r << " tid=" << tid
-      << dendl;
-  }
-  return r;
-};
-
-namespace {
-void update_meta(lr::ObjectWriteOperation* op, const fifo::objv& objv,
-                const fifo::update& update)
-{
-  fifo::op::update_meta um;
-
-  um.version = objv;
-  um.tail_part_num = update.tail_part_num();
-  um.head_part_num = update.head_part_num();
-  um.min_push_part_num = update.min_push_part_num();
-  um.max_push_part_num = update.max_push_part_num();
-  um.journal_entries_add = std::move(update).journal_entries_add();
-  um.journal_entries_rm = std::move(update).journal_entries_rm();
-
-  cb::list in;
-  encode(um, in);
-  op->exec(fifo::op::CLASS, fifo::op::UPDATE_META, in);
-}
-
-void part_init(lr::ObjectWriteOperation* op, std::string_view tag,
-              fifo::data_params params)
-{
-  fifo::op::init_part ip;
-
-  ip.tag = tag;
-  ip.params = params;
-
-  cb::list in;
-  encode(ip, in);
-  op->exec(fifo::op::CLASS, fifo::op::INIT_PART, in);
-}
-
-int push_part(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid, std::string_view tag,
-             std::deque<cb::list> data_bufs, std::uint64_t tid,
-             optional_yield y)
-{
-  lr::ObjectWriteOperation op;
-  fifo::op::push_part pp;
-
-  pp.tag = tag;
-  pp.data_bufs = data_bufs;
-  pp.total_len = 0;
-
-  for (const auto& bl : data_bufs)
-    pp.total_len += bl.length();
-
-  cb::list in;
-  encode(pp, in);
-  auto retval = 0;
-  op.exec(fifo::op::CLASS, fifo::op::PUSH_PART, in, nullptr, &retval);
-  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y, lr::OPERATION_RETURNVEC);
-  if (r < 0) {
-    ldpp_dout(dpp, -1)
-      << __PRETTY_FUNCTION__ << ":" << __LINE__
-      << " fifo::op::PUSH_PART failed r=" << r
-      << " tid=" << tid << dendl;
-    return r;
-  }
-  if (retval < 0) {
-    ldpp_dout(dpp, -1)
-      << __PRETTY_FUNCTION__ << ":" << __LINE__
-      << " error handling response retval=" << retval
-      << " tid=" << tid << dendl;
-  }
-  return retval;
-}
-
-void push_part(lr::IoCtx& ioctx, const std::string& oid, std::string_view tag,
-              std::deque<cb::list> data_bufs, std::uint64_t tid,
-              lr::AioCompletion* c)
-{
-  lr::ObjectWriteOperation op;
-  fifo::op::push_part pp;
-
-  pp.tag = tag;
-  pp.data_bufs = data_bufs;
-  pp.total_len = 0;
-
-  for (const auto& bl : data_bufs)
-    pp.total_len += bl.length();
-
-  cb::list in;
-  encode(pp, in);
-  op.exec(fifo::op::CLASS, fifo::op::PUSH_PART, in);
-  auto r = ioctx.aio_operate(oid, c, &op, lr::OPERATION_RETURNVEC);
-  ceph_assert(r >= 0);
-}
-
-void trim_part(lr::ObjectWriteOperation* op,
-              std::optional<std::string_view> tag,
-              std::uint64_t ofs, bool exclusive)
-{
-  fifo::op::trim_part tp;
-
-  tp.tag = tag;
-  tp.ofs = ofs;
-  tp.exclusive = exclusive;
-
-  cb::list in;
-  encode(tp, in);
-  op->exec(fifo::op::CLASS, fifo::op::TRIM_PART, in);
-}
-
-int list_part(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
-             std::optional<std::string_view> tag, std::uint64_t ofs,
-             std::uint64_t max_entries,
-             std::vector<fifo::part_list_entry>* entries,
-             bool* more, bool* full_part, std::string* ptag,
-             std::uint64_t tid, optional_yield y)
-{
-  lr::ObjectReadOperation op;
-  fifo::op::list_part lp;
-
-  lp.tag = tag;
-  lp.ofs = ofs;
-  lp.max_entries = max_entries;
-
-  cb::list in;
-  encode(lp, in);
-  cb::list bl;
-  op.exec(fifo::op::CLASS, fifo::op::LIST_PART, in, &bl, nullptr);
-  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
-  if (r >= 0) try {
-      fifo::op::list_part_reply reply;
-      auto iter = bl.cbegin();
-      decode(reply, iter);
-      if (entries) *entries = std::move(reply.entries);
-      if (more) *more = reply.more;
-      if (full_part) *full_part = reply.full_part;
-      if (ptag) *ptag = reply.tag;
-    } catch (const cb::error& err) {
-      ldpp_dout(dpp, -1)
-       << __PRETTY_FUNCTION__ << ":" << __LINE__
-       << " decode failed: " << err.what()
-       << " tid=" << tid << dendl;
-      r = from_error_code(err.code());
-    } else if (r != -ENOENT) {
-    ldpp_dout(dpp, -1)
-      << __PRETTY_FUNCTION__ << ":" << __LINE__
-      << " fifo::op::LIST_PART failed r=" << r << " tid=" << tid
-      << dendl;
-  }
-  return r;
-}
-
-struct list_entry_completion : public lr::ObjectOperationCompletion {
-  CephContext* cct;
-  int* r_out;
-  std::vector<fifo::part_list_entry>* entries;
-  bool* more;
-  bool* full_part;
-  std::string* ptag;
-  std::uint64_t tid;
-
-  list_entry_completion(CephContext* cct, int* r_out, std::vector<fifo::part_list_entry>* entries,
-                       bool* more, bool* full_part, std::string* ptag,
-                       std::uint64_t tid)
-    : cct(cct), r_out(r_out), entries(entries), more(more),
-      full_part(full_part), ptag(ptag), tid(tid) {}
-  virtual ~list_entry_completion() = default;
-  void handle_completion(int r, bufferlist& bl) override {
-    if (r >= 0) try {
-       fifo::op::list_part_reply reply;
-       auto iter = bl.cbegin();
-       decode(reply, iter);
-       if (entries) *entries = std::move(reply.entries);
-       if (more) *more = reply.more;
-       if (full_part) *full_part = reply.full_part;
-       if (ptag) *ptag = reply.tag;
-      } catch (const cb::error& err) {
-       lderr(cct)
-         << __PRETTY_FUNCTION__ << ":" << __LINE__
-         << " decode failed: " << err.what()
-         << " tid=" << tid << dendl;
-       r = from_error_code(err.code());
-      } else if (r < 0) {
-      lderr(cct)
-       << __PRETTY_FUNCTION__ << ":" << __LINE__
-       << " fifo::op::LIST_PART failed r=" << r << " tid=" << tid
-       << dendl;
-    }
-    if (r_out) *r_out = r;
-  }
-};
-
-lr::ObjectReadOperation list_part(CephContext* cct,
-                                 std::optional<std::string_view> tag,
-                                 std::uint64_t ofs,
-                                 std::uint64_t max_entries,
-                                 int* r_out,
-                                 std::vector<fifo::part_list_entry>* entries,
-                                 bool* more, bool* full_part,
-                                 std::string* ptag, std::uint64_t tid)
-{
-  lr::ObjectReadOperation op;
-  fifo::op::list_part lp;
-
-  lp.tag = tag;
-  lp.ofs = ofs;
-  lp.max_entries = max_entries;
-
-  cb::list in;
-  encode(lp, in);
-  op.exec(fifo::op::CLASS, fifo::op::LIST_PART, in,
-         new list_entry_completion(cct, r_out, entries, more, full_part,
-                                   ptag, tid));
-  return op;
-}
-
-int get_part_info(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
-                 fifo::part_header* header,
-                 std::uint64_t tid, optional_yield y)
-{
-  lr::ObjectReadOperation op;
-  fifo::op::get_part_info gpi;
-
-  cb::list in;
-  cb::list bl;
-  encode(gpi, in);
-  op.exec(fifo::op::CLASS, fifo::op::GET_PART_INFO, in, &bl, nullptr);
-  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
-  if (r >= 0) try {
-      fifo::op::get_part_info_reply reply;
-      auto iter = bl.cbegin();
-      decode(reply, iter);
-      if (header) *header = std::move(reply.header);
-    } catch (const cb::error& err) {
-      ldpp_dout(dpp, -1)
-       << __PRETTY_FUNCTION__ << ":" << __LINE__
-       << " decode failed: " << err.what()
-       << " tid=" << tid << dendl;
-      r = from_error_code(err.code());
-    } else {
-    ldpp_dout(dpp, -1)
-      << __PRETTY_FUNCTION__ << ":" << __LINE__
-      << " fifo::op::GET_PART_INFO failed r=" << r << " tid=" << tid
-      << dendl;
-  }
-  return r;
-}
-
-struct partinfo_completion : public lr::ObjectOperationCompletion {
-  CephContext* cct;
-  int* rp;
-  fifo::part_header* h;
-  std::uint64_t tid;
-  partinfo_completion(CephContext* cct, int* rp, fifo::part_header* h,
-                     std::uint64_t tid) :
-    cct(cct), rp(rp), h(h), tid(tid) {
-  }
-  virtual ~partinfo_completion() = default;
-  void handle_completion(int r, bufferlist& bl) override {
-    if (r >= 0) try {
-       fifo::op::get_part_info_reply reply;
-       auto iter = bl.cbegin();
-       decode(reply, iter);
-       if (h) *h = std::move(reply.header);
-      } catch (const cb::error& err) {
-       r = from_error_code(err.code());
-       lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " decode failed: " << err.what()
-                  << " tid=" << tid << dendl;
-      } else {
-      lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " fifo::op::GET_PART_INFO failed r=" << r << " tid=" << tid
-                << dendl;
-    }
-    if (rp) {
-      *rp = r;
-    }
-  }
-};
-
-lr::ObjectReadOperation get_part_info(CephContext* cct,
-                                     fifo::part_header* header,
-                                     std::uint64_t tid, int* r = 0)
-{
-  lr::ObjectReadOperation op;
-  fifo::op::get_part_info gpi;
-
-  cb::list in;
-  cb::list bl;
-  encode(gpi, in);
-  op.exec(fifo::op::CLASS, fifo::op::GET_PART_INFO, in,
-         new partinfo_completion(cct, r, header, tid));
-  return op;
-}
-}
-
-std::optional<marker> FIFO::to_marker(std::string_view s)
-{
-  marker m;
-  if (s.empty()) {
-    m.num = info.tail_part_num;
-    m.ofs = 0;
-    return m;
-  }
-
-  auto pos = s.find(':');
-  if (pos == s.npos) {
-    return std::nullopt;
-  }
-
-  auto num = s.substr(0, pos);
-  auto ofs = s.substr(pos + 1);
-
-  auto n = ceph::parse<decltype(m.num)>(num);
-  if (!n) {
-    return std::nullopt;
-  }
-  m.num = *n;
-  auto o = ceph::parse<decltype(m.ofs)>(ofs);
-  if (!o) {
-    return std::nullopt;
-  }
-  m.ofs = *o;
-  return m;
-}
-
-std::string FIFO::generate_tag() const
-{
-  static constexpr auto HEADER_TAG_SIZE = 16;
-  return gen_rand_alphanumeric_plain(static_cast<CephContext*>(ioctx.cct()),
-                                    HEADER_TAG_SIZE);
-}
-
-
-int FIFO::apply_update(const DoutPrefixProvider *dpp,
-                       fifo::info* info,
-                      const fifo::objv& objv,
-                      const fifo::update& update,
-                      std::uint64_t tid)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  std::unique_lock l(m);
-  if (objv != info->version) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " version mismatch, canceling: tid=" << tid << dendl;
-    return -ECANCELED;
-  }
-  auto err = info->apply_update(update);
-  if (err) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " error applying update: " << *err << " tid=" << tid << dendl;
-    return -ECANCELED;
-  }
-
-  ++info->version.ver;
-
-  return {};
-}
-
-int FIFO::_update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
-                      fifo::objv version, bool* pcanceled,
-                      std::uint64_t tid, optional_yield y)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  lr::ObjectWriteOperation op;
-  bool canceled = false;
-  update_meta(&op, info.version, update);
-  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
-  if (r >= 0 || r == -ECANCELED) {
-    canceled = (r == -ECANCELED);
-    if (!canceled) {
-      r = apply_update(dpp, &info, version, update, tid);
-      if (r < 0) canceled = true;
-    }
-    if (canceled) {
-      r = read_meta(dpp, tid, y);
-      canceled = r < 0 ? false : true;
-    }
-  }
-  if (pcanceled) *pcanceled = canceled;
-  if (canceled) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " canceled: tid=" << tid << dendl;
-  }
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " returning error: r=" << r << " tid=" << tid << dendl;
-  }
-  return r;
-}
-
-struct Updater : public Completion<Updater> {
-  FIFO* fifo;
-  fifo::update update;
-  fifo::objv version;
-  bool reread = false;
-  bool* pcanceled = nullptr;
-  std::uint64_t tid;
-  Updater(const DoutPrefixProvider *dpp, FIFO* fifo, lr::AioCompletion* super,
-         const fifo::update& update, fifo::objv version,
-         bool* pcanceled, std::uint64_t tid)
-    : Completion(dpp, super), fifo(fifo), update(update), version(version),
-      pcanceled(pcanceled) {}
-
-  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " entering: tid=" << tid << dendl;
-    if (reread)
-      handle_reread(dpp, std::move(p), r);
-    else
-      handle_update(dpp, std::move(p), r);
-  }
-
-  void handle_update(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " handling async update_meta: tid="
-                        << tid << dendl;
-    if (r < 0 && r != -ECANCELED) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " update failed: r=" << r << " tid=" << tid << dendl;
-      complete(std::move(p), r);
-      return;
-    }
-    bool canceled = (r == -ECANCELED);
-    if (!canceled) {
-      int r = fifo->apply_update(dpp, &fifo->info, version, update, tid);
-      if (r < 0) {
-       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                            << " update failed, marking canceled: r=" << r
-                            << " tid=" << tid << dendl;
-       canceled = true;
-      }
-    }
-    if (canceled) {
-      reread = true;
-      fifo->read_meta(dpp, tid, call(std::move(p)));
-      return;
-    }
-    if (pcanceled)
-      *pcanceled = false;
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " completing: tid=" << tid << dendl;
-    complete(std::move(p), 0);
-  }
-
-  void handle_reread(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " handling async read_meta: tid="
-                        << tid << dendl;
-    if (r < 0 && pcanceled) {
-      *pcanceled = false;
-    } else if (r >= 0 && pcanceled) {
-      *pcanceled = true;
-    }
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                      << " failed dispatching read_meta: r=" << r << " tid="
-                      << tid << dendl;
-    } else {
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                          << " completing: tid=" << tid << dendl;
-    }
-    complete(std::move(p), r);
-  }
-};
-
-void FIFO::_update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
-                       fifo::objv version, bool* pcanceled,
-                       std::uint64_t tid, lr::AioCompletion* c)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  lr::ObjectWriteOperation op;
-  update_meta(&op, info.version, update);
-  auto updater = std::make_unique<Updater>(dpp, this, c, update, version, pcanceled,
-                                          tid);
-  auto r = ioctx.aio_operate(oid, Updater::call(std::move(updater)), &op);
-  assert(r >= 0);
-}
-
-int FIFO::create_part(const DoutPrefixProvider *dpp, int64_t part_num, std::string_view tag, std::uint64_t tid,
-                     optional_yield y)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  lr::ObjectWriteOperation op;
-  op.create(false); /* We don't need exclusivity, part_init ensures
-                      we're creating from the same journal entry. */
-  std::unique_lock l(m);
-  part_init(&op, tag, info.params);
-  auto oid = info.part_oid(part_num);
-  l.unlock();
-  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " part_init failed: r=" << r << " tid="
-              << tid << dendl;
-  }
-  return r;
-}
-
-int FIFO::remove_part(const DoutPrefixProvider *dpp, int64_t part_num, std::string_view tag, std::uint64_t tid,
-                     optional_yield y)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  lr::ObjectWriteOperation op;
-  op.remove();
-  std::unique_lock l(m);
-  auto oid = info.part_oid(part_num);
-  l.unlock();
-  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " remove failed: r=" << r << " tid="
-              << tid << dendl;
-  }
-  return r;
-}
-
-int FIFO::process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  std::vector<fifo::journal_entry> processed;
-
-  std::unique_lock l(m);
-  auto tmpjournal = info.journal;
-  auto new_tail = info.tail_part_num;
-  auto new_head = info.head_part_num;
-  auto new_max = info.max_push_part_num;
-  l.unlock();
-
-  int r = 0;
-  for (auto& [n, entry] : tmpjournal) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " processing entry: entry=" << entry << " tid=" << tid
-                  << dendl;
-    switch (entry.op) {
-    case fifo::journal_entry::Op::create:
-      r = create_part(dpp, entry.part_num, entry.part_tag, tid, y);
-      if (entry.part_num > new_max) {
-       new_max = entry.part_num;
-      }
-      break;
-    case fifo::journal_entry::Op::set_head:
-      r = 0;
-      if (entry.part_num > new_head) {
-       new_head = entry.part_num;
-      }
-      break;
-    case fifo::journal_entry::Op::remove:
-      r = remove_part(dpp, entry.part_num, entry.part_tag, tid, y);
-      if (r == -ENOENT) r = 0;
-      if (entry.part_num >= new_tail) {
-       new_tail = entry.part_num + 1;
-      }
-      break;
-    default:
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " unknown journaled op: entry=" << entry << " tid="
-                << tid << dendl;
-      return -EIO;
-    }
-
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " processing entry failed: entry=" << entry
-                << " r=" << r << " tid=" << tid << dendl;
-      return -r;
-    }
-
-    processed.push_back(std::move(entry));
-  }
-
-  // Postprocess
-  bool canceled = true;
-
-  for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " postprocessing: i=" << i << " tid=" << tid << dendl;
-
-    std::optional<int64_t> tail_part_num;
-    std::optional<int64_t> head_part_num;
-    std::optional<int64_t> max_part_num;
-
-    std::unique_lock l(m);
-    auto objv = info.version;
-    if (new_tail > tail_part_num) tail_part_num = new_tail;
-    if (new_head > info.head_part_num) head_part_num = new_head;
-    if (new_max > info.max_push_part_num) max_part_num = new_max;
-    l.unlock();
-
-    if (processed.empty() &&
-       !tail_part_num &&
-       !max_part_num) {
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << " nothing to update any more: i=" << i << " tid="
-                    << tid << dendl;
-      canceled = false;
-      break;
-    }
-    auto u = fifo::update().tail_part_num(tail_part_num)
-      .head_part_num(head_part_num).max_push_part_num(max_part_num)
-      .journal_entries_rm(processed);
-    r = _update_meta(dpp, u, objv, &canceled, tid, y);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " _update_meta failed: update=" << u
-                << " r=" << r << " tid=" << tid << dendl;
-      break;
-    }
-
-    if (canceled) {
-      std::vector<fifo::journal_entry> new_processed;
-      std::unique_lock l(m);
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << " update canceled, retrying: i=" << i << " tid="
-                    << tid << dendl;
-      for (auto& e : processed) {
-       auto jiter = info.journal.find(e.part_num);
-       /* journal entry was already processed */
-       if (jiter == info.journal.end() ||
-           !(jiter->second == e)) {
-         continue;
-       }
-       new_processed.push_back(e);
-      }
-      processed = std::move(new_processed);
-    }
-  }
-  if (r == 0 && canceled) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " canceled too many times, giving up: tid=" << tid << dendl;
-    r = -ECANCELED;
-  }
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " failed, r=: " << r << " tid=" << tid << dendl;
-  }
-  return r;
-}
-
-int FIFO::_prepare_new_part(const DoutPrefixProvider *dpp, bool is_head, std::uint64_t tid, optional_yield y)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  std::unique_lock l(m);
-  std::vector jentries = { info.next_journal_entry(generate_tag()) };
-  if (info.journal.find(jentries.front().part_num) != info.journal.end()) {
-    l.unlock();
-    ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                 << " new part journaled, but not processed: tid="
-                 << tid << dendl;
-    auto r = process_journal(dpp, tid, y);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " process_journal failed: r=" << r << " tid=" << tid << dendl;
-    }
-    return r;
-  }
-  std::int64_t new_head_part_num = info.head_part_num;
-  auto version = info.version;
-
-  if (is_head) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " needs new head: tid=" << tid << dendl;
-    auto new_head_jentry = jentries.front();
-    new_head_jentry.op = fifo::journal_entry::Op::set_head;
-    new_head_part_num = jentries.front().part_num;
-    jentries.push_back(std::move(new_head_jentry));
-  }
-  l.unlock();
-
-  int r = 0;
-  bool canceled = true;
-  for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
-    canceled = false;
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " updating metadata: i=" << i << " tid=" << tid << dendl;
-    auto u = fifo::update{}.journal_entries_add(jentries);
-    r = _update_meta(dpp, u, version, &canceled, tid, y);
-    if (r >= 0 && canceled) {
-      std::unique_lock l(m);
-      auto found = (info.journal.find(jentries.front().part_num) !=
-                   info.journal.end());
-      if ((info.max_push_part_num >= jentries.front().part_num &&
-          info.head_part_num >= new_head_part_num)) {
-       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                      << " raced, but journaled and processed: i=" << i
-                      << " tid=" << tid << dendl;
-       return 0;
-      }
-      if (found) {
-       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                      << " raced, journaled but not processed: i=" << i
-                      << " tid=" << tid << dendl;
-       canceled = false;
-      }
-      l.unlock();
-    }
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " _update_meta failed: update=" << u << " r=" << r
-                << " tid=" << tid << dendl;
-      return r;
-    }
-  }
-  if (canceled) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " canceled too many times, giving up: tid=" << tid << dendl;
-    return -ECANCELED;
-  }
-  r = process_journal(dpp, tid, y);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " process_journal failed: r=" << r << " tid=" << tid << dendl;
-  }
-  return r;
-}
-
-int FIFO::_prepare_new_head(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  std::unique_lock l(m);
-  std::int64_t new_head_num = info.head_part_num + 1;
-  auto max_push_part_num = info.max_push_part_num;
-  auto version = info.version;
-  l.unlock();
-
-  int r = 0;
-  if (max_push_part_num < new_head_num) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " need new part: tid=" << tid << dendl;
-    r = _prepare_new_part(dpp, true, tid, y);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " _prepare_new_part failed: r=" << r
-                << " tid=" << tid << dendl;
-      return r;
-    }
-    std::unique_lock l(m);
-    if (info.max_push_part_num < new_head_num) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " inconsistency, push part less than head part: "
-                << " tid=" << tid << dendl;
-      return -EIO;
-    }
-    l.unlock();
-    return 0;
-  }
-
-  bool canceled = true;
-  for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " updating head: i=" << i << " tid=" << tid << dendl;
-    auto u = fifo::update{}.head_part_num(new_head_num);
-    r = _update_meta(dpp, u, version, &canceled, tid, y);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " _update_meta failed: update=" << u << " r=" << r
-                << " tid=" << tid << dendl;
-      return r;
-    }
-    std::unique_lock l(m);
-    auto head_part_num = info.head_part_num;
-    version = info.version;
-    l.unlock();
-    if (canceled && (head_part_num >= new_head_num)) {
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << " raced, but completed by the other caller: i=" << i
-                    << " tid=" << tid << dendl;
-      canceled = false;
-    }
-  }
-  if (canceled) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " canceled too many times, giving up: tid=" << tid << dendl;
-    return -ECANCELED;
-  }
-  return 0;
-}
-
// Completion handler driving the asynchronous "add journal entries"
// step of FIFO::_prepare_new_part().  If the metadata update raced
// with another writer (canceled set by _update_meta), re-reads the
// cached state and either treats the work as already done, retries the
// update, or falls through to processing the journal.  Retries are
// bounded by MAX_RACE_RETRIES.
struct NewPartPreparer : public Completion<NewPartPreparer> {
  FIFO* f;                                   // owning FIFO (not owned here)
  std::vector<fifo::journal_entry> jentries; // entries we are trying to journal
  int i = 0;                                 // race-retry counter
  std::int64_t new_head_part_num;
  bool canceled = false;                     // set by _update_meta on a version race
  uint64_t tid;                              // transaction id, for log correlation only

  NewPartPreparer(const DoutPrefixProvider *dpp, FIFO* f, lr::AioCompletion* super,
                  std::vector<fifo::journal_entry> jentries,
                  std::int64_t new_head_part_num,
                  std::uint64_t tid)
    : Completion(dpp, super), f(f), jentries(std::move(jentries)),
      new_head_part_num(new_head_part_num), tid(tid) {}

  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " entering: tid=" << tid << dendl;
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                         << " _update_meta failed:  r=" << r
                         << " tid=" << tid << dendl;
      complete(std::move(p), r);
      return;
    }

    if (canceled) {
      // Snapshot everything we need under the lock, then decide.
      std::unique_lock l(f->m);
      auto iter = f->info.journal.find(jentries.front().part_num);
      auto max_push_part_num = f->info.max_push_part_num;
      auto head_part_num = f->info.head_part_num;
      auto version = f->info.version;
      auto found = (iter != f->info.journal.end());
      l.unlock();
      if ((max_push_part_num >= jentries.front().part_num &&
           head_part_num >= new_head_part_num)) {
        // Another writer both journaled and processed our part.
        ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                           << " raced, but journaled and processed: i=" << i
                           << " tid=" << tid << dendl;
        complete(std::move(p), 0);
        return;
      }
      if (i >= MAX_RACE_RETRIES) {
        complete(std::move(p), -ECANCELED);
        return;
      }
      if (!found) {
        // Our entries are not in the journal yet: retry the update
        // against the refreshed version.
        ++i;
        f->_update_meta(dpp, fifo::update{}
                        .journal_entries_add(jentries),
                        version, &canceled, tid, call(std::move(p)));
        return;
      } else {
        ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                           << " raced, journaled but not processed: i=" << i
                           << " tid=" << tid << dendl;
        canceled = false;
      }
      // Fall through. We still need to process the journal.
    }
    f->process_journal(dpp, tid, super());
    return;
  }
};
-
// Asynchronously journal (and, via NewPartPreparer, create) the next
// part.  When is_head is true, an additional set_head journal entry is
// queued so the new part also becomes the head part.  If the part is
// already journaled we skip straight to processing the journal.
void FIFO::_prepare_new_part(const DoutPrefixProvider *dpp, bool is_head, std::uint64_t tid,
                             lr::AioCompletion* c)
{
  std::unique_lock l(m);
  std::vector jentries = { info.next_journal_entry(generate_tag()) };
  if (info.journal.find(jentries.front().part_num) != info.journal.end()) {
    l.unlock();
    ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
                      << " new part journaled, but not processed: tid="
                      << tid << dendl;
    process_journal(dpp, tid, c);
    return;
  }
  std::int64_t new_head_part_num = info.head_part_num;
  auto version = info.version;

  if (is_head) {
    // Duplicate the create entry as a set_head entry for the same part.
    auto new_head_jentry = jentries.front();
    new_head_jentry.op = fifo::journal_entry::Op::set_head;
    new_head_part_num = jentries.front().part_num;
    jentries.push_back(std::move(new_head_jentry));
  }
  l.unlock();

  auto n = std::make_unique<NewPartPreparer>(dpp, this, c, jentries,
                                             new_head_part_num, tid);
  auto np = n.get();
  // np remains valid below: ownership of n moves into the completion.
  _update_meta(dpp, fifo::update{}.journal_entries_add(jentries), version,
               &np->canceled, tid, NewPartPreparer::call(std::move(n)));
}
-
// Completion handler for FIFO::_prepare_new_head().  Operates in one
// of two modes, selected by 'newpart':
//   newpart == true  -> _prepare_new_part() was asked to create the
//                       part (and set the head); just sanity-check.
//   newpart == false -> a plain head-advancing _update_meta() was
//                       issued; retry on version races, bounded by
//                       MAX_RACE_RETRIES.
struct NewHeadPreparer : public Completion<NewHeadPreparer> {
  FIFO* f;
  int i = 0;                 // retry counter (update mode only)
  bool newpart;              // mode selector, see above
  std::int64_t new_head_num;
  bool canceled = false;     // set by _update_meta on a version race
  std::uint64_t tid;         // transaction id, for log correlation

  NewHeadPreparer(const DoutPrefixProvider *dpp, FIFO* f, lr::AioCompletion* super,
                  bool newpart, std::int64_t new_head_num, std::uint64_t tid)
    : Completion(dpp, super), f(f), newpart(newpart), new_head_num(new_head_num),
      tid(tid) {}

  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
    if (newpart)
      handle_newpart(std::move(p), r);
    else
      handle_update(dpp, std::move(p), r);
  }

  void handle_newpart(Ptr&& p, int r) {
    if (r < 0) {
      lderr(f->cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
                    << " _prepare_new_part failed: r=" << r
                    << " tid=" << tid << dendl;
      complete(std::move(p), r);
      return;
    }
    std::unique_lock l(f->m);
    if (f->info.max_push_part_num < new_head_num) {
      // The new part never became pushable; surface as an I/O error.
      l.unlock();
      lderr(f->cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
                    << " _prepare_new_part failed: r=" << r
                    << " tid=" << tid << dendl;
      complete(std::move(p), -EIO);
    } else {
      l.unlock();
      complete(std::move(p), 0);
    }
  }

  void handle_update(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
    std::unique_lock l(f->m);
    auto head_part_num = f->info.head_part_num;
    auto version = f->info.version;
    l.unlock();

    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                         << " _update_meta failed: r=" << r
                         << " tid=" << tid << dendl;
      complete(std::move(p), r);
      return;
    }
    if (canceled) {
      if (i >= MAX_RACE_RETRIES) {
        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                           << " canceled too many times, giving up: tid=" << tid << dendl;
        complete(std::move(p), -ECANCELED);
        return;
      }

      // Raced, but there's still work to do!
      if (head_part_num < new_head_num) {
        canceled = false;
        ++i;
        ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                           << " updating head: i=" << i << " tid=" << tid << dendl;
        f->_update_meta(dpp, fifo::update{}.head_part_num(new_head_num),
                        version, &this->canceled, tid, call(std::move(p)));
        return;
      }
      // Otherwise a racer already advanced the head far enough: done.
    }
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " succeeded : i=" << i << " tid=" << tid << dendl;
    complete(std::move(p), 0);
    return;
  }
};
-
// Asynchronously advance the head to the next part.  If that part has
// not been created yet, go through _prepare_new_part() (which also
// sets the head); otherwise a plain metadata update suffices.
void FIFO::_prepare_new_head(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c)
{
  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                     << " entering: tid=" << tid << dendl;
  std::unique_lock l(m);
  int64_t new_head_num = info.head_part_num + 1;
  auto max_push_part_num = info.max_push_part_num;
  auto version = info.version;
  l.unlock();

  if (max_push_part_num < new_head_num) {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " need new part: tid=" << tid << dendl;
    auto n = std::make_unique<NewHeadPreparer>(dpp, this, c, true, new_head_num,
                                               tid);
    _prepare_new_part(dpp, true, tid, NewHeadPreparer::call(std::move(n)));
  } else {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " updating head: tid=" << tid << dendl;
    auto n = std::make_unique<NewHeadPreparer>(dpp, this, c, false, new_head_num,
                                               tid);
    auto np = n.get();
    // np remains valid below: ownership of n moves into the completion.
    _update_meta(dpp, fifo::update{}.head_part_num(new_head_num), version,
                 &np->canceled, tid, NewHeadPreparer::call(std::move(n)));
  }
}
-
-int FIFO::push_entries(const DoutPrefixProvider *dpp, const std::deque<cb::list>& data_bufs,
-                      std::uint64_t tid, optional_yield y)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  std::unique_lock l(m);
-  auto head_part_num = info.head_part_num;
-  auto tag = info.head_tag;
-  const auto part_oid = info.part_oid(head_part_num);
-  l.unlock();
-
-  auto r = push_part(dpp, ioctx, part_oid, tag, data_bufs, tid, y);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " push_part failed: r=" << r << " tid=" << tid << dendl;
-  }
-  return r;
-}
-
-void FIFO::push_entries(const std::deque<cb::list>& data_bufs,
-                       std::uint64_t tid, lr::AioCompletion* c)
-{
-  std::unique_lock l(m);
-  auto head_part_num = info.head_part_num;
-  auto tag = info.head_tag;
-  const auto part_oid = info.part_oid(head_part_num);
-  l.unlock();
-
-  push_part(ioctx, part_oid, tag, data_bufs, tid, c);
-}
-
-int FIFO::trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
-                   std::optional<std::string_view> tag,
-                   bool exclusive, std::uint64_t tid,
-                   optional_yield y)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  lr::ObjectWriteOperation op;
-  std::unique_lock l(m);
-  const auto part_oid = info.part_oid(part_num);
-  l.unlock();
-  rgw::cls::fifo::trim_part(&op, tag, ofs, exclusive);
-  auto r = rgw_rados_operate(dpp, ioctx, part_oid, &op, y);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " trim_part failed: r=" << r << " tid=" << tid << dendl;
-  }
-  return 0;
-}
-
-void FIFO::trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
-                    std::optional<std::string_view> tag,
-                    bool exclusive, std::uint64_t tid,
-                    lr::AioCompletion* c)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  lr::ObjectWriteOperation op;
-  std::unique_lock l(m);
-  const auto part_oid = info.part_oid(part_num);
-  l.unlock();
-  rgw::cls::fifo::trim_part(&op, tag, ofs, exclusive);
-  auto r = ioctx.aio_operate(part_oid, c, &op);
-  ceph_assert(r >= 0);
-}
-
// Factory: open an existing FIFO at 'oid'.  Fetches the metadata,
// constructs the FIFO object, and replays any journal entries left by
// a writer that crashed mid-transaction.  On success *fifo owns the
// new instance.  When 'probe' is set, -ENOENT/-ENODATA are expected
// outcomes and are not logged as errors (they are still returned).
int FIFO::open(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, std::string oid, std::unique_ptr<FIFO>* fifo,
               optional_yield y, std::optional<fifo::objv> objv,
               bool probe)
{
  ldpp_dout(dpp, 20)
    << __PRETTY_FUNCTION__ << ":" << __LINE__
    << " entering" << dendl;
  fifo::info info;
  std::uint32_t size;
  std::uint32_t over;
  int r = get_meta(dpp, ioctx, std::move(oid), objv, &info, &size, &over, 0, y,
                   probe);
  if (r < 0) {
    if (!(probe && (r == -ENOENT || r == -ENODATA))) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                         << " get_meta failed: r=" << r << dendl;
    }
    return r;
  }
  // NOTE(review): 'oid' is used here after std::move(oid) above.  This
  // is only safe if get_meta's oid parameter binds by (const) reference
  // and does not actually move from it — confirm against get_meta's
  // signature.
  std::unique_ptr<FIFO> f(new FIFO(std::move(ioctx), oid));
  f->info = info;
  f->part_header_size = size;
  f->part_entry_overhead = over;
  // If there are journal entries, process them, in case
  // someone crashed mid-transaction.
  if (!info.journal.empty()) {
    ldpp_dout(dpp, 20)
      << __PRETTY_FUNCTION__ << ":" << __LINE__
      << " processing leftover journal" << dendl;
    r = f->process_journal(dpp, 0, y);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                         << " process_journal failed: r=" << r << dendl;
      return r;
    }
  }
  *fifo = std::move(f);
  return 0;
}
-
-int FIFO::create(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, std::string oid, std::unique_ptr<FIFO>* fifo,
-                optional_yield y, std::optional<fifo::objv> objv,
-                std::optional<std::string_view> oid_prefix,
-                bool exclusive, std::uint64_t max_part_size,
-                std::uint64_t max_entry_size)
-{
-  ldpp_dout(dpp, 20)
-    << __PRETTY_FUNCTION__ << ":" << __LINE__
-    << " entering" << dendl;
-  lr::ObjectWriteOperation op;
-  create_meta(&op, oid, objv, oid_prefix, exclusive, max_part_size,
-             max_entry_size);
-  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " create_meta failed: r=" << r << dendl;
-    return r;
-  }
-  r = open(dpp, std::move(ioctx), std::move(oid), fifo, y, objv);
-  return r;
-}
-
// Synchronously re-fetch the FIFO metadata from RADOS and install it
// in the cache.  tid is used only for log correlation.
int FIFO::read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y) {
  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                     << " entering: tid=" << tid << dendl;
  fifo::info _info;
  std::uint32_t _phs;
  std::uint32_t _peo;

  auto r = get_meta(dpp, ioctx, oid, std::nullopt, &_info, &_phs, &_peo, tid, y);
  if (r < 0) {
    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " get_meta failed: r=" << r << " tid=" << tid << dendl;
    return r;
  }
  std::unique_lock l(m);
  // Install the fetched copy only if it is at least as new as the
  // cached one; if we already hold a newer version, keep it.
  if (_info.version.same_or_later(this->info.version)) {
    info = std::move(_info);
    part_header_size = _phs;
    part_entry_overhead = _peo;
  }
  return 0;
}
-
-int FIFO::read_meta(const DoutPrefixProvider *dpp, optional_yield y) {
-  std::unique_lock l(m);
-  auto tid = ++next_tid;
-  l.unlock();
-  return read_meta(dpp, tid, y);
-}
-
// Completion for the asynchronous FIFO::read_meta(): decodes the
// GET_META reply out of bl and installs it into the owning FIFO if it
// is at least as new as the cached copy.
struct Reader : public Completion<Reader> {
  FIFO* fifo;
  cb::list bl;        // raw reply buffer filled by aio_exec
  std::uint64_t tid;  // transaction id, for log correlation
  Reader(const DoutPrefixProvider *dpp, FIFO* fifo, lr::AioCompletion* super, std::uint64_t tid)
    : Completion(dpp, super), fifo(fifo), tid(tid) {}

  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " entering: tid=" << tid << dendl;
    // Unusual but valid construct: the try-block is the 'then'
    // statement of the if, and the 'else' after the catch belongs to
    // the same if.  Decode failures land in the catch; r < 0 lands in
    // the else.
    if (r >= 0) try {
        fifo::op::get_meta_reply reply;
        auto iter = bl.cbegin();
        decode(reply, iter);
        std::unique_lock l(fifo->m);
        // Install only if not older than the cached version.
        if (reply.info.version.same_or_later(fifo->info.version)) {
          fifo->info = std::move(reply.info);
          fifo->part_header_size = reply.part_header_size;
          fifo->part_entry_overhead = reply.part_entry_overhead;
        }
      } catch (const cb::error& err) {
        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                           << " failed to decode response err=" << err.what()
                           << " tid=" << tid << dendl;
        r = from_error_code(err.code());
      } else {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                         << " read_meta failed r=" << r
                         << " tid=" << tid << dendl;
    }
    complete(std::move(p), r);
  }
};
-
-void FIFO::read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  lr::ObjectReadOperation op;
-  fifo::op::get_meta gm;
-  cb::list in;
-  encode(gm, in);
-  auto reader = std::make_unique<Reader>(dpp, this, c, tid);
-  auto rp = reader.get();
-  auto r = ioctx.aio_exec(oid, Reader::call(std::move(reader)), fifo::op::CLASS,
-                         fifo::op::GET_META, in, &rp->bl);
-  assert(r >= 0);
-}
-
// Return the cached metadata (no refresh; call read_meta() first if
// freshness matters).  NOTE(review): returns a reference to state that
// is elsewhere mutated under mutex m — confirm callers tolerate
// concurrent updates.
const fifo::info& FIFO::meta() const {
  return info;
}
-
// Return {part_header_size, part_entry_overhead} as cached when the
// FIFO was opened.
std::pair<std::uint32_t, std::uint32_t> FIFO::get_part_layout_info() const {
  return {part_header_size, part_entry_overhead};
}
-
-int FIFO::push(const DoutPrefixProvider *dpp, const cb::list& bl, optional_yield y) {
-  return push(dpp, std::vector{ bl }, y);
-}
-
-void FIFO::push(const DoutPrefixProvider *dpp, const cb::list& bl, lr::AioCompletion* c) {
-  push(dpp, std::vector{ bl }, c);
-}
-
// Synchronously push a vector of entries, batching them so each batch
// (payload plus per-entry overhead) fits in one part.  When the head
// part fills up (-ERANGE from push_entries) a new head is prepared and
// the batch is retried; races are bounded by MAX_RACE_RETRIES.
// Returns 0 on success, -E2BIG if any single entry exceeds
// max_entry_size, -ECANCELED after too many races, or another
// negative error.
int FIFO::push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs, optional_yield y)
{
  std::unique_lock l(m);
  auto tid = ++next_tid;
  auto max_entry_size = info.params.max_entry_size;
  auto need_new_head = info.need_new_head();
  l.unlock();
  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                     << " entering: tid=" << tid << dendl;
  if (data_bufs.empty()) {
    // Nothing to push: trivially successful.
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " empty push, returning success tid=" << tid << dendl;
    return 0;
  }

  // Validate sizes
  for (const auto& bl : data_bufs) {
    if (bl.length() > max_entry_size) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                         << " entry bigger than max_entry_size tid=" << tid << dendl;
      return -E2BIG;
    }
  }

  int r = 0;
  if (need_new_head) {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " need new head tid=" << tid << dendl;
    r = _prepare_new_head(dpp, tid, y);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                         << " _prepare_new_head failed: r=" << r
                         << " tid=" << tid << dendl;
      return r;
    }
  }

  std::deque<cb::list> remaining(data_bufs.begin(), data_bufs.end());
  std::deque<cb::list> batch;

  uint64_t batch_len = 0;
  auto retries = 0;
  // 'canceled' starts true so that exiting the loop purely on the
  // retry bound (without ever completing a push) reports -ECANCELED.
  bool canceled = true;
  while ((!remaining.empty() || !batch.empty()) &&
         (retries <= MAX_RACE_RETRIES)) {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " preparing push: remaining=" << remaining.size()
                       << " batch=" << batch.size() << " retries=" << retries
                       << " tid=" << tid << dendl;
    std::unique_lock l(m);
    auto max_part_size = info.params.max_part_size;
    auto overhead = part_entry_overhead;
    l.unlock();

    // Move entries from 'remaining' into 'batch' until the next entry
    // would overflow the part.
    while (!remaining.empty() &&
           (remaining.front().length() + batch_len <= max_part_size)) {
      /* We can send entries with data_len up to max_entry_size,
         however, we want to also account the overhead when
         dealing with multiple entries. Previous check doesn't
         account for overhead on purpose. */
      batch_len += remaining.front().length() + overhead;
      batch.push_back(std::move(remaining.front()));
      remaining.pop_front();
    }
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " prepared push: remaining=" << remaining.size()
                       << " batch=" << batch.size() << " retries=" << retries
                       << " batch_len=" << batch_len
                       << " tid=" << tid << dendl;

    auto r = push_entries(dpp, batch, tid, y);
    if (r == -ERANGE) {
      // Head part is full: make a new head and retry this batch.
      canceled = true;
      ++retries;
      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                         << " need new head tid=" << tid << dendl;
      r = _prepare_new_head(dpp, tid, y);
      if (r < 0) {
        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                           << " prepare_new_head failed: r=" << r
                           << " tid=" << tid << dendl;
        return r;
      }
      r = 0;
      continue;
    }
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                         << " push_entries failed: r=" << r
                         << " tid=" << tid << dendl;
      return r;
    }
    // Made forward progress!
    canceled = false;
    retries = 0;
    batch_len = 0;
    // r is the number of entries actually pushed; drop those from the
    // batch and recompute the remaining batch length.
    if (r == ssize(batch)) {
      batch.clear();
    } else  {
      batch.erase(batch.begin(), batch.begin() + r);
      for (const auto& b : batch) {
        batch_len +=  b.length() + part_entry_overhead;
      }
    }
  }
  if (canceled) {
    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " canceled too many times, giving up: tid=" << tid << dendl;
    return -ECANCELED;
  }
  return 0;
}
-
// Completion driving the asynchronous batched push (FIFO::push with an
// AioCompletion).  State machine:
//   prep_then_push() -> push() -> handle() -> (repeat | new_head() ->
//   handle() -> handle_new_head() -> push()/prep_then_push()) until
//   both queues drain or the retry bound is hit.
struct Pusher : public Completion<Pusher> {
  FIFO* f;
  std::deque<cb::list> remaining; // entries not yet batched
  std::deque<cb::list> batch;     // entries currently being pushed
  int i = 0;                      // race-retry counter for new-head races
  std::uint64_t tid;              // transaction id, for log correlation
  bool new_heading = false;       // true while a _prepare_new_head is in flight

  // Drop the first 'successes' entries (already pushed) from the
  // batch, refill the batch from 'remaining' up to the part size, and
  // issue the next push — or complete if everything is drained.
  void prep_then_push(const DoutPrefixProvider *dpp, Ptr&& p, const unsigned successes) {
    std::unique_lock l(f->m);
    auto max_part_size = f->info.params.max_part_size;
    auto part_entry_overhead = f->part_entry_overhead;
    l.unlock();

    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " preparing push: remaining=" << remaining.size()
                       << " batch=" << batch.size() << " i=" << i
                       << " tid=" << tid << dendl;

    uint64_t batch_len = 0;
    if (successes > 0) {
      if (successes == batch.size()) {
        batch.clear();
      } else  {
        batch.erase(batch.begin(), batch.begin() + successes);
        for (const auto& b : batch) {
          batch_len +=  b.length() + part_entry_overhead;
        }
      }
    }

    if (batch.empty() && remaining.empty()) {
      // All entries pushed: we're done.
      complete(std::move(p), 0);
      return;
    }

    while (!remaining.empty() &&
           (remaining.front().length() + batch_len <= max_part_size)) {

      /* We can send entries with data_len up to max_entry_size,
         however, we want to also account the overhead when
         dealing with multiple entries. Previous check doesn't
         account for overhead on purpose. */
      batch_len += remaining.front().length() + part_entry_overhead;
      batch.push_back(std::move(remaining.front()));
      remaining.pop_front();
    }
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " prepared push: remaining=" << remaining.size()
                       << " batch=" << batch.size() << " i=" << i
                       << " batch_len=" << batch_len
                       << " tid=" << tid << dendl;
    push(std::move(p));
  }

  // Issue the async push of the current batch; handle() is re-entered
  // on completion.
  void push(Ptr&& p) {
    f->push_entries(batch, tid, call(std::move(p)));
  }

  // Kick off an async head advance; handle() routes its completion to
  // handle_new_head().
  void new_head(const DoutPrefixProvider *dpp, Ptr&& p) {
    new_heading = true;
    f->_prepare_new_head(dpp, tid, call(std::move(p)));
  }

  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
    if (!new_heading) {
      // Completion of a push_entries() call.
      if (r == -ERANGE) {
        // Head part full: advance the head, then retry.
        ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                           << " need new head tid=" << tid << dendl;
        new_head(dpp, std::move(p));
        return;
      }
      if (r < 0) {
        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                           << " push_entries failed: r=" << r
                           << " tid=" << tid << dendl;
        complete(std::move(p), r);
        return;
      }
      i = 0; // We've made forward progress, so reset the race counter!
      prep_then_push(dpp, std::move(p), r);
    } else {
      // Completion of a _prepare_new_head() call.
      if (r < 0) {
        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                           << " prepare_new_head failed: r=" << r
                           << " tid=" << tid << dendl;
        complete(std::move(p), r);
        return;
      }
      new_heading = false;
      handle_new_head(dpp, std::move(p), r);
    }
  }

  // After a head advance: bound -ECANCELED races, then resume pushing
  // the pending batch (or refill it if it was fully consumed).
  void handle_new_head(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
    if (r == -ECANCELED) {
      if (p->i == MAX_RACE_RETRIES) {
        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                           << " canceled too many times, giving up: tid=" << tid << dendl;
        complete(std::move(p), -ECANCELED);
        return;
      }
      ++p->i;
    } else if (r) {
      complete(std::move(p), r);
      return;
    }

    if (p->batch.empty()) {
      prep_then_push(dpp, std::move(p), 0);
      return;
    } else {
      push(std::move(p));
      return;
    }
  }

  Pusher(const DoutPrefixProvider *dpp, FIFO* f, std::deque<cb::list>&& remaining,
         std::uint64_t tid, lr::AioCompletion* super)
    : Completion(dpp, super), f(f), remaining(std::move(remaining)),
      tid(tid) {}
};
-
// Asynchronously push a vector of entries; the Pusher completion
// drives batching, head advancement, and retries.  Validation
// (per-entry size limit, empty input) happens up front and completes
// the operation immediately.
void FIFO::push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs,
                lr::AioCompletion* c)
{
  std::unique_lock l(m);
  auto tid = ++next_tid;
  auto max_entry_size = info.params.max_entry_size;
  auto need_new_head = info.need_new_head();
  l.unlock();
  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                     << " entering: tid=" << tid << dendl;
  auto p = std::make_unique<Pusher>(dpp, this, std::deque<cb::list>(data_bufs.begin(), data_bufs.end()),
                                    tid, c);
  // Validate sizes
  for (const auto& bl : data_bufs) {
    if (bl.length() > max_entry_size) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                         << " entry bigger than max_entry_size tid=" << tid << dendl;
      Pusher::complete(std::move(p), -E2BIG);
      return;
    }
  }

  if (data_bufs.empty() ) {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " empty push, returning success tid=" << tid << dendl;
    Pusher::complete(std::move(p), 0);
    return;
  }

  if (need_new_head) {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
                       << " need new head tid=" << tid << dendl;
    // p->new_head(..., std::move(p)) is safe: the object expression of
    // a member call is sequenced before its arguments are evaluated
    // (C++17), so 'p' is read before being moved from.
    p->new_head(dpp, std::move(p));
  } else {
    p->prep_then_push(dpp, std::move(p), 0);
  }
}
-
-int FIFO::list(const DoutPrefixProvider *dpp, int max_entries,
-              std::optional<std::string_view> markstr,
-              std::vector<list_entry>* presult, bool* pmore,
-              optional_yield y)
-{
-  std::unique_lock l(m);
-  auto tid = ++next_tid;
-  std::int64_t part_num = info.tail_part_num;
-  l.unlock();
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  std::uint64_t ofs = 0;
-  if (markstr) {
-    auto marker = to_marker(*markstr);
-    if (!marker) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " invalid marker string: " << markstr
-                << " tid= "<< tid << dendl;
-      return -EINVAL;
-    }
-    part_num = marker->num;
-    ofs = marker->ofs;
-  }
-
-  std::vector<list_entry> result;
-  result.reserve(max_entries);
-  bool more = false;
-
-  std::vector<fifo::part_list_entry> entries;
-  int r = 0;
-  while (max_entries > 0) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " max_entries=" << max_entries << " tid=" << tid << dendl;
-    bool part_more = false;
-    bool part_full = false;
-
-    std::unique_lock l(m);
-    auto part_oid = info.part_oid(part_num);
-    l.unlock();
-
-    r = list_part(dpp, ioctx, part_oid, {}, ofs, max_entries, &entries,
-                 &part_more, &part_full, nullptr, tid, y);
-    if (r == -ENOENT) {
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << " missing part, rereading metadata"
-                    << " tid= "<< tid << dendl;
-      r = read_meta(dpp, tid, y);
-      if (r < 0) {
-       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " read_meta failed: r=" << r
-                  << " tid= "<< tid << dendl;
-       return r;
-      }
-      if (part_num < info.tail_part_num) {
-       /* raced with trim? restart */
-       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                      << " raced with trim, restarting: tid=" << tid << dendl;
-       max_entries += result.size();
-       result.clear();
-       std::unique_lock l(m);
-       part_num = info.tail_part_num;
-       l.unlock();
-       ofs = 0;
-       continue;
-      }
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << " assuming part was not written yet, so end of data: "
-                    << "tid=" << tid << dendl;
-      more = false;
-      r = 0;
-      break;
-    }
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " list_entries failed: r=" << r
-                << " tid= "<< tid << dendl;
-      return r;
-    }
-    more = part_full || part_more;
-    for (auto& entry : entries) {
-      list_entry e;
-      e.data = std::move(entry.data);
-      e.marker = marker{part_num, entry.ofs}.to_string();
-      e.mtime = entry.mtime;
-      result.push_back(std::move(e));
-      --max_entries;
-      if (max_entries == 0)
-       break;
-    }
-    entries.clear();
-    if (max_entries > 0 &&
-       part_more) {
-    }
-
-    if (!part_full) {
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << " head part is not full, so we can assume we're done: "
-                    << "tid=" << tid << dendl;
-      break;
-    }
-    if (!part_more) {
-      ++part_num;
-      ofs = 0;
-    }
-  }
-  if (presult)
-    *presult = std::move(result);
-  if (pmore)
-    *pmore =  more;
-  return 0;
-}
-
-int FIFO::trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, optional_yield y)
-{
-  bool overshoot = false;
-  auto marker = to_marker(markstr);
-  if (!marker) {
-    return -EINVAL;
-  }
-  auto part_num = marker->num;
-  auto ofs = marker->ofs;
-  std::unique_lock l(m);
-  auto tid = ++next_tid;
-  auto hn = info.head_part_num;
-  const auto max_part_size = info.params.max_part_size;
-  if (part_num > hn) {
-    l.unlock();
-    auto r = read_meta(dpp, tid, y);
-    if (r < 0) {
-      return r;
-    }
-    l.lock();
-    auto hn = info.head_part_num;
-    if (part_num > hn) {
-      overshoot = true;
-      part_num = hn;
-      ofs = max_part_size;
-    }
-  }
-  if (part_num < info.tail_part_num) {
-    return -ENODATA;
-  }
-  auto pn = info.tail_part_num;
-  l.unlock();
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-
-  int r = 0;
-  while (pn < part_num) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " pn=" << pn << " tid=" << tid << dendl;
-    std::unique_lock l(m);
-    l.unlock();
-    r = trim_part(dpp, pn, max_part_size, std::nullopt, false, tid, y);
-    if (r < 0 && r == -ENOENT) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " trim_part failed: r=" << r
-                << " tid= "<< tid << dendl;
-      return r;
-    }
-    ++pn;
-  }
-  r = trim_part(dpp, part_num, ofs, std::nullopt, exclusive, tid, y);
-  if (r < 0 && r != -ENOENT) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " trim_part failed: r=" << r
-              << " tid= "<< tid << dendl;
-    return r;
-  }
-
-  l.lock();
-  auto tail_part_num = info.tail_part_num;
-  auto objv = info.version;
-  l.unlock();
-  bool canceled = tail_part_num < part_num;
-  int retries = 0;
-  while ((tail_part_num < part_num) &&
-        canceled &&
-        (retries <= MAX_RACE_RETRIES)) {
-    r = _update_meta(dpp, fifo::update{}.tail_part_num(part_num), objv, &canceled,
-                    tid, y);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " _update_meta failed: r=" << r
-                << " tid= "<< tid << dendl;
-      return r;
-    }
-    if (canceled) {
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << " canceled: retries=" << retries
-                    << " tid=" << tid << dendl;
-      l.lock();
-      tail_part_num = info.tail_part_num;
-      objv = info.version;
-      l.unlock();
-      ++retries;
-    }
-  }
-  if (canceled) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " canceled too many times, giving up: tid=" << tid << dendl;
-    return -EIO;
-  }
-  return overshoot ? -ENODATA : 0;
-}
-
-struct Trimmer : public Completion<Trimmer> {
-  FIFO* fifo;
-  std::int64_t part_num;
-  std::uint64_t ofs;
-  std::int64_t pn;
-  bool exclusive;
-  std::uint64_t tid;
-  bool update = false;
-  bool reread = false;
-  bool canceled = false;
-  bool overshoot = false;
-  int retries = 0;
-
-  Trimmer(const DoutPrefixProvider *dpp, FIFO* fifo, std::int64_t part_num, std::uint64_t ofs, std::int64_t pn,
-         bool exclusive, lr::AioCompletion* super, std::uint64_t tid)
-    : Completion(dpp, super), fifo(fifo), part_num(part_num), ofs(ofs), pn(pn),
-      exclusive(exclusive), tid(tid) {}
-
-  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " entering: tid=" << tid << dendl;
-
-    if (reread) {
-      reread = false;
-      if (r < 0) {
-       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " read_meta failed: r="
-                  << r << " tid=" << tid << dendl;
-       complete(std::move(p), r);
-       return;
-      }
-      std::unique_lock l(fifo->m);
-      auto hn = fifo->info.head_part_num;
-      const auto max_part_size = fifo->info.params.max_part_size;
-      const auto tail_part_num = fifo->info.tail_part_num;
-      l.unlock();
-      if (part_num > hn) {
-       part_num = hn;
-       ofs = max_part_size;
-       overshoot = true;
-      }
-      if (part_num < tail_part_num) {
-       complete(std::move(p), -ENODATA);
-       return;
-      }
-      pn = tail_part_num;
-      if (pn < part_num) {
-       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                      << " pn=" << pn << " tid=" << tid << dendl;
-       fifo->trim_part(dpp, pn++, max_part_size, std::nullopt,
-                       false, tid, call(std::move(p)));
-      } else {
-       update = true;
-       canceled = tail_part_num < part_num;
-       fifo->trim_part(dpp, part_num, ofs, std::nullopt, exclusive, tid,
-                       call(std::move(p)));
-      }
-      return;
-    }
-
-    if (r == -ENOENT) {
-      r = 0;
-    }
-
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << (update ? " update_meta " : " trim ") << "failed: r="
-                << r << " tid=" << tid << dendl;
-      complete(std::move(p), r);
-      return;
-    }
-
-    if (!update) {
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << " handling preceding trim callback: tid=" << tid << dendl;
-      retries = 0;
-      if (pn < part_num) {
-       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                      << " pn=" << pn << " tid=" << tid << dendl;
-       std::unique_lock l(fifo->m);
-       const auto max_part_size = fifo->info.params.max_part_size;
-       l.unlock();
-       fifo->trim_part(dpp, pn++, max_part_size, std::nullopt,
-                       false, tid, call(std::move(p)));
-       return;
-      }
-
-      std::unique_lock l(fifo->m);
-      const auto tail_part_num = fifo->info.tail_part_num;
-      l.unlock();
-      update = true;
-      canceled = tail_part_num < part_num;
-      fifo->trim_part(dpp, part_num, ofs, std::nullopt, exclusive, tid,
-                     call(std::move(p)));
-      return;
-    }
-
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " handling update-needed callback: tid=" << tid << dendl;
-    std::unique_lock l(fifo->m);
-    auto tail_part_num = fifo->info.tail_part_num;
-    auto objv = fifo->info.version;
-    l.unlock();
-    if ((tail_part_num < part_num) &&
-       canceled) {
-      if (retries > MAX_RACE_RETRIES) {
-       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " canceled too many times, giving up: tid=" << tid << dendl;
-       complete(std::move(p), -EIO);
-       return;
-      }
-      ++retries;
-      fifo->_update_meta(dpp, fifo::update{}
-                        .tail_part_num(part_num), objv, &canceled,
-                         tid, call(std::move(p)));
-    } else {
-      complete(std::move(p), overshoot ? -ENODATA : 0);
-    }
-  }
-};
-
-void FIFO::trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive,
-               lr::AioCompletion* c) {
-  auto marker = to_marker(markstr);
-  auto realmark = marker.value_or(::rgw::cls::fifo::marker{});
-  std::unique_lock l(m);
-  const auto hn = info.head_part_num;
-  const auto max_part_size = info.params.max_part_size;
-  const auto pn = info.tail_part_num;
-  const auto part_oid = info.part_oid(pn);
-  auto tid = ++next_tid;
-  l.unlock();
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " entering: tid=" << tid << dendl;
-  auto trimmer = std::make_unique<Trimmer>(dpp, this, realmark.num, realmark.ofs,
-                                          pn, exclusive, c, tid);
-  if (!marker) {
-    Trimmer::complete(std::move(trimmer), -EINVAL);
-    return;
-  }
-  ++trimmer->pn;
-  auto ofs = marker->ofs;
-  if (marker->num > hn) {
-    trimmer->reread = true;
-    read_meta(dpp, tid, Trimmer::call(std::move(trimmer)));
-    return;
-  }
-  if (pn < marker->num) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << " pn=" << pn << " tid=" << tid << dendl;
-    ofs = max_part_size;
-  } else {
-    trimmer->update = true;
-  }
-  trim_part(dpp, pn, ofs, std::nullopt, exclusive,
-           tid, Trimmer::call(std::move(trimmer)));
-}
-
-int FIFO::get_part_info(const DoutPrefixProvider *dpp, int64_t part_num,
-                       fifo::part_header* header,
-                       optional_yield y)
-{
-  std::unique_lock l(m);
-  const auto part_oid = info.part_oid(part_num);
-  auto tid = ++next_tid;
-  l.unlock();
-  auto r = rgw::cls::fifo::get_part_info(dpp, ioctx, part_oid, header, tid, y);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " get_part_info failed: r="
-              << r << " tid=" << tid << dendl;
-  }
-  return r;
-}
-
-void FIFO::get_part_info(int64_t part_num,
-                        fifo::part_header* header,
-                        lr::AioCompletion* c)
-{
-  std::unique_lock l(m);
-  const auto part_oid = info.part_oid(part_num);
-  auto tid = ++next_tid;
-  l.unlock();
-  auto op = rgw::cls::fifo::get_part_info(cct, header, tid);
-  auto r = ioctx.aio_operate(part_oid, c, &op, nullptr);
-  ceph_assert(r >= 0);
-}
-
-struct InfoGetter : Completion<InfoGetter> {
-  FIFO* fifo;
-  fifo::part_header header;
-  fu2::function<void(int r, fifo::part_header&&)> f;
-  std::uint64_t tid;
-  bool headerread = false;
-
-  InfoGetter(const DoutPrefixProvider *dpp, FIFO* fifo, fu2::function<void(int r, fifo::part_header&&)> f,
-            std::uint64_t tid, lr::AioCompletion* super)
-    : Completion(dpp, super), fifo(fifo), f(std::move(f)), tid(tid) {}
-  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
-    if (!headerread) {
-      if (r < 0) {
-       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " read_meta failed: r="
-                        << r << " tid=" << tid << dendl;
-       if (f)
-         f(r, {});
-       complete(std::move(p), r);
-       return;
-      }
-
-      auto info = fifo->meta();
-      auto hpn = info.head_part_num;
-      if (hpn < 0) {
-       ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                            << " no head, returning empty partinfo r="
-                            << r << " tid=" << tid << dendl;
-       if (f)
-         f(0, {});
-       complete(std::move(p), r);
-       return;
-      }
-      headerread = true;
-      auto op = rgw::cls::fifo::get_part_info(fifo->cct, &header, tid);
-      std::unique_lock l(fifo->m);
-      auto oid = fifo->info.part_oid(hpn);
-      l.unlock();
-      r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op,
-                                 nullptr);
-      ceph_assert(r >= 0);
-      return;
-    }
-
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                      << " get_part_info failed: r="
-                      << r << " tid=" << tid << dendl;
-    }
-
-    if (f)
-      f(r, std::move(header));
-    complete(std::move(p), r);
-    return;
-  }
-};
-
-void FIFO::get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function<void(int r,
-                                                  fifo::part_header&&)> f,
-                        lr::AioCompletion* c)
-{
-  std::unique_lock l(m);
-  auto tid = ++next_tid;
-  l.unlock();
-  auto ig = std::make_unique<InfoGetter>(dpp, this, std::move(f), tid, c);
-  read_meta(dpp, tid, InfoGetter::call(std::move(ig)));
-}
-
-struct JournalProcessor : public Completion<JournalProcessor> {
-private:
-  FIFO* const fifo;
-
-  std::vector<fifo::journal_entry> processed;
-  std::multimap<std::int64_t, fifo::journal_entry> journal;
-  std::multimap<std::int64_t, fifo::journal_entry>::iterator iter;
-  std::int64_t new_tail;
-  std::int64_t new_head;
-  std::int64_t new_max;
-  int race_retries = 0;
-  bool first_pp = true;
-  bool canceled = false;
-  std::uint64_t tid;
-
-  enum {
-    entry_callback,
-    pp_callback,
-  } state;
-
-  void create_part(const DoutPrefixProvider *dpp, Ptr&& p, int64_t part_num,
-                  std::string_view tag) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " entering: tid=" << tid << dendl;
-    state = entry_callback;
-    lr::ObjectWriteOperation op;
-    op.create(false); /* We don't need exclusivity, part_init ensures
-                        we're creating from the  same journal entry. */
-    std::unique_lock l(fifo->m);
-    part_init(&op, tag, fifo->info.params);
-    auto oid = fifo->info.part_oid(part_num);
-    l.unlock();
-    auto r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op);
-    ceph_assert(r >= 0);
-    return;
-  }
-
-  void remove_part(const DoutPrefixProvider *dpp, Ptr&& p, int64_t part_num,
-                  std::string_view tag) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " entering: tid=" << tid << dendl;
-    state = entry_callback;
-    lr::ObjectWriteOperation op;
-    op.remove();
-    std::unique_lock l(fifo->m);
-    auto oid = fifo->info.part_oid(part_num);
-    l.unlock();
-    auto r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op);
-    ceph_assert(r >= 0);
-    return;
-  }
-
-  void finish_je(const DoutPrefixProvider *dpp, Ptr&& p, int r,
-                const fifo::journal_entry& entry) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " entering: tid=" << tid << dendl;
-
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " finishing entry: entry=" << entry
-                        << " tid=" << tid << dendl;
-
-    if (entry.op == fifo::journal_entry::Op::remove && r == -ENOENT)
-      r = 0;
-
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                      << " processing entry failed: entry=" << entry
-                      << " r=" << r << " tid=" << tid << dendl;
-      complete(std::move(p), r);
-      return;
-    } else {
-      switch (entry.op) {
-      case fifo::journal_entry::Op::unknown:
-      case fifo::journal_entry::Op::set_head:
-       // Can't happen. Filtered out in process.
-       complete(std::move(p), -EIO);
-       return;
-
-      case fifo::journal_entry::Op::create:
-       if (entry.part_num > new_max) {
-         new_max = entry.part_num;
-       }
-       break;
-      case fifo::journal_entry::Op::remove:
-       if (entry.part_num >= new_tail) {
-         new_tail = entry.part_num + 1;
-       }
-       break;
-      }
-      processed.push_back(entry);
-    }
-    ++iter;
-    process(dpp, std::move(p));
-  }
-
-  void postprocess(const DoutPrefixProvider *dpp, Ptr&& p) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " entering: tid=" << tid << dendl;
-    if (processed.empty()) {
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                          << " nothing to update any more: race_retries="
-                          << race_retries << " tid=" << tid << dendl;
-      complete(std::move(p), 0);
-      return;
-    }
-    pp_run(dpp, std::move(p), 0, false);
-  }
-
-public:
-
-  JournalProcessor(const DoutPrefixProvider *dpp, FIFO* fifo, std::uint64_t tid, lr::AioCompletion* super)
-    : Completion(dpp, super), fifo(fifo), tid(tid) {
-    std::unique_lock l(fifo->m);
-    journal = fifo->info.journal;
-    iter = journal.begin();
-    new_tail = fifo->info.tail_part_num;
-    new_head = fifo->info.head_part_num;
-    new_max = fifo->info.max_push_part_num;
-  }
-
-  void pp_run(const DoutPrefixProvider *dpp, Ptr&& p, int r, bool canceled) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " entering: tid=" << tid << dendl;
-    std::optional<int64_t> tail_part_num;
-    std::optional<int64_t> head_part_num;
-    std::optional<int64_t> max_part_num;
-
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                      << " failed, r=: " << r << " tid=" << tid << dendl;
-      complete(std::move(p), r);
-    }
-
-
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " postprocessing: race_retries="
-                        << race_retries << " tid=" << tid << dendl;
-
-    if (!first_pp && r == 0 && !canceled) {
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                          << " nothing to update any more: race_retries="
-                          << race_retries << " tid=" << tid << dendl;
-      complete(std::move(p), 0);
-      return;
-    }
-
-    first_pp = false;
-
-    if (canceled) {
-      if (race_retries >= MAX_RACE_RETRIES) {
-       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " canceled too many times, giving up: tid="
-                        << tid << dendl;
-       complete(std::move(p), -ECANCELED);
-       return;
-      }
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                          << " update canceled, retrying: race_retries="
-                          << race_retries << " tid=" << tid << dendl;
-
-      ++race_retries;
-
-      std::vector<fifo::journal_entry> new_processed;
-      std::unique_lock l(fifo->m);
-      for (auto& e : processed) {
-       auto jiter = fifo->info.journal.find(e.part_num);
-       /* journal entry was already processed */
-       if (jiter == fifo->info.journal.end() ||
-           !(jiter->second == e)) {
-         continue;
-       }
-       new_processed.push_back(e);
-      }
-      processed = std::move(new_processed);
-    }
-
-    std::unique_lock l(fifo->m);
-    auto objv = fifo->info.version;
-    if (new_tail > fifo->info.tail_part_num) {
-      tail_part_num = new_tail;
-    }
-
-    if (new_head > fifo->info.head_part_num) {
-      head_part_num = new_head;
-    }
-
-    if (new_max > fifo->info.max_push_part_num) {
-      max_part_num = new_max;
-    }
-    l.unlock();
-
-    if (processed.empty() &&
-       !tail_part_num &&
-       !max_part_num) {
-      /* nothing to update anymore */
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                          << " nothing to update any more: race_retries="
-                          << race_retries << " tid=" << tid << dendl;
-      complete(std::move(p), 0);
-      return;
-    }
-    state = pp_callback;
-    fifo->_update_meta(dpp, fifo::update{}
-                      .tail_part_num(tail_part_num)
-                      .head_part_num(head_part_num)
-                      .max_push_part_num(max_part_num)
-                      .journal_entries_rm(processed),
-                       objv, &this->canceled, tid, call(std::move(p)));
-    return;
-  }
-
-  JournalProcessor(const JournalProcessor&) = delete;
-  JournalProcessor& operator =(const JournalProcessor&) = delete;
-  JournalProcessor(JournalProcessor&&) = delete;
-  JournalProcessor& operator =(JournalProcessor&&) = delete;
-
-  void process(const DoutPrefixProvider *dpp, Ptr&& p) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " entering: tid=" << tid << dendl;
-    while (iter != journal.end()) {
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                          << " processing entry: entry=" << *iter
-                          << " tid=" << tid << dendl;
-      const auto entry = iter->second;
-      switch (entry.op) {
-      case fifo::journal_entry::Op::create:
-       create_part(dpp, std::move(p), entry.part_num, entry.part_tag);
-       return;
-      case fifo::journal_entry::Op::set_head:
-       if (entry.part_num > new_head) {
-         new_head = entry.part_num;
-       }
-       processed.push_back(entry);
-       ++iter;
-       continue;
-      case fifo::journal_entry::Op::remove:
-       remove_part(dpp, std::move(p), entry.part_num, entry.part_tag);
-       return;
-      default:
-       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " unknown journaled op: entry=" << entry << " tid="
-                        << tid << dendl;
-       complete(std::move(p), -EIO);
-       return;
-      }
-    }
-    postprocess(dpp, std::move(p));
-    return;
-  }
-
-  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << " entering: tid=" << tid << dendl;
-    switch (state) {
-    case entry_callback:
-      finish_je(dpp, std::move(p), r, iter->second);
-      return;
-    case pp_callback:
-      auto c = canceled;
-      canceled = false;
-      pp_run(dpp, std::move(p), r, c);
-      return;
-    }
-
-    abort();
-  }
-
-};
-
-void FIFO::process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c) {
-  auto p = std::make_unique<JournalProcessor>(dpp, this, tid, c);
-  p->process(dpp, std::move(p));
-}
-
-struct Lister : Completion<Lister> {
-  FIFO* f;
-  std::vector<list_entry> result;
-  bool more = false;
-  std::int64_t part_num;
-  std::uint64_t ofs;
-  int max_entries;
-  int r_out = 0;
-  std::vector<fifo::part_list_entry> entries;
-  bool part_more = false;
-  bool part_full = false;
-  std::vector<list_entry>* entries_out;
-  bool* more_out;
-  std::uint64_t tid;
-
-  bool read = false;
-
-  void complete(Ptr&& p, int r) {
-    if (r >= 0) {
-      if (more_out) *more_out = more;
-      if (entries_out) *entries_out = std::move(result);
-    }
-    Completion::complete(std::move(p), r);
-  }
-
-public:
-  Lister(const DoutPrefixProvider *dpp, FIFO* f, std::int64_t part_num, std::uint64_t ofs, int max_entries,
-        std::vector<list_entry>* entries_out, bool* more_out,
-        std::uint64_t tid, lr::AioCompletion* super)
-    : Completion(dpp, super), f(f), part_num(part_num), ofs(ofs), max_entries(max_entries),
-      entries_out(entries_out), more_out(more_out), tid(tid) {
-    result.reserve(max_entries);
-  }
-
-  Lister(const Lister&) = delete;
-  Lister& operator =(const Lister&) = delete;
-  Lister(Lister&&) = delete;
-  Lister& operator =(Lister&&) = delete;
-
-  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
-    if (read)
-      handle_read(std::move(p), r);
-    else
-      handle_list(dpp, std::move(p), r);
-  }
-
-  void list(Ptr&& p) {
-    if (max_entries > 0) {
-      part_more = false;
-      part_full = false;
-      entries.clear();
-
-      std::unique_lock l(f->m);
-      auto part_oid = f->info.part_oid(part_num);
-      l.unlock();
-
-      read = false;
-      auto op = list_part(f->cct, {}, ofs, max_entries, &r_out,
-                         &entries, &part_more, &part_full,
-                         nullptr, tid);
-      f->ioctx.aio_operate(part_oid, call(std::move(p)), &op, nullptr);
-    } else {
-      complete(std::move(p), 0);
-    }
-  }
-
-  void handle_read(Ptr&& p, int r) {
-    read = false;
-    if (r >= 0) r = r_out;
-    r_out = 0;
-
-    if (r < 0) {
-      complete(std::move(p), r);
-      return;
-    }
-
-    if (part_num < f->info.tail_part_num) {
-      /* raced with trim? restart */
-      max_entries += result.size();
-      result.clear();
-      part_num = f->info.tail_part_num;
-      ofs = 0;
-      list(std::move(p));
-      return;
-    }
-    /* assuming part was not written yet, so end of data */
-    more = false;
-    complete(std::move(p), 0);
-    return;
-  }
-
-  void handle_list(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
-    if (r >= 0) r = r_out;
-    r_out = 0;
-    std::unique_lock l(f->m);
-    auto part_oid = f->info.part_oid(part_num);
-    l.unlock();
-    if (r == -ENOENT) {
-      read = true;
-      f->read_meta(dpp, tid, call(std::move(p)));
-      return;
-    }
-    if (r < 0) {
-      complete(std::move(p), r);
-      return;
-    }
-
-    more = part_full || part_more;
-    for (auto& entry : entries) {
-      list_entry e;
-      e.data = std::move(entry.data);
-      e.marker = marker{part_num, entry.ofs}.to_string();
-      e.mtime = entry.mtime;
-      result.push_back(std::move(e));
-    }
-    max_entries -= entries.size();
-    entries.clear();
-    if (max_entries > 0 && part_more) {
-      list(std::move(p));
-      return;
-    }
-
-    if (!part_full) { /* head part is not full */
-      complete(std::move(p), 0);
-      return;
-    }
-    ++part_num;
-    ofs = 0;
-    list(std::move(p));
-  }
-};
-
-void FIFO::list(const DoutPrefixProvider *dpp, int max_entries,
-               std::optional<std::string_view> markstr,
-               std::vector<list_entry>* out,
-               bool* more,
-               lr::AioCompletion* c) {
-  std::unique_lock l(m);
-  auto tid = ++next_tid;
-  std::int64_t part_num = info.tail_part_num;
-  l.unlock();
-  std::uint64_t ofs = 0;
-  std::optional<::rgw::cls::fifo::marker> marker;
-
-  if (markstr) {
-    marker = to_marker(*markstr);
-    if (marker) {
-      part_num = marker->num;
-      ofs = marker->ofs;
-    }
-  }
-
-  auto ls = std::make_unique<Lister>(dpp, this, part_num, ofs, max_entries, out,
-                                    more, tid, c);
-  if (markstr && !marker) {
-    auto l = ls.get();
-    l->complete(std::move(ls), -EINVAL);
-  } else {
-    ls->list(std::move(ls));
-  }
-}
-}
diff --git a/src/rgw/store/rados/cls_fifo_legacy.h b/src/rgw/store/rados/cls_fifo_legacy.h
deleted file mode 100644 (file)
index 9a35e4d..0000000
+++ /dev/null
@@ -1,342 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2020 Red Hat <contact@redhat.com>
- * Author: Adam C. Emerson
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#ifndef CEPH_RGW_CLS_FIFO_LEGACY_H
-#define CEPH_RGW_CLS_FIFO_LEGACY_H
-
-#include <cstdint>
-#include <deque>
-#include <map>
-#include <memory>
-#include <mutex>
-#include <optional>
-#include <string_view>
-#include <vector>
-
-#undef FMT_HEADER_ONLY
-#define FMT_HEADER_ONLY 1
-#include <fmt/format.h>
-
-#include "include/rados/librados.hpp"
-#include "include/buffer.h"
-#include "include/function2.hpp"
-
-#include "common/async/yield_context.h"
-
-#include "cls/fifo/cls_fifo_types.h"
-#include "cls/fifo/cls_fifo_ops.h"
-
-#include "librados/AioCompletionImpl.h"
-
-#include "rgw_tools.h"
-
-namespace rgw::cls::fifo {
-namespace cb = ceph::buffer;
-namespace fifo = rados::cls::fifo;
-namespace lr = librados;
-
-inline constexpr std::uint64_t default_max_part_size = 4 * 1024 * 1024;
-inline constexpr std::uint64_t default_max_entry_size = 32 * 1024;
-
-void create_meta(lr::ObjectWriteOperation* op, std::string_view id,
-                std::optional<fifo::objv> objv,
-                std::optional<std::string_view> oid_prefix,
-                bool exclusive = false,
-                std::uint64_t max_part_size = default_max_part_size,
-                std::uint64_t max_entry_size = default_max_entry_size);
-int get_meta(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
-            std::optional<fifo::objv> objv, fifo::info* info,
-            std::uint32_t* part_header_size,
-            std::uint32_t* part_entry_overhead,
-            std::uint64_t tid, optional_yield y,
-            bool probe = false);
-struct marker {
-  std::int64_t num = 0;
-  std::uint64_t ofs = 0;
-
-  marker() = default;
-  marker(std::int64_t num, std::uint64_t ofs) : num(num), ofs(ofs) {}
-  static marker max() {
-    return { std::numeric_limits<decltype(num)>::max(),
-            std::numeric_limits<decltype(ofs)>::max() };
-  }
-
-  std::string to_string() {
-    return fmt::format("{:0>20}:{:0>20}", num, ofs);
-  }
-};
-
-struct list_entry {
-  cb::list data;
-  std::string marker;
-  ceph::real_time mtime;
-};
-
-using part_info = fifo::part_header;
-
-/// This is an implementation of FIFO using librados to facilitate
-/// backports. Please see /src/neorados/cls/fifo.h for full
-/// information.
-///
-/// This library uses optional_yield. Please see
-/// /src/common/async/yield_context.h. In summary, optional_yield
-/// contains either a spawn::yield_context (in which case the current
-/// coroutine is suspended until completion) or null_yield (in which
-/// case the current thread is blocked until completion.)
-///
-/// Please see the librados documentation for information on
-/// AioCompletion and IoCtx.
-
-class FIFO {
-  friend struct Reader;
-  friend struct Updater;
-  friend struct Trimmer;
-  friend struct InfoGetter;
-  friend struct Pusher;
-  friend struct NewPartPreparer;
-  friend struct NewHeadPreparer;
-  friend struct JournalProcessor;
-  friend struct Lister;
-
-  mutable lr::IoCtx ioctx;
-  CephContext* cct = static_cast<CephContext*>(ioctx.cct());
-  const std::string oid;
-  std::mutex m;
-  std::uint64_t next_tid = 0;
-
-  fifo::info info;
-
-  std::uint32_t part_header_size = 0xdeadbeef;
-  std::uint32_t part_entry_overhead = 0xdeadbeef;
-
-  std::optional<marker> to_marker(std::string_view s);
-
-  FIFO(lr::IoCtx&& ioc,
-       std::string oid)
-    : ioctx(std::move(ioc)), oid(oid) {}
-
-  std::string generate_tag() const;
-
-  int apply_update(const DoutPrefixProvider *dpp,
-                   fifo::info* info,
-                  const fifo::objv& objv,
-                  const fifo::update& update,
-                  std::uint64_t tid);
-  int _update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
-                  fifo::objv version, bool* pcanceled,
-                  std::uint64_t tid, optional_yield y);
-  void _update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
-                   fifo::objv version, bool* pcanceled,
-                   std::uint64_t tid, lr::AioCompletion* c);
-  int create_part(const DoutPrefixProvider *dpp, int64_t part_num, std::string_view tag, std::uint64_t tid,
-                 optional_yield y);
-  int remove_part(const DoutPrefixProvider *dpp, int64_t part_num, std::string_view tag, std::uint64_t tid,
-                 optional_yield y);
-  int process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y);
-  void process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c);
-  int _prepare_new_part(const DoutPrefixProvider *dpp, bool is_head, std::uint64_t tid, optional_yield y);
-  void _prepare_new_part(const DoutPrefixProvider *dpp, bool is_head, std::uint64_t tid, lr::AioCompletion* c);
-  int _prepare_new_head(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y);
-  void _prepare_new_head(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c);
-  int push_entries(const DoutPrefixProvider *dpp, const std::deque<cb::list>& data_bufs,
-                  std::uint64_t tid, optional_yield y);
-  void push_entries(const std::deque<cb::list>& data_bufs,
-                   std::uint64_t tid, lr::AioCompletion* c);
-  int trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
-               std::optional<std::string_view> tag, bool exclusive,
-               std::uint64_t tid, optional_yield y);
-  void trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
-                std::optional<std::string_view> tag, bool exclusive,
-                std::uint64_t tid, lr::AioCompletion* c);
-
-  /// Force refresh of metadata, yielding/blocking style
-  int read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y);
-  /// Force refresh of metadata, with a librados Completion
-  void read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c);
-
-public:
-
-  FIFO(const FIFO&) = delete;
-  FIFO& operator =(const FIFO&) = delete;
-  FIFO(FIFO&&) = delete;
-  FIFO& operator =(FIFO&&) = delete;
-
-  /// Open an existing FIFO.
-  static int open(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, //< IO Context
-                 std::string oid, //< OID for metadata object
-                 std::unique_ptr<FIFO>* fifo, //< OUT: Pointer to FIFO object
-                 optional_yield y, //< Optional yield context
-                 /// Operation will fail if FIFO is not at this version
-                 std::optional<fifo::objv> objv = std::nullopt,
-                 /// Probing for existence, don't print errors if we
-                 /// can't find it.
-                 bool probe = false);
-  /// Create a new or open an existing FIFO.
-  static int create(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, //< IO Context
-                   std::string oid, //< OID for metadata object
-                   std::unique_ptr<FIFO>* fifo, //< OUT: Pointer to FIFO object
-                   optional_yield y, //< Optional yield context
-                   /// Operation will fail if the FIFO exists and is
-                   /// not of this version.
-                   std::optional<fifo::objv> objv = std::nullopt,
-                   /// Prefix for all objects
-                   std::optional<std::string_view> oid_prefix = std::nullopt,
-                   /// Fail if the FIFO already exists
-                   bool exclusive = false,
-                   /// Maximum allowed size of parts
-                   std::uint64_t max_part_size = default_max_part_size,
-                   /// Maximum allowed size of entries
-                   std::uint64_t max_entry_size = default_max_entry_size);
-
-  /// Force refresh of metadata, yielding/blocking style
-  int read_meta(const DoutPrefixProvider *dpp, optional_yield y);
-  /// Get currently known metadata
-  const fifo::info& meta() const;
-  /// Get partition header and entry overhead size
-  std::pair<std::uint32_t, std::uint32_t> get_part_layout_info() const;
-  /// Push an entry to the FIFO
-  int push(const DoutPrefixProvider *dpp, 
-           const cb::list& bl, //< Entry to push
-          optional_yield y //< Optional yield
-    );
-  /// Push an entry to the FIFO
-  void push(const DoutPrefixProvider *dpp, const cb::list& bl, //< Entry to push
-           lr::AioCompletion* c //< Async Completion
-    );
-  /// Push entries to the FIFO
-  int push(const DoutPrefixProvider *dpp, 
-           const std::vector<cb::list>& data_bufs, //< Entries to push
-          optional_yield y //< Optional yield
-    );
-  /// Push entries to the FIFO
-  void push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs, //< Entries to push
-           lr::AioCompletion* c //< Async Completion
-    );
-  /// List entries
-  int list(const DoutPrefixProvider *dpp, 
-           int max_entries, //< Maximum entries to list
-          /// Point after which to begin listing. Start at tail if null
-          std::optional<std::string_view> markstr,
-          std::vector<list_entry>* out, //< OUT: entries
-          /// OUT: True if more entries in FIFO beyond the last returned
-          bool* more,
-          optional_yield y //< Optional yield
-    );
-  void list(const DoutPrefixProvider *dpp, 
-            int max_entries, //< Maximum entries to list
-           /// Point after which to begin listing. Start at tail if null
-           std::optional<std::string_view> markstr,
-           std::vector<list_entry>* out, //< OUT: entries
-           /// OUT: True if more entries in FIFO beyond the last returned
-           bool* more,
-           lr::AioCompletion* c //< Async Completion
-    );
-  /// Trim entries, coroutine/block style
-  int trim(const DoutPrefixProvider *dpp, 
-           std::string_view markstr, //< Position to which to trim, inclusive
-          bool exclusive, //< If true, do not trim the target entry
-                          //< itself, just all those before it.
-          optional_yield y //< Optional yield
-    );
-  /// Trim entries, librados AioCompletion style
-  void trim(const DoutPrefixProvider *dpp, 
-            std::string_view markstr, //< Position to which to trim, inclusive
-           bool exclusive, //< If true, do not trim the target entry
-                           //< itself, just all those before it.
-           lr::AioCompletion* c //< librados AIO Completion
-    );
-  /// Get part info
-  int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, /// Part number
-                   fifo::part_header* header, //< OUT: Information
-                   optional_yield y //< Optional yield
-    );
-  /// Get part info
-  void get_part_info(int64_t part_num, //< Part number
-                   fifo::part_header* header, //< OUT: Information
-                   lr::AioCompletion* c //< AIO Completion
-    );
-  /// A convenience method to fetch the part information for the FIFO
-  /// head, using librados::AioCompletion, since
-  /// libradio::AioCompletions compose lousily.
-  void get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function< //< Function to receive info
-                      void(int r, fifo::part_header&&)>,
-                    lr::AioCompletion* c //< AIO Completion
-    );
-};
-
-template<typename T>
-struct Completion {
-private:
-  const DoutPrefixProvider *_dpp;
-  lr::AioCompletion* _cur = nullptr;
-  lr::AioCompletion* _super;
-public:
-
-  using Ptr = std::unique_ptr<T>;
-
-  lr::AioCompletion* cur() const {
-    return _cur;
-  }
-  lr::AioCompletion* super() const {
-    return _super;
-  }
-
-  Completion(const DoutPrefixProvider *dpp, lr::AioCompletion* super) : _dpp(dpp), _super(super) {
-    super->pc->get();
-  }
-
-  ~Completion() {
-    if (_super) {
-      _super->pc->put();
-    }
-    if (_cur)
-      _cur->release();
-    _super = nullptr;
-    _cur = nullptr;
-  }
-
-  // The only times that aio_operate can return an error are:
-  // 1. The completion contains a null pointer. This should just
-  //    crash, and in our case it does.
-  // 2. An attempt is made to write to a snapshot. RGW doesn't use
-  //    snapshots, so we don't care.
-  //
-  // So we will just assert that initiating an Aio operation succeeds
-  // and not worry about recovering.
-  static lr::AioCompletion* call(Ptr&& p) {
-    p->_cur = lr::Rados::aio_create_completion(static_cast<void*>(p.get()),
-                                              &cb);
-    auto c = p->_cur;
-    p.release();
-    return c;
-  }
-  static void complete(Ptr&& p, int r) {
-    auto c = p->_super;
-    p->_super = nullptr;
-    rgw_complete_aio_completion(c, r);
-  }
-
-  static void cb(lr::completion_t, void* arg) {
-    auto t = static_cast<T*>(arg);
-    auto r = t->_cur->get_return_value();
-    t->_cur->release();
-    t->_cur = nullptr;
-    t->handle(t->_dpp, Ptr(t), r);
-  }
-};
-
-}
-
-#endif // CEPH_RGW_CLS_FIFO_LEGACY_H
diff --git a/src/rgw/store/rados/config/impl.cc b/src/rgw/store/rados/config/impl.cc
deleted file mode 100644 (file)
index f1b2bef..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "impl.h"
-
-#include "common/async/yield_context.h"
-#include "common/errno.h"
-#include "rgw_string.h"
-#include "rgw_zone.h"
-
-namespace rgw::rados {
-
-// default pool names
-constexpr std::string_view default_zone_root_pool = "rgw.root";
-constexpr std::string_view default_zonegroup_root_pool = "rgw.root";
-constexpr std::string_view default_realm_root_pool = "rgw.root";
-constexpr std::string_view default_period_root_pool = "rgw.root";
-
-static rgw_pool default_pool(std::string_view name,
-                             std::string_view default_name)
-{
-  return std::string{name_or_default(name, default_name)};
-}
-
-ConfigImpl::ConfigImpl(const ceph::common::ConfigProxy& conf)
-  : realm_pool(default_pool(conf->rgw_realm_root_pool,
-                            default_realm_root_pool)),
-    period_pool(default_pool(conf->rgw_period_root_pool,
-                             default_period_root_pool)),
-    zonegroup_pool(default_pool(conf->rgw_zonegroup_root_pool,
-                                default_zonegroup_root_pool)),
-    zone_pool(default_pool(conf->rgw_zone_root_pool,
-                           default_zone_root_pool))
-{
-}
-
-int ConfigImpl::read(const DoutPrefixProvider* dpp, optional_yield y,
-                     const rgw_pool& pool, const std::string& oid,
-                     bufferlist& bl, RGWObjVersionTracker* objv)
-{
-  librados::IoCtx ioctx;
-  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
-  if (r < 0) {
-    return r;
-  }
-  librados::ObjectReadOperation op;
-  if (objv) {
-    objv->prepare_op_for_read(&op);
-  }
-  op.read(0, 0, &bl, nullptr);
-  return rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
-}
-
-int ConfigImpl::write(const DoutPrefixProvider* dpp, optional_yield y,
-                      const rgw_pool& pool, const std::string& oid,
-                      Create create, const bufferlist& bl,
-                      RGWObjVersionTracker* objv)
-{
-  librados::IoCtx ioctx;
-  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
-  if (r < 0) {
-    return r;
-  }
-
-  librados::ObjectWriteOperation op;
-  switch (create) {
-    case Create::MustNotExist: op.create(true); break;
-    case Create::MayExist: op.create(false); break;
-    case Create::MustExist: op.assert_exists(); break;
-  }
-  if (objv) {
-    objv->prepare_op_for_write(&op);
-  }
-  op.write_full(bl);
-
-  r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
-  if (r >= 0 && objv) {
-    objv->apply_write();
-  }
-  return r;
-}
-
-int ConfigImpl::remove(const DoutPrefixProvider* dpp, optional_yield y,
-                       const rgw_pool& pool, const std::string& oid,
-                       RGWObjVersionTracker* objv)
-{
-  librados::IoCtx ioctx;
-  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
-  if (r < 0) {
-    return r;
-  }
-
-  librados::ObjectWriteOperation op;
-  if (objv) {
-    objv->prepare_op_for_write(&op);
-  }
-  op.remove();
-
-  r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
-  if (r >= 0 && objv) {
-    objv->apply_write();
-  }
-  return r;
-}
-
-int ConfigImpl::notify(const DoutPrefixProvider* dpp, optional_yield y,
-                       const rgw_pool& pool, const std::string& oid,
-                       bufferlist& bl, uint64_t timeout_ms)
-{
-  librados::IoCtx ioctx;
-  int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
-  if (r < 0) {
-    return r;
-  }
-  return rgw_rados_notify(dpp, ioctx, oid, bl, timeout_ms, nullptr, y);
-}
-
-} // namespace rgw::rados
diff --git a/src/rgw/store/rados/config/impl.h b/src/rgw/store/rados/config/impl.h
deleted file mode 100644 (file)
index 3aed451..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include "include/rados/librados.hpp"
-#include "common/dout.h"
-#include "rgw_basic_types.h"
-#include "rgw_tools.h"
-#include "rgw_sal_config.h"
-
-namespace rgw::rados {
-
-// write options that control object creation
-enum class Create {
-  MustNotExist, // fail with EEXIST if the object already exists
-  MayExist, // create if the object didn't exist, overwrite if it did
-  MustExist, // fail with ENOENT if the object doesn't exist
-};
-
-struct ConfigImpl {
-  librados::Rados rados;
-
-  const rgw_pool realm_pool;
-  const rgw_pool period_pool;
-  const rgw_pool zonegroup_pool;
-  const rgw_pool zone_pool;
-
-  ConfigImpl(const ceph::common::ConfigProxy& conf);
-
-  int read(const DoutPrefixProvider* dpp, optional_yield y,
-           const rgw_pool& pool, const std::string& oid,
-           bufferlist& bl, RGWObjVersionTracker* objv);
-
-  template <typename T>
-  int read(const DoutPrefixProvider* dpp, optional_yield y,
-           const rgw_pool& pool, const std::string& oid,
-           T& data, RGWObjVersionTracker* objv)
-  {
-    bufferlist bl;
-    int r = read(dpp, y, pool, oid, bl, objv);
-    if (r < 0) {
-      return r;
-    }
-    try {
-      auto p = bl.cbegin();
-      decode(data, p);
-    } catch (const buffer::error& err) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from "
-          << pool << ":" << oid << dendl;
-      return -EIO;
-    }
-    return 0;
-  }
-
-  int write(const DoutPrefixProvider* dpp, optional_yield y,
-            const rgw_pool& pool, const std::string& oid, Create create,
-            const bufferlist& bl, RGWObjVersionTracker* objv);
-
-  template <typename T>
-  int write(const DoutPrefixProvider* dpp, optional_yield y,
-            const rgw_pool& pool, const std::string& oid, Create create,
-            const T& data, RGWObjVersionTracker* objv)
-  {
-    bufferlist bl;
-    encode(data, bl);
-
-    return write(dpp, y, pool, oid, create, bl, objv);
-  }
-
-  int remove(const DoutPrefixProvider* dpp, optional_yield y,
-             const rgw_pool& pool, const std::string& oid,
-             RGWObjVersionTracker* objv);
-
-  int list(const DoutPrefixProvider* dpp, optional_yield y,
-           const rgw_pool& pool, const std::string& marker,
-           std::regular_invocable<std::string> auto filter,
-           std::span<std::string> entries,
-           sal::ListResult<std::string>& result)
-  {
-    librados::IoCtx ioctx;
-    int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
-    if (r < 0) {
-      return r;
-    }
-    librados::ObjectCursor oc;
-    if (!oc.from_str(marker)) {
-      ldpp_dout(dpp, 10) << "failed to parse cursor: " << marker << dendl;
-      return -EINVAL;
-    }
-    std::size_t count = 0;
-    try {
-      auto iter = ioctx.nobjects_begin(oc);
-      const auto end = ioctx.nobjects_end();
-      for (; count < entries.size() && iter != end; ++iter) {
-        std::string entry = filter(iter->get_oid());
-        if (!entry.empty()) {
-          entries[count++] = std::move(entry);
-        }
-      }
-      if (iter == end) {
-        result.next.clear();
-      } else {
-        result.next = iter.get_cursor().to_str();
-      }
-    } catch (const std::exception& e) {
-      ldpp_dout(dpp, 10) << "NObjectIterator exception " << e.what() << dendl;
-      return -EIO;
-    }
-    result.entries = entries.first(count);
-    return 0;
-  }
-
-  int notify(const DoutPrefixProvider* dpp, optional_yield y,
-             const rgw_pool& pool, const std::string& oid,
-             bufferlist& bl, uint64_t timeout_ms);
-};
-
-inline std::string_view name_or_default(std::string_view name,
-                                        std::string_view default_name)
-{
-  if (!name.empty()) {
-    return name;
-  }
-  return default_name;
-}
-
-} // namespace rgw::rados
diff --git a/src/rgw/store/rados/config/period.cc b/src/rgw/store/rados/config/period.cc
deleted file mode 100644 (file)
index 04650ce..0000000
+++ /dev/null
@@ -1,230 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "common/dout.h"
-#include "common/errno.h"
-#include "rgw_zone.h"
-#include "store/rados/config/store.h"
-
-#include "impl.h"
-
-namespace rgw::rados {
-
-// period oids
-constexpr std::string_view period_info_oid_prefix = "periods.";
-constexpr std::string_view period_latest_epoch_info_oid = ".latest_epoch";
-constexpr std::string_view period_staging_suffix = ":staging";
-
-static std::string period_oid(std::string_view period_id, uint32_t epoch)
-{
-  // omit the epoch for the staging period
-  if (period_id.ends_with(period_staging_suffix)) {
-    return string_cat_reserve(period_info_oid_prefix, period_id);
-  }
-  return fmt::format("{}{}.{}", period_info_oid_prefix, period_id, epoch);
-}
-
-static std::string latest_epoch_oid(const ceph::common::ConfigProxy& conf,
-                                    std::string_view period_id)
-{
-  return string_cat_reserve(
-      period_info_oid_prefix, period_id,
-      name_or_default(conf->rgw_period_latest_epoch_info_oid,
-                      period_latest_epoch_info_oid));
-}
-
-static int read_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
-                             ConfigImpl* impl, std::string_view period_id,
-                             uint32_t& epoch, RGWObjVersionTracker* objv)
-{
-  const auto& pool = impl->period_pool;
-  const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
-  RGWPeriodLatestEpochInfo latest;
-  int r = impl->read(dpp, y, pool, latest_oid, latest, objv);
-  if (r >= 0) {
-    epoch = latest.epoch;
-  }
-  return r;
-}
-
-static int write_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
-                              ConfigImpl* impl, bool exclusive,
-                              std::string_view period_id, uint32_t epoch,
-                              RGWObjVersionTracker* objv)
-{
-  const auto& pool = impl->period_pool;
-  const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
-  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
-  RGWPeriodLatestEpochInfo latest{epoch};
-  return impl->write(dpp, y, pool, latest_oid, create, latest, objv);
-}
-
-static int delete_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
-                               ConfigImpl* impl, std::string_view period_id,
-                               RGWObjVersionTracker* objv)
-{
-  const auto& pool = impl->period_pool;
-  const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
-  return impl->remove(dpp, y, pool, latest_oid, objv);
-}
-
-static int update_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
-                               ConfigImpl* impl, std::string_view period_id,
-                               uint32_t epoch)
-{
-  static constexpr int MAX_RETRIES = 20;
-
-  for (int i = 0; i < MAX_RETRIES; i++) {
-    uint32_t existing_epoch = 0;
-    RGWObjVersionTracker objv;
-    bool exclusive = false;
-
-    // read existing epoch
-    int r = read_latest_epoch(dpp, y, impl, period_id, existing_epoch, &objv);
-    if (r == -ENOENT) {
-      // use an exclusive create to set the epoch atomically
-      exclusive = true;
-      objv.generate_new_write_ver(dpp->get_cct());
-      ldpp_dout(dpp, 20) << "creating initial latest_epoch=" << epoch
-          << " for period=" << period_id << dendl;
-    } else if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to read latest_epoch" << dendl;
-      return r;
-    } else if (epoch <= existing_epoch) {
-      r = -EEXIST; // fail with EEXIST if epoch is not newer
-      ldpp_dout(dpp, 10) << "found existing latest_epoch " << existing_epoch
-          << " >= given epoch " << epoch << ", returning r=" << r << dendl;
-      return r;
-    } else {
-      ldpp_dout(dpp, 20) << "updating latest_epoch from " << existing_epoch
-          << " -> " << epoch << " on period=" << period_id << dendl;
-    }
-
-    r = write_latest_epoch(dpp, y, impl, exclusive, period_id, epoch, &objv);
-    if (r == -EEXIST) {
-      continue; // exclusive create raced with another update, retry
-    } else if (r == -ECANCELED) {
-      continue; // write raced with a conflicting version, retry
-    }
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to write latest_epoch" << dendl;
-      return r;
-    }
-    return 0; // return success
-  }
-
-  return -ECANCELED; // fail after max retries
-}
-
-int RadosConfigStore::create_period(const DoutPrefixProvider* dpp,
-                                    optional_yield y, bool exclusive,
-                                    const RGWPeriod& info)
-{
-  if (info.get_id().empty()) {
-    ldpp_dout(dpp, 0) << "period cannot have an empty id" << dendl;
-    return -EINVAL;
-  }
-  if (info.get_epoch() == 0) {
-    ldpp_dout(dpp, 0) << "period cannot have an empty epoch" << dendl;
-    return -EINVAL;
-  }
-  const auto& pool = impl->period_pool;
-  const auto info_oid = period_oid(info.get_id(), info.get_epoch());
-  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
-  RGWObjVersionTracker objv;
-  objv.generate_new_write_ver(dpp->get_cct());
-  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  (void) update_latest_epoch(dpp, y, impl.get(), info.get_id(), info.get_epoch());
-  return 0;
-}
-
-int RadosConfigStore::read_period(const DoutPrefixProvider* dpp,
-                                  optional_yield y,
-                                  std::string_view period_id,
-                                  std::optional<uint32_t> epoch,
-                                  RGWPeriod& info)
-{
-  int r = 0;
-  if (!epoch) {
-    epoch = 0;
-    r = read_latest_epoch(dpp, y, impl.get(), period_id, *epoch, nullptr);
-    if (r < 0) {
-      return r;
-    }
-  }
-
-  const auto& pool = impl->period_pool;
-  const auto info_oid = period_oid(period_id, *epoch);
-  return impl->read(dpp, y, pool, info_oid, info, nullptr);
-}
-
-int RadosConfigStore::delete_period(const DoutPrefixProvider* dpp,
-                                    optional_yield y,
-                                    std::string_view period_id)
-{
-  const auto& pool = impl->period_pool;
-
-  // read the latest_epoch
-  uint32_t latest_epoch = 0;
-  RGWObjVersionTracker latest_objv;
-  int r = read_latest_epoch(dpp, y, impl.get(), period_id,
-                            latest_epoch, &latest_objv);
-  if (r < 0 && r != -ENOENT) { // just delete epoch=0 on ENOENT
-    ldpp_dout(dpp, 0) << "failed to read latest epoch for period "
-        << period_id << ": " << cpp_strerror(r) << dendl;
-    return r;
-  }
-
-  for (uint32_t epoch = 0; epoch <= latest_epoch; epoch++) {
-    const auto info_oid = period_oid(period_id, epoch);
-    r = impl->remove(dpp, y, pool, info_oid, nullptr);
-    if (r < 0 && r != -ENOENT) { // ignore ENOENT
-      ldpp_dout(dpp, 0) << "failed to delete period " << info_oid
-          << ": " << cpp_strerror(r) << dendl;
-      return r;
-    }
-  }
-
-  return delete_latest_epoch(dpp, y, impl.get(), period_id, &latest_objv);
-}
-
-int RadosConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
-                                      optional_yield y,
-                                      const std::string& marker,
-                                      std::span<std::string> entries,
-                                      sal::ListResult<std::string>& result)
-{
-  const auto& pool = impl->period_pool;
-  constexpr auto prefix = [] (std::string oid) -> std::string {
-      if (!oid.starts_with(period_info_oid_prefix)) {
-        return {};
-      }
-      if (!oid.ends_with(period_latest_epoch_info_oid)) {
-        return {};
-      }
-      // trim the prefix and suffix
-      const std::size_t count = oid.size() -
-          period_info_oid_prefix.size() -
-          period_latest_epoch_info_oid.size();
-      return oid.substr(period_info_oid_prefix.size(), count);
-    };
-
-  return impl->list(dpp, y, pool, marker, prefix, entries, result);
-}
-
-} // namespace rgw::rados
diff --git a/src/rgw/store/rados/config/period_config.cc b/src/rgw/store/rados/config/period_config.cc
deleted file mode 100644 (file)
index b17a48a..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "rgw_zone.h"
-#include "store/rados/config/store.h"
-
-#include "impl.h"
-
-namespace rgw::rados {
-
-// period config oids
-constexpr std::string_view period_config_prefix = "period_config.";
-constexpr std::string_view period_config_realm_default = "default";
-
-std::string period_config_oid(std::string_view realm_id)
-{
-  if (realm_id.empty()) {
-    realm_id = period_config_realm_default;
-  }
-  return string_cat_reserve(period_config_prefix, realm_id);
-}
-
-int RadosConfigStore::read_period_config(const DoutPrefixProvider* dpp,
-                                         optional_yield y,
-                                         std::string_view realm_id,
-                                         RGWPeriodConfig& info)
-{
-  const auto& pool = impl->period_pool;
-  const auto oid = period_config_oid(realm_id);
-  return impl->read(dpp, y, pool, oid, info, nullptr);
-}
-
-int RadosConfigStore::write_period_config(const DoutPrefixProvider* dpp,
-                                          optional_yield y, bool exclusive,
-                                          std::string_view realm_id,
-                                          const RGWPeriodConfig& info)
-{
-  const auto& pool = impl->period_pool;
-  const auto oid = period_config_oid(realm_id);
-  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
-  return impl->write(dpp, y, pool, oid, create, info, nullptr);
-}
-
-} // namespace rgw::rados
diff --git a/src/rgw/store/rados/config/realm.cc b/src/rgw/store/rados/config/realm.cc
deleted file mode 100644 (file)
index f62cb7a..0000000
+++ /dev/null
@@ -1,364 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "common/dout.h"
-#include "common/errno.h"
-#include "rgw_realm_watcher.h"
-#include "rgw_zone.h"
-#include "store/rados/config/store.h"
-
-#include "impl.h"
-
-namespace rgw::rados {
-
-// realm oids
-constexpr std::string_view realm_names_oid_prefix = "realms_names.";
-constexpr std::string_view realm_info_oid_prefix = "realms.";
-constexpr std::string_view realm_control_oid_suffix = ".control";
-constexpr std::string_view default_realm_info_oid = "default.realm";
-
-static std::string realm_info_oid(std::string_view realm_id)
-{
-  return string_cat_reserve(realm_info_oid_prefix, realm_id);
-}
-static std::string realm_name_oid(std::string_view realm_id)
-{
-  return string_cat_reserve(realm_names_oid_prefix, realm_id);
-}
-static std::string realm_control_oid(std::string_view realm_id)
-{
-  return string_cat_reserve(realm_info_oid_prefix, realm_id,
-                            realm_control_oid_suffix);
-}
-static std::string default_realm_oid(const ceph::common::ConfigProxy& conf)
-{
-  return std::string{name_or_default(conf->rgw_default_realm_info_oid,
-                                     default_realm_info_oid)};
-}
-
-
-int RadosConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
-                                             optional_yield y, bool exclusive,
-                                             std::string_view realm_id)
-{
-  const auto& pool = impl->realm_pool;
-  const auto oid = default_realm_oid(dpp->get_cct()->_conf);
-  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
-
-  RGWDefaultSystemMetaObjInfo default_info;
-  default_info.default_id = realm_id;
-
-  return impl->write(dpp, y, pool, oid, create, default_info, nullptr);
-}
-
-int RadosConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
-                                            optional_yield y,
-                                            std::string& realm_id)
-{
-  const auto& pool = impl->realm_pool;
-  const auto oid = default_realm_oid(dpp->get_cct()->_conf);
-
-  RGWDefaultSystemMetaObjInfo default_info;
-  int r = impl->read(dpp, y, pool, oid, default_info, nullptr);
-  if (r >= 0) {
-    realm_id = default_info.default_id;
-  }
-  return r;
-}
-
-int RadosConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
-                                              optional_yield y)
-{
-  const auto& pool = impl->realm_pool;
-  const auto oid = default_realm_oid(dpp->get_cct()->_conf);
-
-  return impl->remove(dpp, y, pool, oid, nullptr);
-}
-
-
-class RadosRealmWriter : public sal::RealmWriter {
-  ConfigImpl* impl;
-  RGWObjVersionTracker objv;
-  std::string realm_id;
-  std::string realm_name;
- public:
-  RadosRealmWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
-                   std::string_view realm_id, std::string_view realm_name)
-    : impl(impl), objv(std::move(objv)),
-      realm_id(realm_id), realm_name(realm_name)
-  {
-  }
-
-  int write(const DoutPrefixProvider* dpp, optional_yield y,
-            const RGWRealm& info) override
-  {
-    if (realm_id != info.get_id() || realm_name != info.get_name()) {
-      return -EINVAL; // can't modify realm id or name directly
-    }
-
-    const auto& pool = impl->realm_pool;
-    const auto info_oid = realm_info_oid(info.get_id());
-    return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
-  }
-
-  int rename(const DoutPrefixProvider* dpp, optional_yield y,
-             RGWRealm& info, std::string_view new_name) override
-  {
-    if (realm_id != info.get_id() || realm_name != info.get_name()) {
-      return -EINVAL; // can't modify realm id or name directly
-    }
-    if (new_name.empty()) {
-      ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
-      return -EINVAL;
-    }
-
-    const auto& pool = impl->realm_pool;
-    const auto name = RGWNameToId{info.get_id()};
-    const auto info_oid = realm_info_oid(info.get_id());
-    const auto old_oid = realm_name_oid(info.get_name());
-    const auto new_oid = realm_name_oid(new_name);
-
-    // link the new name
-    RGWObjVersionTracker new_objv;
-    new_objv.generate_new_write_ver(dpp->get_cct());
-    int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
-                        name, &new_objv);
-    if (r < 0) {
-      return r;
-    }
-
-    // write the info with updated name
-    info.set_name(std::string{new_name});
-    r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
-    if (r < 0) {
-      // on failure, unlink the new name
-      (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
-      return r;
-    }
-
-    // unlink the old name
-    (void) impl->remove(dpp, y, pool, old_oid, nullptr);
-
-    realm_name = new_name;
-    return 0;
-  }
-
-  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
-  {
-    const auto& pool = impl->realm_pool;
-    const auto info_oid = realm_info_oid(realm_id);
-    int r = impl->remove(dpp, y, pool, info_oid, &objv);
-    if (r < 0) {
-      return r;
-    }
-    const auto name_oid = realm_name_oid(realm_name);
-    (void) impl->remove(dpp, y, pool, name_oid, nullptr);
-    const auto control_oid = realm_control_oid(realm_id);
-    (void) impl->remove(dpp, y, pool, control_oid, nullptr);
-    return 0;
-  }
-}; // RadosRealmWriter
-
-
-int RadosConfigStore::create_realm(const DoutPrefixProvider* dpp,
-                                   optional_yield y, bool exclusive,
-                                   const RGWRealm& info,
-                                   std::unique_ptr<sal::RealmWriter>* writer)
-{
-  if (info.get_id().empty()) {
-    ldpp_dout(dpp, 0) << "realm cannot have an empty id" << dendl;
-    return -EINVAL;
-  }
-  if (info.get_name().empty()) {
-    ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
-    return -EINVAL;
-  }
-
-  const auto& pool = impl->realm_pool;
-  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
-
-  // write the realm info
-  const auto info_oid = realm_info_oid(info.get_id());
-  RGWObjVersionTracker objv;
-  objv.generate_new_write_ver(dpp->get_cct());
-
-  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  // write the realm name
-  const auto name_oid = realm_name_oid(info.get_name());
-  const auto name = RGWNameToId{info.get_id()};
-  RGWObjVersionTracker name_objv;
-  name_objv.generate_new_write_ver(dpp->get_cct());
-
-  r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
-  if (r < 0) {
-    (void) impl->remove(dpp, y, pool, info_oid, &objv);
-    return r;
-  }
-
-  // create control object for watch/notify
-  const auto control_oid = realm_control_oid(info.get_id());
-  bufferlist empty_bl;
-  r = impl->write(dpp, y, pool, control_oid, Create::MayExist,
-                  empty_bl, nullptr);
-  if (r < 0) {
-    (void) impl->remove(dpp, y, pool, name_oid, &name_objv);
-    (void) impl->remove(dpp, y, pool, info_oid, &objv);
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosRealmWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
-                                       optional_yield y,
-                                       std::string_view realm_id,
-                                       RGWRealm& info,
-                                       std::unique_ptr<sal::RealmWriter>* writer)
-{
-  const auto& pool = impl->realm_pool;
-  const auto info_oid = realm_info_oid(realm_id);
-  RGWObjVersionTracker objv;
-  int r = impl->read(dpp, y, pool, info_oid, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosRealmWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
-                                         optional_yield y,
-                                         std::string_view realm_name,
-                                         RGWRealm& info,
-                                         std::unique_ptr<sal::RealmWriter>* writer)
-{
-  const auto& pool = impl->realm_pool;
-
-  // look up realm id by name
-  RGWNameToId name;
-  const auto name_oid = realm_name_oid(realm_name);
-  int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
-  if (r < 0) {
-    return r;
-  }
-
-  const auto info_oid = realm_info_oid(name.obj_id);
-  RGWObjVersionTracker objv;
-  r = impl->read(dpp, y, pool, info_oid, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosRealmWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
-                                         optional_yield y,
-                                         RGWRealm& info,
-                                         std::unique_ptr<sal::RealmWriter>* writer)
-{
-  const auto& pool = impl->realm_pool;
-
-  // read default realm id
-  RGWDefaultSystemMetaObjInfo default_info;
-  const auto default_oid = default_realm_oid(dpp->get_cct()->_conf);
-  int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
-  if (r < 0) {
-    return r;
-  }
-
-  const auto info_oid = realm_info_oid(default_info.default_id);
-  RGWObjVersionTracker objv;
-  r = impl->read(dpp, y, pool, info_oid, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosRealmWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
-                                    optional_yield y,
-                                    std::string_view realm_name,
-                                    std::string& realm_id)
-{
-  const auto& pool = impl->realm_pool;
-  RGWNameToId name;
-
-  // look up realm id by name
-  const auto name_oid = realm_name_oid(realm_name);
-  int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
-  if (r < 0) {
-    return r;
-  }
-  realm_id = std::move(name.obj_id);
-  return 0;
-}
-
-int RadosConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
-                                              optional_yield y,
-                                              const RGWPeriod& period)
-{
-  const auto& pool = impl->realm_pool;
-  const auto control_oid = realm_control_oid(period.get_realm());
-
-  bufferlist bl;
-  using ceph::encode;
-  // push the period to dependent zonegroups/zones
-  encode(RGWRealmNotify::ZonesNeedPeriod, bl);
-  encode(period, bl);
-  // reload the gateway with the new period
-  encode(RGWRealmNotify::Reload, bl);
-
-  constexpr uint64_t timeout_ms = 0;
-  return impl->notify(dpp, y, pool, control_oid, bl, timeout_ms);
-}
-
-int RadosConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
-                                       optional_yield y,
-                                       const std::string& marker,
-                                       std::span<std::string> entries,
-                                       sal::ListResult<std::string>& result)
-{
-  const auto& pool = impl->realm_pool;
-  constexpr auto prefix = [] (std::string oid) -> std::string {
-      if (!oid.starts_with(realm_names_oid_prefix)) {
-        return {};
-      }
-      return oid.substr(realm_names_oid_prefix.size());
-    };
-  return impl->list(dpp, y, pool, marker, prefix, entries, result);
-}
-
-} // namespace rgw::rados
diff --git a/src/rgw/store/rados/config/store.cc b/src/rgw/store/rados/config/store.cc
deleted file mode 100644 (file)
index ec2b034..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "include/rados/librados.hpp"
-#include "common/errno.h"
-#include "impl.h"
-#include "store.h"
-
-namespace rgw::rados {
-
-RadosConfigStore::RadosConfigStore(std::unique_ptr<ConfigImpl> impl)
-  : impl(std::move(impl))
-{
-}
-
-RadosConfigStore::~RadosConfigStore() = default;
-
-
-auto create_config_store(const DoutPrefixProvider* dpp)
-    -> std::unique_ptr<RadosConfigStore>
-{
-  auto impl = std::make_unique<ConfigImpl>(dpp->get_cct()->_conf);
-
-  // initialize a Rados client
-  int r = impl->rados.init_with_context(dpp->get_cct());
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "Rados client initialization failed with "
-        << cpp_strerror(-r) << dendl;
-    return nullptr;
-  }
-  r = impl->rados.connect();
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "Rados client connection failed with "
-        << cpp_strerror(-r) << dendl;
-    return nullptr;
-  }
-
-  return std::make_unique<RadosConfigStore>(std::move(impl));
-}
-
-} // namespace rgw::rados
diff --git a/src/rgw/store/rados/config/store.h b/src/rgw/store/rados/config/store.h
deleted file mode 100644 (file)
index 1b93a80..0000000
+++ /dev/null
@@ -1,182 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include <list>
-#include <memory>
-#include <string>
-#include "rgw_common.h"
-#include "rgw_sal_config.h"
-
-class DoutPrefixProvider;
-class optional_yield;
-
-namespace rgw::rados {
-
-struct ConfigImpl;
-
-class RadosConfigStore : public sal::ConfigStore {
- public:
-  explicit RadosConfigStore(std::unique_ptr<ConfigImpl> impl);
-  virtual ~RadosConfigStore() override;
-
-  // Realm
-  virtual int write_default_realm_id(const DoutPrefixProvider* dpp,
-                                     optional_yield y, bool exclusive,
-                                     std::string_view realm_id) override;
-  virtual int read_default_realm_id(const DoutPrefixProvider* dpp,
-                                    optional_yield y,
-                                    std::string& realm_id) override;
-  virtual int delete_default_realm_id(const DoutPrefixProvider* dpp,
-                                      optional_yield y) override;
-
-  virtual int create_realm(const DoutPrefixProvider* dpp,
-                           optional_yield y, bool exclusive,
-                           const RGWRealm& info,
-                           std::unique_ptr<sal::RealmWriter>* writer) override;
-  virtual int read_realm_by_id(const DoutPrefixProvider* dpp,
-                               optional_yield y,
-                               std::string_view realm_id,
-                               RGWRealm& info,
-                               std::unique_ptr<sal::RealmWriter>* writer) override;
-  virtual int read_realm_by_name(const DoutPrefixProvider* dpp,
-                                 optional_yield y,
-                                 std::string_view realm_name,
-                                 RGWRealm& info,
-                                 std::unique_ptr<sal::RealmWriter>* writer) override;
-  virtual int read_default_realm(const DoutPrefixProvider* dpp,
-                                 optional_yield y,
-                                 RGWRealm& info,
-                                 std::unique_ptr<sal::RealmWriter>* writer) override;
-  virtual int read_realm_id(const DoutPrefixProvider* dpp,
-                            optional_yield y, std::string_view realm_name,
-                            std::string& realm_id) override;
-  virtual int realm_notify_new_period(const DoutPrefixProvider* dpp,
-                                      optional_yield y,
-                                      const RGWPeriod& period) override;
-  virtual int list_realm_names(const DoutPrefixProvider* dpp,
-                               optional_yield y, const std::string& marker,
-                               std::span<std::string> entries,
-                               sal::ListResult<std::string>& result) override;
-
-  // Period
-  virtual int create_period(const DoutPrefixProvider* dpp,
-                            optional_yield y, bool exclusive,
-                            const RGWPeriod& info) override;
-  virtual int read_period(const DoutPrefixProvider* dpp,
-                          optional_yield y, std::string_view period_id,
-                          std::optional<uint32_t> epoch, RGWPeriod& info) override;
-  virtual int delete_period(const DoutPrefixProvider* dpp,
-                            optional_yield y,
-                            std::string_view period_id) override;
-  virtual int list_period_ids(const DoutPrefixProvider* dpp,
-                              optional_yield y, const std::string& marker,
-                              std::span<std::string> entries,
-                              sal::ListResult<std::string>& result) override;
-
-  // ZoneGroup
-  virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                         optional_yield y, bool exclusive,
-                                         std::string_view realm_id,
-                                         std::string_view zonegroup_id) override;
-  virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                        optional_yield y,
-                                        std::string_view realm_id,
-                                        std::string& zonegroup_id) override;
-  virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                          optional_yield y,
-                                          std::string_view realm_id) override;
-
-  virtual int create_zonegroup(const DoutPrefixProvider* dpp,
-                               optional_yield y, bool exclusive,
-                               const RGWZoneGroup& info,
-                               std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
-  virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
-                                   optional_yield y,
-                                   std::string_view zonegroup_id,
-                                   RGWZoneGroup& info,
-                                   std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
-  virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
-                                     optional_yield y,
-                                     std::string_view zonegroup_name,
-                                     RGWZoneGroup& info,
-                                     std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
-  virtual int read_default_zonegroup(const DoutPrefixProvider* dpp,
-                                     optional_yield y,
-                                     std::string_view realm_id,
-                                     RGWZoneGroup& info,
-                                     std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
-  virtual int list_zonegroup_names(const DoutPrefixProvider* dpp,
-                                   optional_yield y, const std::string& marker,
-                                   std::span<std::string> entries,
-                                   sal::ListResult<std::string>& result) override;
-
-  // Zone
-  virtual int write_default_zone_id(const DoutPrefixProvider* dpp,
-                                    optional_yield y, bool exclusive,
-                                    std::string_view realm_id,
-                                    std::string_view zone_id) override;
-  virtual int read_default_zone_id(const DoutPrefixProvider* dpp,
-                                   optional_yield y,
-                                   std::string_view realm_id,
-                                   std::string& zone_id) override;
-  virtual int delete_default_zone_id(const DoutPrefixProvider* dpp,
-                                     optional_yield y,
-                                     std::string_view realm_id) override;
-
-  virtual int create_zone(const DoutPrefixProvider* dpp,
-                          optional_yield y, bool exclusive,
-                          const RGWZoneParams& info,
-                          std::unique_ptr<sal::ZoneWriter>* writer) override;
-  virtual int read_zone_by_id(const DoutPrefixProvider* dpp,
-                              optional_yield y,
-                              std::string_view zone_id,
-                              RGWZoneParams& info,
-                              std::unique_ptr<sal::ZoneWriter>* writer) override;
-  virtual int read_zone_by_name(const DoutPrefixProvider* dpp,
-                                optional_yield y,
-                                std::string_view zone_name,
-                                RGWZoneParams& info,
-                                std::unique_ptr<sal::ZoneWriter>* writer) override;
-  virtual int read_default_zone(const DoutPrefixProvider* dpp,
-                                optional_yield y,
-                                std::string_view realm_id,
-                                RGWZoneParams& info,
-                                std::unique_ptr<sal::ZoneWriter>* writer) override;
-  virtual int list_zone_names(const DoutPrefixProvider* dpp,
-                              optional_yield y, const std::string& marker,
-                              std::span<std::string> entries,
-                              sal::ListResult<std::string>& result) override;
-
-  // PeriodConfig
-  virtual int read_period_config(const DoutPrefixProvider* dpp,
-                                 optional_yield y,
-                                 std::string_view realm_id,
-                                 RGWPeriodConfig& info) override;
-  virtual int write_period_config(const DoutPrefixProvider* dpp,
-                                  optional_yield y, bool exclusive,
-                                  std::string_view realm_id,
-                                  const RGWPeriodConfig& info) override;
-
- private:
-  std::unique_ptr<ConfigImpl> impl;
-}; // RadosConfigStore
-
-
-/// RadosConfigStore factory function
-auto create_config_store(const DoutPrefixProvider* dpp)
-    -> std::unique_ptr<RadosConfigStore>;
-
-} // namespace rgw::rados
diff --git a/src/rgw/store/rados/config/zone.cc b/src/rgw/store/rados/config/zone.cc
deleted file mode 100644 (file)
index c1151f5..0000000
+++ /dev/null
@@ -1,312 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "common/dout.h"
-#include "common/errno.h"
-#include "rgw_zone.h"
-#include "store/rados/config/store.h"
-
-#include "impl.h"
-
-namespace rgw::rados {
-
-// zone oids
-constexpr std::string_view zone_info_oid_prefix = "zone_info.";
-constexpr std::string_view zone_names_oid_prefix = "zone_names.";
-
-std::string zone_info_oid(std::string_view zone_id)
-{
-  return string_cat_reserve(zone_info_oid_prefix, zone_id);
-}
-std::string zone_name_oid(std::string_view zone_id)
-{
-  return string_cat_reserve(zone_names_oid_prefix, zone_id);
-}
-std::string default_zone_oid(const ceph::common::ConfigProxy& conf,
-                             std::string_view realm_id)
-{
-  return fmt::format("{}.{}", conf->rgw_default_zone_info_oid, realm_id);
-}
-
-
-int RadosConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp,
-                                            optional_yield y,
-                                            bool exclusive,
-                                            std::string_view realm_id,
-                                            std::string_view zone_id)
-{
-  const auto& pool = impl->zone_pool;
-  const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
-  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
-
-  RGWDefaultSystemMetaObjInfo default_info;
-  default_info.default_id = zone_id;
-
-  return impl->write(dpp, y, pool, default_oid, create, default_info, nullptr);
-}
-
-int RadosConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp,
-                                           optional_yield y,
-                                           std::string_view realm_id,
-                                           std::string& zone_id)
-{
-  const auto& pool = impl->zone_pool;
-  const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
-
-  RGWDefaultSystemMetaObjInfo default_info;
-  int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
-  if (r >= 0) {
-    zone_id = default_info.default_id;
-  }
-  return r;
-}
-
-int RadosConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp,
-                                             optional_yield y,
-                                             std::string_view realm_id)
-{
-  const auto& pool = impl->zone_pool;
-  const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
-
-  return impl->remove(dpp, y, pool, default_oid, nullptr);
-}
-
-
-class RadosZoneWriter : public sal::ZoneWriter {
-  ConfigImpl* impl;
-  RGWObjVersionTracker objv;
-  std::string zone_id;
-  std::string zone_name;
- public:
-  RadosZoneWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
-                  std::string_view zone_id, std::string_view zone_name)
-      : impl(impl), objv(std::move(objv)),
-        zone_id(zone_id), zone_name(zone_name)
-  {
-  }
-
-  int write(const DoutPrefixProvider* dpp, optional_yield y,
-            const RGWZoneParams& info) override
-  {
-    if (zone_id != info.get_id() || zone_name != info.get_name()) {
-      return -EINVAL; // can't modify zone id or name directly
-    }
-
-    const auto& pool = impl->zone_pool;
-    const auto info_oid = zone_info_oid(info.get_id());
-    return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
-  }
-
-  int rename(const DoutPrefixProvider* dpp, optional_yield y,
-             RGWZoneParams& info, std::string_view new_name) override
-  {
-    if (zone_id != info.get_id() || zone_name != info.get_name()) {
-      return -EINVAL; // can't modify zone id or name directly
-    }
-    if (new_name.empty()) {
-      ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
-      return -EINVAL;
-    }
-
-    const auto& pool = impl->zone_pool;
-    const auto name = RGWNameToId{info.get_id()};
-    const auto info_oid = zone_info_oid(info.get_id());
-    const auto old_oid = zone_name_oid(info.get_name());
-    const auto new_oid = zone_name_oid(new_name);
-
-    // link the new name
-    RGWObjVersionTracker new_objv;
-    new_objv.generate_new_write_ver(dpp->get_cct());
-    int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
-                        name, &new_objv);
-    if (r < 0) {
-      return r;
-    }
-
-    // write the info with updated name
-    info.set_name(std::string{new_name});
-    r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
-    if (r < 0) {
-      // on failure, unlink the new name
-      (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
-      return r;
-    }
-
-    // unlink the old name
-    (void) impl->remove(dpp, y, pool, old_oid, nullptr);
-
-    zone_name = new_name;
-    return 0;
-  }
-
-  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
-  {
-    const auto& pool = impl->zone_pool;
-    const auto info_oid = zone_info_oid(zone_id);
-    int r = impl->remove(dpp, y, pool, info_oid, &objv);
-    if (r < 0) {
-      return r;
-    }
-    const auto name_oid = zone_name_oid(zone_name);
-    (void) impl->remove(dpp, y, pool, name_oid, nullptr);
-    return 0;
-  }
-}; // RadosZoneWriter
-
-
-int RadosConfigStore::create_zone(const DoutPrefixProvider* dpp,
-                                  optional_yield y, bool exclusive,
-                                  const RGWZoneParams& info,
-                                  std::unique_ptr<sal::ZoneWriter>* writer)
-{
-  if (info.get_id().empty()) {
-    ldpp_dout(dpp, 0) << "zone cannot have an empty id" << dendl;
-    return -EINVAL;
-  }
-  if (info.get_name().empty()) {
-    ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
-    return -EINVAL;
-  }
-
-  const auto& pool = impl->zone_pool;
-  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
-
-  // write the zone info
-  const auto info_oid = zone_info_oid(info.get_id());
-  RGWObjVersionTracker objv;
-  objv.generate_new_write_ver(dpp->get_cct());
-
-  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  // write the zone name
-  const auto name_oid = zone_name_oid(info.get_name());
-  const auto name = RGWNameToId{info.get_id()};
-  RGWObjVersionTracker name_objv;
-  name_objv.generate_new_write_ver(dpp->get_cct());
-
-  r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
-  if (r < 0) {
-    (void) impl->remove(dpp, y, pool, info_oid, &objv);
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosZoneWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp,
-                                      optional_yield y,
-                                      std::string_view zone_id,
-                                      RGWZoneParams& info,
-                                      std::unique_ptr<sal::ZoneWriter>* writer)
-{
-  const auto& pool = impl->zone_pool;
-  const auto info_oid = zone_info_oid(zone_id);
-  RGWObjVersionTracker objv;
-
-  int r = impl->read(dpp, y, pool, info_oid, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosZoneWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp,
-                                        optional_yield y,
-                                        std::string_view zone_name,
-                                        RGWZoneParams& info,
-                                        std::unique_ptr<sal::ZoneWriter>* writer)
-{
-  const auto& pool = impl->zone_pool;
-
-  // look up zone id by name
-  const auto name_oid = zone_name_oid(zone_name);
-  RGWNameToId name;
-  int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
-  if (r < 0) {
-    return r;
-  }
-
-  const auto info_oid = zone_info_oid(name.obj_id);
-  RGWObjVersionTracker objv;
-  r = impl->read(dpp, y, pool, info_oid, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosZoneWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::read_default_zone(const DoutPrefixProvider* dpp,
-                                        optional_yield y,
-                                        std::string_view realm_id,
-                                        RGWZoneParams& info,
-                                        std::unique_ptr<sal::ZoneWriter>* writer)
-{
-  const auto& pool = impl->zone_pool;
-
-  // read default zone id
-  const auto default_oid = default_zone_oid(dpp->get_cct()->_conf, realm_id);
-  RGWDefaultSystemMetaObjInfo default_info;
-  int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
-  if (r < 0) {
-    return r;
-  }
-
-  const auto info_oid = zone_info_oid(default_info.default_id);
-  RGWObjVersionTracker objv;
-  r = impl->read(dpp, y, pool, info_oid, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosZoneWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::list_zone_names(const DoutPrefixProvider* dpp,
-                                      optional_yield y,
-                                      const std::string& marker,
-                                      std::span<std::string> entries,
-                                      sal::ListResult<std::string>& result)
-{
-  const auto& pool = impl->zone_pool;
-  constexpr auto prefix = [] (std::string oid) -> std::string {
-      if (!oid.starts_with(zone_names_oid_prefix)) {
-        return {};
-      }
-      return oid.substr(zone_names_oid_prefix.size());
-    };
-  return impl->list(dpp, y, pool, marker, prefix, entries, result);
-}
-
-} // namespace rgw::rados
diff --git a/src/rgw/store/rados/config/zonegroup.cc b/src/rgw/store/rados/config/zonegroup.cc
deleted file mode 100644 (file)
index 984fda1..0000000
+++ /dev/null
@@ -1,315 +0,0 @@
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2022 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "common/dout.h"
-#include "common/errno.h"
-#include "rgw_zone.h"
-#include "store/rados/config/store.h"
-
-#include "impl.h"
-
-namespace rgw::rados {
-
-// zonegroup oids
-constexpr std::string_view zonegroup_names_oid_prefix = "zonegroups_names.";
-constexpr std::string_view zonegroup_info_oid_prefix = "zonegroup_info.";
-constexpr std::string_view default_zonegroup_info_oid = "default.zonegroup";
-
-static std::string zonegroup_info_oid(std::string_view zonegroup_id)
-{
-  return string_cat_reserve(zonegroup_info_oid_prefix, zonegroup_id);
-}
-static std::string zonegroup_name_oid(std::string_view zonegroup_id)
-{
-  return string_cat_reserve(zonegroup_names_oid_prefix, zonegroup_id);
-}
-static std::string default_zonegroup_oid(const ceph::common::ConfigProxy& conf,
-                                         std::string_view realm_id)
-{
-  const auto prefix = name_or_default(conf->rgw_default_zonegroup_info_oid,
-                                      default_zonegroup_info_oid);
-  return fmt::format("{}.{}", prefix, realm_id);
-}
-
-
-int RadosConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                                 optional_yield y,
-                                                 bool exclusive,
-                                                 std::string_view realm_id,
-                                                 std::string_view zonegroup_id)
-{
-  const auto& pool = impl->zonegroup_pool;
-  const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
-  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
-
-  RGWDefaultSystemMetaObjInfo default_info;
-  default_info.default_id = zonegroup_id;
-
-  return impl->write(dpp, y, pool, oid, create, default_info, nullptr);
-}
-
-int RadosConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                                optional_yield y,
-                                                std::string_view realm_id,
-                                                std::string& zonegroup_id)
-{
-  const auto& pool = impl->zonegroup_pool;
-  const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
-
-  RGWDefaultSystemMetaObjInfo default_info;
-  int r = impl->read(dpp, y, pool, oid, default_info, nullptr);
-  if (r >= 0) {
-    zonegroup_id = default_info.default_id;
-  }
-  return r;
-}
-
-int RadosConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
-                                                  optional_yield y,
-                                                  std::string_view realm_id)
-{
-  const auto& pool = impl->zonegroup_pool;
-  const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
-  return impl->remove(dpp, y, pool, oid, nullptr);
-}
-
-
-class RadosZoneGroupWriter : public sal::ZoneGroupWriter {
-  ConfigImpl* impl;
-  RGWObjVersionTracker objv;
-  std::string zonegroup_id;
-  std::string zonegroup_name;
- public:
-  RadosZoneGroupWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
-                       std::string_view zonegroup_id,
-                       std::string_view zonegroup_name)
-    : impl(impl), objv(std::move(objv)),
-      zonegroup_id(zonegroup_id), zonegroup_name(zonegroup_name)
-  {
-  }
-
-  int write(const DoutPrefixProvider* dpp, optional_yield y,
-            const RGWZoneGroup& info) override
-  {
-    if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
-      return -EINVAL; // can't modify zonegroup id or name directly
-    }
-
-    const auto& pool = impl->zonegroup_pool;
-    const auto info_oid = zonegroup_info_oid(info.get_id());
-    return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
-  }
-
-  int rename(const DoutPrefixProvider* dpp, optional_yield y,
-             RGWZoneGroup& info, std::string_view new_name) override
-  {
-    if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
-      return -EINVAL; // can't modify zonegroup id or name directly
-    }
-    if (new_name.empty()) {
-      ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
-      return -EINVAL;
-    }
-
-    const auto& pool = impl->zonegroup_pool;
-    const auto name = RGWNameToId{info.get_id()};
-    const auto info_oid = zonegroup_info_oid(info.get_id());
-    const auto old_oid = zonegroup_name_oid(info.get_name());
-    const auto new_oid = zonegroup_name_oid(new_name);
-
-    // link the new name
-    RGWObjVersionTracker new_objv;
-    new_objv.generate_new_write_ver(dpp->get_cct());
-    int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
-                        name, &new_objv);
-    if (r < 0) {
-      return r;
-    }
-
-    // write the info with updated name
-    info.set_name(std::string{new_name});
-    r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
-    if (r < 0) {
-      // on failure, unlink the new name
-      (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
-      return r;
-    }
-
-    // unlink the old name
-    (void) impl->remove(dpp, y, pool, old_oid, nullptr);
-
-    zonegroup_name = new_name;
-    return 0;
-  }
-
-  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
-  {
-    const auto& pool = impl->zonegroup_pool;
-    const auto info_oid = zonegroup_info_oid(zonegroup_id);
-    int r = impl->remove(dpp, y, pool, info_oid, &objv);
-    if (r < 0) {
-      return r;
-    }
-    const auto name_oid = zonegroup_name_oid(zonegroup_name);
-    (void) impl->remove(dpp, y, pool, name_oid, nullptr);
-    return 0;
-  }
-}; // RadosZoneGroupWriter
-
-
-int RadosConfigStore::create_zonegroup(const DoutPrefixProvider* dpp,
-                                       optional_yield y, bool exclusive,
-                                       const RGWZoneGroup& info,
-                                       std::unique_ptr<sal::ZoneGroupWriter>* writer)
-{
-  if (info.get_id().empty()) {
-    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty id" << dendl;
-    return -EINVAL;
-  }
-  if (info.get_name().empty()) {
-    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
-    return -EINVAL;
-  }
-
-  const auto& pool = impl->zonegroup_pool;
-  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
-
-  // write the zonegroup info
-  const auto info_oid = zonegroup_info_oid(info.get_id());
-  RGWObjVersionTracker objv;
-  objv.generate_new_write_ver(dpp->get_cct());
-
-  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  // write the zonegroup name
-  const auto name_oid = zonegroup_name_oid(info.get_name());
-  const auto name = RGWNameToId{info.get_id()};
-  RGWObjVersionTracker name_objv;
-  name_objv.generate_new_write_ver(dpp->get_cct());
-
-  r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
-  if (r < 0) {
-    (void) impl->remove(dpp, y, pool, info_oid, &objv);
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosZoneGroupWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp,
-                                           optional_yield y,
-                                           std::string_view zonegroup_id,
-                                           RGWZoneGroup& info,
-                                           std::unique_ptr<sal::ZoneGroupWriter>* writer)
-{
-  const auto& pool = impl->zonegroup_pool;
-  const auto info_oid = zonegroup_info_oid(zonegroup_id);
-  RGWObjVersionTracker objv;
-
-  int r = impl->read(dpp, y, pool, info_oid, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosZoneGroupWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp,
-                                             optional_yield y,
-                                             std::string_view zonegroup_name,
-                                             RGWZoneGroup& info,
-                                             std::unique_ptr<sal::ZoneGroupWriter>* writer)
-{
-  const auto& pool = impl->zonegroup_pool;
-
-  // look up zonegroup id by name
-  RGWNameToId name;
-  const auto name_oid = zonegroup_name_oid(zonegroup_name);
-  int r = impl->read(dpp, y, pool, name_oid, name, nullptr);
-  if (r < 0) {
-    return r;
-  }
-
-  const auto info_oid = zonegroup_info_oid(name.obj_id);
-  RGWObjVersionTracker objv;
-  r = impl->read(dpp, y, pool, info_oid, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosZoneGroupWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp,
-                                             optional_yield y,
-                                             std::string_view realm_id,
-                                             RGWZoneGroup& info,
-                                             std::unique_ptr<sal::ZoneGroupWriter>* writer)
-{
-  const auto& pool = impl->zonegroup_pool;
-
-  // read default zonegroup id
-  RGWDefaultSystemMetaObjInfo default_info;
-  const auto default_oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
-  int r = impl->read(dpp, y, pool, default_oid, default_info, nullptr);
-  if (r < 0) {
-    return r;
-  }
-
-  const auto info_oid = zonegroup_info_oid(default_info.default_id);
-  RGWObjVersionTracker objv;
-  r = impl->read(dpp, y, pool, info_oid, info, &objv);
-  if (r < 0) {
-    return r;
-  }
-
-  if (writer) {
-    *writer = std::make_unique<RadosZoneGroupWriter>(
-        impl.get(), std::move(objv), info.get_id(), info.get_name());
-  }
-  return 0;
-}
-
-int RadosConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp,
-                                           optional_yield y,
-                                           const std::string& marker,
-                                           std::span<std::string> entries,
-                                           sal::ListResult<std::string>& result)
-{
-  const auto& pool = impl->zonegroup_pool;
-  constexpr auto prefix = [] (std::string oid) -> std::string {
-      if (!oid.starts_with(zonegroup_names_oid_prefix)) {
-        return {};
-      }
-      return oid.substr(zonegroup_names_oid_prefix.size());
-    };
-  return impl->list(dpp, y, pool, marker, prefix, entries, result);
-}
-
-} // namespace rgw::rados
diff --git a/src/rgw/store/rados/rgw_bucket.cc b/src/rgw/store/rados/rgw_bucket.cc
deleted file mode 100644 (file)
index 7f600fe..0000000
+++ /dev/null
@@ -1,2971 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_acl_s3.h"
-#include "rgw_tag_s3.h"
-
-#include "rgw_bucket.h"
-#include "rgw_op.h"
-#include "rgw_bucket_sync.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_bucket.h"
-#include "services/svc_user.h"
-
-#include "rgw_reshard.h"
-
-// stolen from src/cls/version/cls_version.cc
-#define VERSION_ATTR "ceph.objclass.version"
-
-#include "cls/user/cls_user_types.h"
-
-#include "rgw_sal_rados.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-// seconds for timeout during RGWBucket::check_object_index
-constexpr uint64_t BUCKET_TAG_QUICK_TIMEOUT = 30;
-
-using namespace std;
-
-// default number of entries to list with each bucket listing call
-// (use marker to bridge between calls)
-static constexpr size_t listing_max_entries = 1000;
-
-/*
- * The tenant_name is always returned on purpose. May be empty, of course.
- */
-static void parse_bucket(const string& bucket,
-                         string *tenant_name,
-                         string *bucket_name,
-                         string *bucket_instance = nullptr /* optional */)
-{
-  /*
-   * expected format: [tenant/]bucket:bucket_instance
-   */
-  int pos = bucket.find('/');
-  if (pos >= 0) {
-    *tenant_name = bucket.substr(0, pos);
-  } else {
-    tenant_name->clear();
-  }
-  string bn = bucket.substr(pos + 1);
-  pos = bn.find (':');
-  if (pos < 0) {
-    *bucket_name = std::move(bn);
-    return;
-  }
-  *bucket_name = bn.substr(0, pos);
-  if (bucket_instance) {
-    *bucket_instance = bn.substr(pos + 1);
-  }
-
-  /*
-   * deal with the possible tenant:bucket:bucket_instance case
-   */
-  if (tenant_name->empty()) {
-    pos = bucket_instance->find(':');
-    if (pos >= 0) {
-      *tenant_name = *bucket_name;
-      *bucket_name = bucket_instance->substr(0, pos);
-      *bucket_instance = bucket_instance->substr(pos + 1);
-    }
-  }
-}
-
-static void dump_mulipart_index_results(list<rgw_obj_index_key>& objs_to_unlink,
-        Formatter *f)
-{
-  for (const auto& o : objs_to_unlink) {
-    f->dump_string("object",  o.name);
-  }
-}
-
-void check_bad_user_bucket_mapping(rgw::sal::Driver* driver, rgw::sal::User* user,
-                                  bool fix,
-                                  optional_yield y,
-                                   const DoutPrefixProvider *dpp)
-{
-  rgw::sal::BucketList user_buckets;
-  string marker;
-
-  CephContext *cct = driver->ctx();
-
-  size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
-
-  do {
-    int ret = user->list_buckets(dpp, marker, string(), max_entries, false, user_buckets, y);
-    if (ret < 0) {
-      ldout(driver->ctx(), 0) << "failed to read user buckets: "
-                            << cpp_strerror(-ret) << dendl;
-      return;
-    }
-
-    map<string, std::unique_ptr<rgw::sal::Bucket>>& buckets = user_buckets.get_buckets();
-    for (auto i = buckets.begin();
-         i != buckets.end();
-         ++i) {
-      marker = i->first;
-
-      auto& bucket = i->second;
-
-      std::unique_ptr<rgw::sal::Bucket> actual_bucket;
-      int r = driver->get_bucket(dpp, user, user->get_tenant(), bucket->get_name(), &actual_bucket, null_yield);
-      if (r < 0) {
-        ldout(driver->ctx(), 0) << "could not get bucket info for bucket=" << bucket << dendl;
-        continue;
-      }
-
-      if (actual_bucket->get_name().compare(bucket->get_name()) != 0 ||
-          actual_bucket->get_tenant().compare(bucket->get_tenant()) != 0 ||
-          actual_bucket->get_marker().compare(bucket->get_marker()) != 0 ||
-          actual_bucket->get_bucket_id().compare(bucket->get_bucket_id()) != 0) {
-        cout << "bucket info mismatch: expected " << actual_bucket << " got " << bucket << std::endl;
-        if (fix) {
-          cout << "fixing" << std::endl;
-         r = actual_bucket->chown(dpp, user, nullptr, null_yield);
-          if (r < 0) {
-            cerr << "failed to fix bucket: " << cpp_strerror(-r) << std::endl;
-          }
-        }
-      }
-    }
-  } while (user_buckets.is_truncated());
-}
-
-// returns true if entry is in the empty namespace. note: function
-// type conforms to type RGWBucketListNameFilter
-bool rgw_bucket_object_check_filter(const std::string& oid)
-{
-  const static std::string empty_ns;
-  rgw_obj_key key; // thrown away but needed for parsing
-  return rgw_obj_key::oid_to_key_in_ns(oid, &key, empty_ns);
-}
-
-int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, rgw_obj_key& key)
-{
-  if (key.instance.empty()) {
-    key.instance = "null";
-  }
-
-  std::unique_ptr<rgw::sal::Object> object = bucket->get_object(key);
-
-  return object->delete_object(dpp, null_yield);
-}
-
-static void set_err_msg(std::string *sink, std::string msg)
-{
-  if (sink && !msg.empty())
-    *sink = msg;
-}
-
-int RGWBucket::init(rgw::sal::Driver* _driver, RGWBucketAdminOpState& op_state,
-                    optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
-{
-  if (!_driver) {
-    set_err_msg(err_msg, "no storage!");
-    return -EINVAL;
-  }
-
-  driver = _driver;
-
-  std::string bucket_name = op_state.get_bucket_name();
-
-  if (bucket_name.empty() && op_state.get_user_id().empty())
-    return -EINVAL;
-
-  user = driver->get_user(op_state.get_user_id());
-  std::string tenant = user->get_tenant();
-
-  // split possible tenant/name
-  auto pos = bucket_name.find('/');
-  if (pos != string::npos) {
-    tenant = bucket_name.substr(0, pos);
-    bucket_name = bucket_name.substr(pos + 1);
-  }
-
-  int r = driver->get_bucket(dpp, user.get(), tenant, bucket_name, &bucket, y);
-  if (r < 0) {
-      set_err_msg(err_msg, "failed to fetch bucket info for bucket=" + bucket_name);
-      return r;
-  }
-
-  op_state.set_bucket(bucket->clone());
-
-  if (!rgw::sal::User::empty(user.get())) {
-    r = user->load_user(dpp, y);
-    if (r < 0) {
-      set_err_msg(err_msg, "failed to fetch user info");
-      return r;
-    }
-  }
-
-  op_state.display_name = user->get_display_name();
-
-  clear_failure();
-  return 0;
-}
-
-bool rgw_find_bucket_by_id(const DoutPrefixProvider *dpp, CephContext *cct, rgw::sal::Driver* driver,
-                           const string& marker, const string& bucket_id, rgw_bucket* bucket_out)
-{
-  void *handle = NULL;
-  bool truncated = false;
-  string s;
-
-  int ret = driver->meta_list_keys_init(dpp, "bucket.instance", marker, &handle);
-  if (ret < 0) {
-    cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
-    driver->meta_list_keys_complete(handle);
-    return -ret;
-  }
-  do {
-      list<string> keys;
-      ret = driver->meta_list_keys_next(dpp, handle, 1000, keys, &truncated);
-      if (ret < 0) {
-        cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
-        driver->meta_list_keys_complete(handle);
-        return -ret;
-      }
-      for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
-        s = *iter;
-        ret = rgw_bucket_parse_bucket_key(cct, s, bucket_out, nullptr);
-        if (ret < 0) {
-          continue;
-        }
-        if (bucket_id == bucket_out->bucket_id) {
-          driver->meta_list_keys_complete(handle);
-          return true;
-        }
-      }
-  } while (truncated);
-  driver->meta_list_keys_complete(handle);
-  return false;
-}
-
-int RGWBucket::chown(RGWBucketAdminOpState& op_state, const string& marker,
-                     optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
-{
-  int ret = bucket->chown(dpp, user.get(), user.get(), y, &marker);
-  if (ret < 0) {
-    set_err_msg(err_msg, "Failed to change object ownership: " + cpp_strerror(-ret));
-  }
-  
-  return ret;
-}
-
-int RGWBucket::set_quota(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg)
-{
-  bucket = op_state.get_bucket()->clone();
-
-  bucket->get_info().quota = op_state.quota;
-  int r = bucket->put_info(dpp, false, real_time());
-  if (r < 0) {
-    set_err_msg(err_msg, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r));
-    return r;
-  }
-  return r;
-}
-
-int RGWBucket::remove_object(const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, std::string *err_msg)
-{
-  std::string object_name = op_state.get_object_name();
-
-  rgw_obj_key key(object_name);
-
-  bucket = op_state.get_bucket()->clone();
-
-  int ret = rgw_remove_object(dpp, driver, bucket.get(), key);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to remove object" + cpp_strerror(-ret));
-    return ret;
-  }
-
-  return 0;
-}
-
-static void dump_bucket_index(const vector<rgw_bucket_dir_entry>& objs,  Formatter *f)
-{
-  for (auto iter = objs.begin(); iter != objs.end(); ++iter) {
-    f->dump_string("object", iter->key.name);
-  }
-}
-
-static void dump_bucket_usage(map<RGWObjCategory, RGWStorageStats>& stats, Formatter *formatter)
-{
-  map<RGWObjCategory, RGWStorageStats>::iterator iter;
-
-  formatter->open_object_section("usage");
-  for (iter = stats.begin(); iter != stats.end(); ++iter) {
-    RGWStorageStats& s = iter->second;
-    formatter->open_object_section(to_string(iter->first));
-    s.dump(formatter);
-    formatter->close_section();
-  }
-  formatter->close_section();
-}
-
-static void dump_index_check(map<RGWObjCategory, RGWStorageStats> existing_stats,
-        map<RGWObjCategory, RGWStorageStats> calculated_stats,
-        Formatter *formatter)
-{
-  formatter->open_object_section("check_result");
-  formatter->open_object_section("existing_header");
-  dump_bucket_usage(existing_stats, formatter);
-  formatter->close_section();
-  formatter->open_object_section("calculated_header");
-  dump_bucket_usage(calculated_stats, formatter);
-  formatter->close_section();
-  formatter->close_section();
-}
-
-int RGWBucket::check_bad_index_multipart(RGWBucketAdminOpState& op_state,
-                                        RGWFormatterFlusher& flusher,
-                                        const DoutPrefixProvider *dpp,
-                                        std::string *err_msg)
-{
-  const bool fix_index = op_state.will_fix_index();
-
-  bucket = op_state.get_bucket()->clone();
-
-  rgw::sal::Bucket::ListParams params;
-  params.list_versions = true;
-  params.ns = RGW_OBJ_NS_MULTIPART;
-
-  std::map<std::string, bool> meta_objs;
-  std::map<rgw_obj_index_key, std::string> all_objs;
-  bool is_truncated;
-  do {
-    rgw::sal::Bucket::ListResults results;
-    int r = bucket->list(dpp, params, listing_max_entries, results, null_yield);
-    if (r < 0) {
-      set_err_msg(err_msg, "failed to list objects in bucket=" + bucket->get_name() +
-              " err=" +  cpp_strerror(-r));
-
-      return r;
-    }
-    is_truncated = results.is_truncated;
-
-    for (const auto& o : results.objs) {
-      rgw_obj_index_key key = o.key;
-      rgw_obj obj(bucket->get_key(), key);
-      std::string oid = obj.get_oid();
-
-      int pos = oid.find_last_of('.');
-      if (pos < 0) {
-        /* obj has no suffix */
-        all_objs[key] = oid;
-      } else {
-        /* obj has suffix */
-       std::string name = oid.substr(0, pos);
-       std::string suffix = oid.substr(pos + 1);
-
-        if (suffix.compare("meta") == 0) {
-          meta_objs[name] = true;
-        } else {
-          all_objs[key] = name;
-        }
-      }
-    }
-  } while (is_truncated);
-
-  std::list<rgw_obj_index_key> objs_to_unlink;
-  Formatter *f =  flusher.get_formatter();
-
-  f->open_array_section("invalid_multipart_entries");
-
-  for (const auto& o : all_objs) {
-    const std::string& name = o.second;
-    if (meta_objs.find(name) == meta_objs.end()) {
-      objs_to_unlink.push_back(o.first);
-    }
-
-    if (objs_to_unlink.size() > listing_max_entries) {
-      if (fix_index) {
-       // note: under rados this removes directly from rados index objects
-       int r = bucket->remove_objs_from_index(dpp, objs_to_unlink);
-       if (r < 0) {
-         set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " +
-                     cpp_strerror(-r));
-         return r;
-       }
-      }
-
-      dump_mulipart_index_results(objs_to_unlink, f);
-      flusher.flush();
-      objs_to_unlink.clear();
-    }
-  }
-
-  if (fix_index) {
-    // note: under rados this removes directly from rados index objects
-    int r = bucket->remove_objs_from_index(dpp, objs_to_unlink);
-    if (r < 0) {
-      set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " +
-              cpp_strerror(-r));
-
-      return r;
-    }
-  }
-
-  dump_mulipart_index_results(objs_to_unlink, f);
-  f->close_section();
-  flusher.flush();
-
-  return 0;
-}
-
-int RGWBucket::check_object_index(const DoutPrefixProvider *dpp, 
-                                  RGWBucketAdminOpState& op_state,
-                                  RGWFormatterFlusher& flusher,
-                                  optional_yield y,
-                                  std::string *err_msg)
-{
-
-  bool fix_index = op_state.will_fix_index();
-
-  if (!fix_index) {
-    set_err_msg(err_msg, "check-objects flag requires fix index enabled");
-    return -EINVAL;
-  }
-
-  // use a quicker/shorter tag timeout during this process
-  bucket->set_tag_timeout(dpp, BUCKET_TAG_QUICK_TIMEOUT);
-
-  rgw::sal::Bucket::ListResults results;
-  results.is_truncated = true;
-
-  Formatter *formatter = flusher.get_formatter();
-  formatter->open_object_section("objects");
-
-  while (results.is_truncated) {
-    rgw::sal::Bucket::ListParams params;
-    params.marker = results.next_marker;
-    params.force_check_filter = rgw_bucket_object_check_filter;
-
-    int r = bucket->list(dpp, params, listing_max_entries, results, y);
-
-    if (r == -ENOENT) {
-      break;
-    } else if (r < 0) {
-      set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r));
-    }
-
-    dump_bucket_index(results.objs, formatter);
-    flusher.flush();
-  }
-
-  formatter->close_section();
-
-  // restore normal tag timeout for bucket
-  bucket->set_tag_timeout(dpp, 0);
-
-  return 0;
-}
-
-
-int RGWBucket::check_index(const DoutPrefixProvider *dpp,
-        RGWBucketAdminOpState& op_state,
-        map<RGWObjCategory, RGWStorageStats>& existing_stats,
-        map<RGWObjCategory, RGWStorageStats>& calculated_stats,
-        std::string *err_msg)
-{
-  bool fix_index = op_state.will_fix_index();
-
-  int r = bucket->check_index(dpp, existing_stats, calculated_stats);
-  if (r < 0) {
-    set_err_msg(err_msg, "failed to check index error=" + cpp_strerror(-r));
-    return r;
-  }
-
-  if (fix_index) {
-    r = bucket->rebuild_index(dpp);
-    if (r < 0) {
-      set_err_msg(err_msg, "failed to rebuild index err=" + cpp_strerror(-r));
-      return r;
-    }
-  }
-
-  return 0;
-}
-
-int RGWBucket::sync(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg)
-{
-  if (!driver->is_meta_master()) {
-    set_err_msg(err_msg, "ERROR: failed to update bucket sync: only allowed on meta master zone");
-    return -EINVAL;
-  }
-  bool sync = op_state.will_sync_bucket();
-  if (sync) {
-    bucket->get_info().flags &= ~BUCKET_DATASYNC_DISABLED;
-  } else {
-    bucket->get_info().flags |= BUCKET_DATASYNC_DISABLED;
-  }
-
-  // when writing this metadata, RGWSI_BucketIndex_RADOS::handle_overwrite()
-  // will write the corresponding datalog and bilog entries
-  int r = bucket->put_info(dpp, false, real_time());
-  if (r < 0) {
-    set_err_msg(err_msg, "ERROR: failed writing bucket instance info:" + cpp_strerror(-r));
-    return r;
-  }
-
-  return 0;
-}
-
-
-int RGWBucket::policy_bl_to_stream(bufferlist& bl, ostream& o)
-{
-  RGWAccessControlPolicy_S3 policy(g_ceph_context);
-  int ret = decode_bl(bl, policy);
-  if (ret < 0) {
-    ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
-  }
-  policy.to_xml(o);
-  return 0;
-}
-
-int rgw_object_get_attr(const DoutPrefixProvider *dpp,
-                       rgw::sal::Driver* driver, rgw::sal::Object* obj,
-                       const char* attr_name, bufferlist& out_bl, optional_yield y)
-{
-  std::unique_ptr<rgw::sal::Object::ReadOp> rop = obj->get_read_op();
-
-  return rop->get_attr(dpp, attr_name, out_bl, y);
-}
-
-int RGWBucket::get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy, optional_yield y, const DoutPrefixProvider *dpp)
-{
-  int ret;
-  std::string object_name = op_state.get_object_name();
-
-  bucket = op_state.get_bucket()->clone();
-
-  if (!object_name.empty()) {
-    bufferlist bl;
-    std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(rgw_obj_key(object_name));
-
-    ret = rgw_object_get_attr(dpp, driver, obj.get(), RGW_ATTR_ACL, bl, y);
-    if (ret < 0){
-      return ret;
-    }
-
-    ret = decode_bl(bl, policy);
-    if (ret < 0) {
-      ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
-    }
-    return ret;
-  }
-
-  map<string, bufferlist>::iterator aiter = bucket->get_attrs().find(RGW_ATTR_ACL);
-  if (aiter == bucket->get_attrs().end()) {
-    return -ENOENT;
-  }
-
-  ret = decode_bl(aiter->second, policy);
-  if (ret < 0) {
-    ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
-  }
-
-  return ret;
-}
-
-
-int RGWBucketAdminOp::get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                  RGWAccessControlPolicy& policy, const DoutPrefixProvider *dpp)
-{
-  RGWBucket bucket;
-
-  int ret = bucket.init(driver, op_state, null_yield, dpp);
-  if (ret < 0)
-    return ret;
-
-  ret = bucket.get_policy(op_state, policy, null_yield, dpp);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-/* Wrappers to facilitate RESTful interface */
-
-
-int RGWBucketAdminOp::get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                  RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp)
-{
-  RGWAccessControlPolicy policy(driver->ctx());
-
-  int ret = get_policy(driver, op_state, policy, dpp);
-  if (ret < 0)
-    return ret;
-
-  Formatter *formatter = flusher.get_formatter();
-
-  flusher.start(0);
-
-  formatter->open_object_section("policy");
-  policy.dump(formatter);
-  formatter->close_section();
-
-  flusher.flush();
-
-  return 0;
-}
-
-int RGWBucketAdminOp::dump_s3_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                  ostream& os, const DoutPrefixProvider *dpp)
-{
-  RGWAccessControlPolicy_S3 policy(driver->ctx());
-
-  int ret = get_policy(driver, op_state, policy, dpp);
-  if (ret < 0)
-    return ret;
-
-  policy.to_xml(os);
-
-  return 0;
-}
-
-int RGWBucketAdminOp::unlink(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
-{
-  RGWBucket bucket;
-
-  int ret = bucket.init(driver, op_state, null_yield, dpp);
-  if (ret < 0)
-    return ret;
-
-  return static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->unlink_bucket(op_state.get_user_id(), op_state.get_bucket()->get_info().bucket, null_yield, dpp, true);
-}
-
-int RGWBucketAdminOp::link(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, string *err)
-{
-  if (!op_state.is_user_op()) {
-    set_err_msg(err, "empty user id");
-    return -EINVAL;
-  }
-
-  RGWBucket bucket;
-  int ret = bucket.init(driver, op_state, null_yield, dpp, err);
-  if (ret < 0)
-    return ret;
-
-  string bucket_id = op_state.get_bucket_id();
-  std::string display_name = op_state.get_user_display_name();
-  std::unique_ptr<rgw::sal::Bucket> loc_bucket;
-  std::unique_ptr<rgw::sal::Bucket> old_bucket;
-
-  loc_bucket = op_state.get_bucket()->clone();
-
-  if (!bucket_id.empty() && bucket_id != loc_bucket->get_bucket_id()) {
-    set_err_msg(err,
-       "specified bucket id does not match " + loc_bucket->get_bucket_id());
-    return -EINVAL;
-  }
-
-  old_bucket = loc_bucket->clone();
-
-  loc_bucket->get_key().tenant = op_state.get_user_id().tenant;
-
-  if (!op_state.new_bucket_name.empty()) {
-    auto pos = op_state.new_bucket_name.find('/');
-    if (pos != string::npos) {
-      loc_bucket->get_key().tenant = op_state.new_bucket_name.substr(0, pos);
-      loc_bucket->get_key().name = op_state.new_bucket_name.substr(pos + 1);
-    } else {
-      loc_bucket->get_key().name = op_state.new_bucket_name;
-    }
-  }
-
-  RGWObjVersionTracker objv_tracker;
-  RGWObjVersionTracker old_version = loc_bucket->get_info().objv_tracker;
-
-  map<string, bufferlist>::iterator aiter = loc_bucket->get_attrs().find(RGW_ATTR_ACL);
-  if (aiter == loc_bucket->get_attrs().end()) {
-       // should never happen; only pre-argonaut buckets lacked this.
-    ldpp_dout(dpp, 0) << "WARNING: can't bucket link because no acl on bucket=" << old_bucket << dendl;
-    set_err_msg(err,
-       "While crossing the Anavros you have displeased the goddess Hera."
-       "  You must sacrifice your ancient bucket " + loc_bucket->get_bucket_id());
-    return -EINVAL;
-  }
-  bufferlist& aclbl = aiter->second;
-  RGWAccessControlPolicy policy;
-  ACLOwner owner;
-  try {
-   auto iter = aclbl.cbegin();
-   decode(policy, iter);
-   owner = policy.get_owner();
-  } catch (buffer::error& e) {
-    set_err_msg(err, "couldn't decode policy");
-    return -EIO;
-  }
-
-  int r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->unlink_bucket(owner.get_id(), old_bucket->get_info().bucket, null_yield, dpp, false);
-  if (r < 0) {
-    set_err_msg(err, "could not unlink policy from user " + owner.get_id().to_str());
-    return r;
-  }
-
-  // now update the user for the bucket...
-  if (display_name.empty()) {
-    ldpp_dout(dpp, 0) << "WARNING: user " << op_state.get_user_id() << " has no display name set" << dendl;
-  }
-
-  RGWAccessControlPolicy policy_instance;
-  policy_instance.create_default(op_state.get_user_id(), display_name);
-  owner = policy_instance.get_owner();
-
-  aclbl.clear();
-  policy_instance.encode(aclbl);
-
-  bool exclusive = false;
-  loc_bucket->get_info().owner = op_state.get_user_id();
-  if (*loc_bucket != *old_bucket) {
-    loc_bucket->get_info().bucket = loc_bucket->get_key();
-    loc_bucket->get_info().objv_tracker.version_for_read()->ver = 0;
-    exclusive = true;
-  }
-
-  r = loc_bucket->put_info(dpp, exclusive, ceph::real_time());
-  if (r < 0) {
-    set_err_msg(err, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r));
-    return r;
-  }
-
-  /* link to user */
-  RGWBucketEntryPoint ep;
-  ep.bucket = loc_bucket->get_info().bucket;
-  ep.owner = op_state.get_user_id();
-  ep.creation_time = loc_bucket->get_info().creation_time;
-  ep.linked = true;
-  rgw::sal::Attrs ep_attrs;
-  rgw_ep_info ep_data{ep, ep_attrs};
-
-  r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->link_bucket(op_state.get_user_id(), loc_bucket->get_info().bucket, loc_bucket->get_info().creation_time, null_yield, dpp, true, &ep_data);
-  if (r < 0) {
-    set_err_msg(err, "failed to relink bucket");
-    return r;
-  }
-
-  if (*loc_bucket != *old_bucket) {
-    // like RGWRados::delete_bucket -- excepting no bucket_index work.
-    r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->remove_bucket_entrypoint_info(
-                                       old_bucket->get_key(), null_yield, dpp,
-                                       RGWBucketCtl::Bucket::RemoveParams()
-                                       .set_objv_tracker(&ep_data.ep_objv));
-    if (r < 0) {
-      set_err_msg(err, "failed to unlink old bucket " + old_bucket->get_tenant() + "/" + old_bucket->get_name());
-      return r;
-    }
-    r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->remove_bucket_instance_info(
-                                       old_bucket->get_key(), old_bucket->get_info(),
-                                       null_yield, dpp,
-                                       RGWBucketCtl::BucketInstance::RemoveParams()
-                                       .set_objv_tracker(&ep_data.ep_objv));
-    if (r < 0) {
-      set_err_msg(err, "failed to unlink old bucket " + old_bucket->get_tenant() + "/" + old_bucket->get_name());
-      return r;
-    }
-  }
-
-  return 0;
-}
-
-int RGWBucketAdminOp::chown(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const string& marker, const DoutPrefixProvider *dpp, string *err)
-{
-  RGWBucket bucket;
-
-  int ret = bucket.init(driver, op_state, null_yield, dpp, err);
-  if (ret < 0)
-    return ret;
-
-  return bucket.chown(op_state, marker, null_yield, dpp, err);
-
-}
-
-int RGWBucketAdminOp::check_index(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                  RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp)
-{
-  int ret;
-  map<RGWObjCategory, RGWStorageStats> existing_stats;
-  map<RGWObjCategory, RGWStorageStats> calculated_stats;
-
-
-  RGWBucket bucket;
-
-  ret = bucket.init(driver, op_state, null_yield, dpp);
-  if (ret < 0)
-    return ret;
-
-  Formatter *formatter = flusher.get_formatter();
-  flusher.start(0);
-
-  ret = bucket.check_bad_index_multipart(op_state, flusher, dpp);
-  if (ret < 0)
-    return ret;
-
-  ret = bucket.check_object_index(dpp, op_state, flusher, y);
-  if (ret < 0)
-    return ret;
-
-  ret = bucket.check_index(dpp, op_state, existing_stats, calculated_stats);
-  if (ret < 0)
-    return ret;
-
-  dump_index_check(existing_stats, calculated_stats, formatter);
-  flusher.flush();
-
-  return 0;
-}
-
-int RGWBucketAdminOp::remove_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                                   optional_yield y, const DoutPrefixProvider *dpp, 
-                                    bool bypass_gc, bool keep_index_consistent)
-{
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  std::unique_ptr<rgw::sal::User> user = driver->get_user(op_state.get_user_id());
-
-  int ret = driver->get_bucket(dpp, user.get(), user->get_tenant(), op_state.get_bucket_name(),
-                             &bucket, y);
-  if (ret < 0)
-    return ret;
-
-  if (bypass_gc)
-    ret = bucket->remove_bucket_bypass_gc(op_state.get_max_aio(), keep_index_consistent, y, dpp);
-  else
-    ret = bucket->remove_bucket(dpp, op_state.will_delete_children(),
-                               false, nullptr, y);
-
-  return ret;
-}
-
-int RGWBucketAdminOp::remove_object(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
-{
-  RGWBucket bucket;
-
-  int ret = bucket.init(driver, op_state, null_yield, dpp);
-  if (ret < 0)
-    return ret;
-
-  return bucket.remove_object(dpp, op_state);
-}
-
-int RGWBucketAdminOp::sync_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, string *err_msg)
-{
-  RGWBucket bucket;
-  int ret = bucket.init(driver, op_state, null_yield, dpp, err_msg);
-  if (ret < 0)
-  {
-    return ret;
-  }
-  return bucket.sync(op_state, dpp, err_msg);
-}
-
-static int bucket_stats(rgw::sal::Driver* driver,
-                       const std::string& tenant_name,
-                       const std::string& bucket_name,
-                       Formatter *formatter,
-                        const DoutPrefixProvider *dpp)
-{
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  map<RGWObjCategory, RGWStorageStats> stats;
-
-  real_time mtime;
-  int ret = driver->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield);
-  if (ret < 0) {
-    return ret;
-  }
-
-  const auto& index = bucket->get_info().get_current_index();
-  if (is_layout_indexless(index)) {
-    cerr << "error, indexless buckets do not maintain stats; bucket=" <<
-      bucket->get_name() << std::endl;
-    return -EINVAL;
-  }
-
-  std::string bucket_ver, master_ver;
-  std::string max_marker;
-  ret = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, &max_marker);
-  if (ret < 0) {
-    cerr << "error getting bucket stats bucket=" << bucket->get_name() << " ret=" << ret << std::endl;
-    return ret;
-  }
-
-  utime_t ut(mtime);
-  utime_t ctime_ut(bucket->get_creation_time());
-
-  formatter->open_object_section("stats");
-  formatter->dump_string("bucket", bucket->get_name());
-  formatter->dump_int("num_shards",
-                     bucket->get_info().layout.current_index.layout.normal.num_shards);
-  formatter->dump_string("tenant", bucket->get_tenant());
-  formatter->dump_string("zonegroup", bucket->get_info().zonegroup);
-  formatter->dump_string("placement_rule", bucket->get_info().placement_rule.to_str());
-  ::encode_json("explicit_placement", bucket->get_key().explicit_placement, formatter);
-  formatter->dump_string("id", bucket->get_bucket_id());
-  formatter->dump_string("marker", bucket->get_marker());
-  formatter->dump_stream("index_type") << bucket->get_info().layout.current_index.layout.type;
-  ::encode_json("owner", bucket->get_info().owner, formatter);
-  formatter->dump_string("ver", bucket_ver);
-  formatter->dump_string("master_ver", master_ver);
-  ut.gmtime(formatter->dump_stream("mtime"));
-  ctime_ut.gmtime(formatter->dump_stream("creation_time"));
-  formatter->dump_string("max_marker", max_marker);
-  dump_bucket_usage(stats, formatter);
-  encode_json("bucket_quota", bucket->get_info().quota, formatter);
-
-  // bucket tags
-  auto iter = bucket->get_attrs().find(RGW_ATTR_TAGS);
-  if (iter != bucket->get_attrs().end()) {
-    RGWObjTagSet_S3 tagset;
-    bufferlist::const_iterator piter{&iter->second};
-    try {
-      tagset.decode(piter);
-      tagset.dump(formatter); 
-    } catch (buffer::error& err) {
-      cerr << "ERROR: caught buffer:error, couldn't decode TagSet" << std::endl;
-    }
-  }
-
-  // TODO: bucket CORS
-  // TODO: bucket LC
-  formatter->close_section();
-
-  return 0;
-}
-
-int RGWBucketAdminOp::limit_check(rgw::sal::Driver* driver,
-                                 RGWBucketAdminOpState& op_state,
-                                 const std::list<std::string>& user_ids,
-                                 RGWFormatterFlusher& flusher, optional_yield y,
-                                  const DoutPrefixProvider *dpp,
-                                 bool warnings_only)
-{
-  int ret = 0;
-  const size_t max_entries =
-    driver->ctx()->_conf->rgw_list_buckets_max_chunk;
-
-  const size_t safe_max_objs_per_shard =
-    driver->ctx()->_conf->rgw_safe_max_objects_per_shard;
-
-  uint16_t shard_warn_pct =
-    driver->ctx()->_conf->rgw_shard_warning_threshold;
-  if (shard_warn_pct > 100)
-    shard_warn_pct = 90;
-
-  Formatter *formatter = flusher.get_formatter();
-  flusher.start(0);
-
-  formatter->open_array_section("users");
-
-  for (const auto& user_id : user_ids) {
-
-    formatter->open_object_section("user");
-    formatter->dump_string("user_id", user_id);
-    formatter->open_array_section("buckets");
-
-    string marker;
-    rgw::sal::BucketList buckets;
-    do {
-      std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(user_id));
-
-      ret = user->list_buckets(dpp, marker, string(), max_entries, false, buckets, y);
-
-      if (ret < 0)
-        return ret;
-
-      map<string, std::unique_ptr<rgw::sal::Bucket>>& m_buckets = buckets.get_buckets();
-
-      for (const auto& iter : m_buckets) {
-       auto& bucket = iter.second;
-       uint64_t num_objects = 0;
-
-       marker = bucket->get_name(); /* Casey's location for marker update,
-                                    * as we may now not reach the end of
-                                    * the loop body */
-
-       ret = bucket->load_bucket(dpp, null_yield);
-       if (ret < 0)
-         continue;
-
-       const auto& index = bucket->get_info().get_current_index();
-       if (is_layout_indexless(index)) {
-         continue; // indexless buckets don't have stats
-       }
-
-       /* need stats for num_entries */
-       string bucket_ver, master_ver;
-       std::map<RGWObjCategory, RGWStorageStats> stats;
-       ret = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, nullptr);
-
-       if (ret < 0)
-         continue;
-
-       for (const auto& s : stats) {
-         num_objects += s.second.num_objects;
-       }
-
-       const uint32_t num_shards = rgw::num_shards(index.layout.normal);
-       uint64_t objs_per_shard =
-         (num_shards) ? num_objects/num_shards : num_objects;
-       {
-         bool warn;
-         stringstream ss;
-         uint64_t fill_pct = objs_per_shard * 100 / safe_max_objs_per_shard;
-         if (fill_pct > 100) {
-           ss << "OVER " << fill_pct << "%";
-           warn = true;
-         } else if (fill_pct >= shard_warn_pct) {
-           ss << "WARN " << fill_pct << "%";
-           warn = true;
-         } else {
-           ss << "OK";
-           warn = false;
-         }
-
-         if (warn || !warnings_only) {
-           formatter->open_object_section("bucket");
-           formatter->dump_string("bucket", bucket->get_name());
-           formatter->dump_string("tenant", bucket->get_tenant());
-           formatter->dump_int("num_objects", num_objects);
-           formatter->dump_int("num_shards", num_shards);
-           formatter->dump_int("objects_per_shard", objs_per_shard);
-           formatter->dump_string("fill_status", ss.str());
-           formatter->close_section();
-         }
-       }
-      }
-      formatter->flush(cout);
-    } while (buckets.is_truncated()); /* foreach: bucket */
-
-    formatter->close_section();
-    formatter->close_section();
-    formatter->flush(cout);
-
-  } /* foreach: user_id */
-
-  formatter->close_section();
-  formatter->flush(cout);
-
-  return ret;
-} /* RGWBucketAdminOp::limit_check */
-
// Dump information about buckets, in one of three modes depending on
// op_state:
//   - user op:           list all buckets owned by op_state's user;
//   - bucket name given: dump stats for that single bucket;
//   - neither:           walk every "bucket" metadata key in the zone.
// Output goes through the flusher's formatter; per-bucket stats are included
// when op_state.will_fetch_stats() is set.  Returns 0 on success or a
// negative errno.
int RGWBucketAdminOp::info(rgw::sal::Driver* driver,
                          RGWBucketAdminOpState& op_state,
                          RGWFormatterFlusher& flusher,
                          optional_yield y,
                           const DoutPrefixProvider *dpp)
{
  RGWBucket bucket;
  int ret = 0;
  const std::string& bucket_name = op_state.get_bucket_name();
  // If a specific bucket was named, resolve it up front so a missing bucket
  // is reported as -ERR_NO_SUCH_BUCKET before any output is produced.
  // NOTE(review): init() is called with null_yield even though 'y' is
  // available here — confirm whether that is intentional.
  if (!bucket_name.empty()) {
    ret = bucket.init(driver, op_state, null_yield, dpp);
    if (-ENOENT == ret)
      return -ERR_NO_SUCH_BUCKET;
    else if (ret < 0)
      return ret;
  }

  Formatter *formatter = flusher.get_formatter();
  flusher.start(0);

  CephContext *cct = driver->ctx();

  // Page size for each bucket-listing chunk.
  const size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;

  const bool show_stats = op_state.will_fetch_stats();
  const rgw_user& user_id = op_state.get_user_id();
  if (op_state.is_user_op()) {
    formatter->open_array_section("buckets");

    rgw::sal::BucketList buckets;
    std::unique_ptr<rgw::sal::User> user = driver->get_user(op_state.get_user_id());
    std::string marker;
    const std::string empty_end_marker;
    constexpr bool no_need_stats = false; // set need_stats to false

    // Paginate through the user's buckets; 'marker' is advanced to the last
    // bucket seen in each page so the next call resumes after it.
    do {
      ret = user->list_buckets(dpp, marker, empty_end_marker, max_entries,
                             no_need_stats, buckets, y);
      if (ret < 0) {
        return ret;
      }

      const std::string* marker_cursor = nullptr;
      map<string, std::unique_ptr<rgw::sal::Bucket>>& m = buckets.get_buckets();

      for (const auto& i : m) {
        const std::string& obj_name = i.first;
        // A bucket name filter may still apply in user mode.
        if (!bucket_name.empty() && bucket_name != obj_name) {
          continue;
        }

        if (show_stats) {
          bucket_stats(driver, user_id.tenant, obj_name, formatter, dpp);
       } else {
          formatter->dump_string("bucket", obj_name);
       }

        marker_cursor = &obj_name;
      } // for loop
      if (marker_cursor) {
       marker = *marker_cursor;
      }

      flusher.flush();
    } while (buckets.is_truncated());

    formatter->close_section();
  } else if (!bucket_name.empty()) {
    // Single-bucket mode: just dump stats for the named bucket.
    ret = bucket_stats(driver, user_id.tenant, bucket_name, formatter, dpp);
    if (ret < 0) {
      return ret;
    }
  } else {
    // Zone-wide mode: iterate the "bucket" metadata section.
    void *handle = nullptr;
    bool truncated = true;

    formatter->open_array_section("buckets");
    ret = driver->meta_list_keys_init(dpp, "bucket", string(), &handle);
    while (ret == 0 && truncated) {
      std::list<std::string> buckets;
      constexpr int max_keys = 1000;
      ret = driver->meta_list_keys_next(dpp, handle, max_keys, buckets,
                                                  &truncated);
      for (auto& bucket_name : buckets) {
        if (show_stats) {
          bucket_stats(driver, user_id.tenant, bucket_name, formatter, dpp);
       } else {
          formatter->dump_string("bucket", bucket_name);
       }
      }
    }
    driver->meta_list_keys_complete(handle);

    formatter->close_section();
  }

  flusher.flush();

  // NOTE(review): errors from the zone-wide listing loop above are not
  // propagated; this returns 0 regardless — confirm that is intended.
  return 0;
}
-
-int RGWBucketAdminOp::set_quota(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
-{
-  RGWBucket bucket;
-
-  int ret = bucket.init(driver, op_state, null_yield, dpp);
-  if (ret < 0)
-    return ret;
-  return bucket.set_quota(op_state, dpp);
-}
-
-inline auto split_tenant(const std::string& bucket_name){
-  auto p = bucket_name.find('/');
-  if(p != std::string::npos) {
-    return std::make_pair(bucket_name.substr(0,p), bucket_name.substr(p+1));
-  }
-  return std::make_pair(std::string(), bucket_name);
-}
-
using bucket_instance_ls = std::vector<RGWBucketInfo>;
// Given all "bucket.instance" keys ('lst') belonging to one bucket
// ('bucket_name'), compute which instances are stale and append them to
// 'stale_instances'.  An instance is definitely stale when its reshard
// status is DONE, or when the bucket itself no longer exists.  Instances in
// any other state are only declared stale after taking the bucket's reshard
// lock, to make sure no reshard is racing with us.
void get_stale_instances(rgw::sal::Driver* driver, const std::string& bucket_name,
                         const vector<std::string>& lst,
                         bucket_instance_ls& stale_instances,
                         const DoutPrefixProvider *dpp)
{

  bucket_instance_ls other_instances;
// first iterate over the entries, and pick up the done buckets; these
// are guaranteed to be stale
  for (const auto& bucket_instance : lst){
    RGWBucketInfo binfo;
    std::unique_ptr<rgw::sal::Bucket> bucket;
    rgw_bucket rbucket;
    rgw_bucket_parse_bucket_key(driver->ctx(), bucket_instance, &rbucket, nullptr);
    int r = driver->get_bucket(dpp, nullptr, rbucket, &bucket, null_yield);
    if (r < 0){
      // this can only happen if someone deletes us right when we're processing
      ldpp_dout(dpp, -1) << "Bucket instance is invalid: " << bucket_instance
                          << cpp_strerror(-r) << dendl;
      continue;
    }
    binfo = bucket->get_info();
    if (binfo.reshard_status == cls_rgw_reshard_status::DONE)
      stale_instances.emplace_back(std::move(binfo));
    else {
      other_instances.emplace_back(std::move(binfo));
    }
  }

  // Read the cur bucket info, if the bucket doesn't exist we can simply return
  // all the instances
  auto [tenant, bname] = split_tenant(bucket_name);
  RGWBucketInfo cur_bucket_info;
  std::unique_ptr<rgw::sal::Bucket> cur_bucket;
  int r = driver->get_bucket(dpp, nullptr, tenant, bname, &cur_bucket, null_yield);
  if (r < 0) {
    if (r == -ENOENT) {
      // bucket doesn't exist, everything is stale then
      stale_instances.insert(std::end(stale_instances),
                             std::make_move_iterator(other_instances.begin()),
                             std::make_move_iterator(other_instances.end()));
    } else {
      // all bets are off if we can't read the bucket, just return the sureshot stale instances
      ldpp_dout(dpp, -1) << "error: reading bucket info for bucket: "
                          << bname << cpp_strerror(-r) << dendl;
    }
    return;
  }

  // Don't process further in this round if bucket is resharding
  cur_bucket_info = cur_bucket->get_info();
  if (cur_bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS)
    return;

  // Drop the live instance and the reshard target: neither is stale.
  other_instances.erase(std::remove_if(other_instances.begin(), other_instances.end(),
                                       [&cur_bucket_info](const RGWBucketInfo& b){
                                         return (b.bucket.bucket_id == cur_bucket_info.bucket.bucket_id ||
                                                 b.bucket.bucket_id == cur_bucket_info.new_bucket_instance_id);
                                       }),
                        other_instances.end());

  // check if there are still instances left
  if (other_instances.empty()) {
    return;
  }

  // Now we have a bucket with instances where the reshard status is none, this
  // usually happens when the reshard process couldn't complete, lockdown the
  // bucket and walk through these instances to make sure no one else interferes
  // with these
  {
    RGWBucketReshardLock reshard_lock(static_cast<rgw::sal::RadosStore*>(driver), cur_bucket->get_info(), true);
    r = reshard_lock.lock(dpp);
    if (r < 0) {
      // most likely bucket is under reshard, return the sureshot stale instances
      ldpp_dout(dpp, 5) << __func__
                             << "failed to take reshard lock; reshard underway likey" << dendl;
      return;
    }
    // Lock is released when the scope guard is destroyed, on every path.
    auto sg = make_scope_guard([&reshard_lock](){ reshard_lock.unlock();} );
    // this should be fast enough that we may not need to renew locks and check
    // exit status?, should we read the values of the instances again?
    stale_instances.insert(std::end(stale_instances),
                           std::make_move_iterator(other_instances.begin()),
                           std::make_move_iterator(other_instances.end()));
  }

  return;
}
-
-static int process_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                                   RGWFormatterFlusher& flusher,
-                                   const DoutPrefixProvider *dpp,
-                                   std::function<void(const bucket_instance_ls&,
-                                                      Formatter *,
-                                                      rgw::sal::Driver*)> process_f)
-{
-  std::string marker;
-  void *handle;
-  Formatter *formatter = flusher.get_formatter();
-  static constexpr auto default_max_keys = 1000;
-
-  int ret = driver->meta_list_keys_init(dpp, "bucket.instance", marker, &handle);
-  if (ret < 0) {
-    cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
-    return ret;
-  }
-
-  bool truncated;
-
-  formatter->open_array_section("keys");
-  auto g = make_scope_guard([&driver, &handle, &formatter]() {
-                              driver->meta_list_keys_complete(handle);
-                              formatter->close_section(); // keys
-                              formatter->flush(cout);
-                            });
-
-  do {
-    list<std::string> keys;
-
-    ret = driver->meta_list_keys_next(dpp, handle, default_max_keys, keys, &truncated);
-    if (ret < 0 && ret != -ENOENT) {
-      cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
-      return ret;
-    } if (ret != -ENOENT) {
-      // partition the list of buckets by buckets as the listing is un sorted,
-      // since it would minimize the reads to bucket_info
-      std::unordered_map<std::string, std::vector<std::string>> bucket_instance_map;
-      for (auto &key: keys) {
-        auto pos = key.find(':');
-        if(pos != std::string::npos)
-          bucket_instance_map[key.substr(0,pos)].emplace_back(std::move(key));
-      }
-      for (const auto& kv: bucket_instance_map) {
-        bucket_instance_ls stale_lst;
-        get_stale_instances(driver, kv.first, kv.second, stale_lst, dpp);
-        process_f(stale_lst, formatter, driver);
-      }
-    }
-  } while (truncated);
-
-  return 0;
-}
-
-int RGWBucketAdminOp::list_stale_instances(rgw::sal::Driver* driver,
-                                           RGWBucketAdminOpState& op_state,
-                                           RGWFormatterFlusher& flusher,
-                                           const DoutPrefixProvider *dpp)
-{
-  auto process_f = [](const bucket_instance_ls& lst,
-                      Formatter *formatter,
-                      rgw::sal::Driver*){
-                     for (const auto& binfo: lst)
-                       formatter->dump_string("key", binfo.bucket.get_key());
-                   };
-  return process_stale_instances(driver, op_state, flusher, dpp, process_f);
-}
-
-
// Delete all stale bucket instances in the zone.  For each stale instance,
// purge its bucket index and then remove the "bucket.instance" metadata
// entry, dumping a per-instance delete status record.
int RGWBucketAdminOp::clear_stale_instances(rgw::sal::Driver* driver,
                                            RGWBucketAdminOpState& op_state,
                                            RGWFormatterFlusher& flusher,
                                            const DoutPrefixProvider *dpp)
{
  auto process_f = [dpp](const bucket_instance_ls& lst,
                      Formatter *formatter,
                      rgw::sal::Driver* driver){
                     for (const auto &binfo: lst) {
                      std::unique_ptr<rgw::sal::Bucket> bucket;
                      // NOTE(review): the return value of get_bucket() is
                      // ignored and 'bucket' is dereferenced right after —
                      // presumably this overload cannot fail; confirm.
                      driver->get_bucket(nullptr, binfo, &bucket);
                      int ret = bucket->purge_instance(dpp);
                       if (ret == 0){
                         auto md_key = "bucket.instance:" + binfo.bucket.get_key();
                         ret = driver->meta_remove(dpp, md_key, null_yield);
                       }
                       // status is dumped as a positive errno; 0 means ok
                       formatter->open_object_section("delete_status");
                       formatter->dump_string("bucket_instance", binfo.bucket.get_key());
                       formatter->dump_int("status", -ret);
                       formatter->close_section();
                     }
                   };

  return process_stale_instances(driver, op_state, flusher, dpp, process_f);
}
-
-static int fix_single_bucket_lc(rgw::sal::Driver* driver,
-                                const std::string& tenant_name,
-                                const std::string& bucket_name,
-                                const DoutPrefixProvider *dpp)
-{
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  int ret = driver->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield);
-  if (ret < 0) {
-    // TODO: Should we handle the case where the bucket could've been removed between
-    // listing and fetching?
-    return ret;
-  }
-
-  return rgw::lc::fix_lc_shard_entry(dpp, driver, driver->get_rgwlc()->get_lc(), bucket.get());
-}
-
-static void format_lc_status(Formatter* formatter,
-                             const std::string& tenant_name,
-                             const std::string& bucket_name,
-                             int status)
-{
-  formatter->open_object_section("bucket_entry");
-  std::string entry = tenant_name.empty() ? bucket_name : tenant_name + "/" + bucket_name;
-  formatter->dump_string("bucket", entry);
-  formatter->dump_int("status", status);
-  formatter->close_section(); // bucket_entry
-}
-
-static void process_single_lc_entry(rgw::sal::Driver* driver,
-                                   Formatter *formatter,
-                                    const std::string& tenant_name,
-                                    const std::string& bucket_name,
-                                    const DoutPrefixProvider *dpp)
-{
-  int ret = fix_single_bucket_lc(driver, tenant_name, bucket_name, dpp);
-  format_lc_status(formatter, tenant_name, bucket_name, -ret);
-}
-
-int RGWBucketAdminOp::fix_lc_shards(rgw::sal::Driver* driver,
-                                    RGWBucketAdminOpState& op_state,
-                                    RGWFormatterFlusher& flusher,
-                                    const DoutPrefixProvider *dpp)
-{
-  std::string marker;
-  void *handle;
-  Formatter *formatter = flusher.get_formatter();
-  static constexpr auto default_max_keys = 1000;
-
-  bool truncated;
-  if (const std::string& bucket_name = op_state.get_bucket_name();
-      ! bucket_name.empty()) {
-    const rgw_user user_id = op_state.get_user_id();
-    process_single_lc_entry(driver, formatter, user_id.tenant, bucket_name, dpp);
-    formatter->flush(cout);
-  } else {
-    int ret = driver->meta_list_keys_init(dpp, "bucket", marker, &handle);
-    if (ret < 0) {
-      std::cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
-      return ret;
-    }
-
-    {
-      formatter->open_array_section("lc_fix_status");
-      auto sg = make_scope_guard([&driver, &handle, &formatter](){
-                                   driver->meta_list_keys_complete(handle);
-                                   formatter->close_section(); // lc_fix_status
-                                   formatter->flush(cout);
-                                 });
-      do {
-        list<std::string> keys;
-        ret = driver->meta_list_keys_next(dpp, handle, default_max_keys, keys, &truncated);
-        if (ret < 0 && ret != -ENOENT) {
-          std::cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
-          return ret;
-        } if (ret != -ENOENT) {
-          for (const auto &key:keys) {
-            auto [tenant_name, bucket_name] = split_tenant(key);
-            process_single_lc_entry(driver, formatter, tenant_name, bucket_name, dpp);
-          }
-        }
-        formatter->flush(cout); // regularly flush every 1k entries
-      } while (truncated);
-    }
-
-  }
-  return 0;
-
-}
-
// Check whether an object carries a RGW_ATTR_DELETE_AT xattr whose time has
// already passed.  On true, 'delete_at' is filled with the decoded
// expiration time.  Returns false when the attr is missing, cannot be
// decoded, is zero, or lies in the future.
static bool has_object_expired(const DoutPrefixProvider *dpp,
                              rgw::sal::Driver* driver,
                              rgw::sal::Bucket* bucket,
                              const rgw_obj_key& key, utime_t& delete_at)
{
  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
  bufferlist delete_at_bl;

  int ret = rgw_object_get_attr(dpp, driver, obj.get(), RGW_ATTR_DELETE_AT, delete_at_bl, null_yield);
  if (ret < 0) {
    return false;  // no delete at attr, proceed
  }

  ret = decode_bl(delete_at_bl, delete_at);
  if (ret < 0) {
    return false;  // failed to parse
  }

  // A zero timestamp means "no expiration", so it never counts as expired.
  if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) {
    return true;
  }

  return false;
}
-
// Scan a resharded bucket for objects whose RGW_ATTR_DELETE_AT time has
// passed and delete them (unless 'dry_run' is set), dumping a status record
// per expired object.  Non-resharded buckets are skipped.  Returns 0 on
// success or a negative errno from listing.
static int fix_bucket_obj_expiry(const DoutPrefixProvider *dpp,
                                rgw::sal::Driver* driver,
                                rgw::sal::Bucket* bucket,
                                RGWFormatterFlusher& flusher, bool dry_run)
{
  // For a never-resharded bucket the bucket_id equals the marker; such
  // buckets are not affected by the issue this repair targets.
  if (bucket->get_key().bucket_id == bucket->get_key().marker) {
    ldpp_dout(dpp, -1) << "Not a resharded bucket skipping" << dendl;
    return 0;  // not a resharded bucket, move along
  }

  Formatter *formatter = flusher.get_formatter();
  formatter->open_array_section("expired_deletion_status");
  // Close the section and flush on every exit path.
  auto sg = make_scope_guard([&formatter] {
                              formatter->close_section();
                              formatter->flush(std::cout);
                            });

  rgw::sal::Bucket::ListParams params;
  rgw::sal::Bucket::ListResults results;

  // Include all versions for versioned buckets; unordered listing is enough
  // (and cheaper) since we visit every object anyway.
  params.list_versions = bucket->versioned();
  params.allow_unordered = true;

  do {
    int ret = bucket->list(dpp, params, listing_max_entries, results, null_yield);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "ERROR failed to list objects in the bucket" << dendl;
      return ret;
    }
    for (const auto& obj : results.objs) {
      rgw_obj_key key(obj.key);
      utime_t delete_at;
      if (has_object_expired(dpp, driver, bucket, key, delete_at)) {
       formatter->open_object_section("object_status");
       formatter->dump_string("object", key.name);
       formatter->dump_stream("delete_at") << delete_at;

       // In dry-run mode we only report; no deletion and no "status" field.
       if (!dry_run) {
         ret = rgw_remove_object(dpp, driver, bucket, key);
         formatter->dump_int("status", ret);
       }

       formatter->close_section();  // object_status
      }
    }
    formatter->flush(cout); // regularly flush every 1k entries
  } while (results.is_truncated);

  return 0;
}
-
-int RGWBucketAdminOp::fix_obj_expiry(rgw::sal::Driver* driver,
-                                    RGWBucketAdminOpState& op_state,
-                                    RGWFormatterFlusher& flusher,
-                                     const DoutPrefixProvider *dpp, bool dry_run)
-{
-  RGWBucket admin_bucket;
-  int ret = admin_bucket.init(driver, op_state, null_yield, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "failed to initialize bucket" << dendl;
-    return ret;
-  }
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  ret = driver->get_bucket(nullptr, admin_bucket.get_bucket_info(), &bucket);
-  if (ret < 0) {
-    return ret;
-  }
-
-  return fix_bucket_obj_expiry(dpp, driver, bucket.get(), flusher, dry_run);
-}
-
// Serialize the bucket info together with its xattrs.
void RGWBucketCompleteInfo::dump(Formatter *f) const {
  encode_json("bucket_info", info, f);
  encode_json("attrs", attrs, f);
}
-
// Inverse of dump(): populate bucket info and xattrs from JSON.
void RGWBucketCompleteInfo::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("bucket_info", info, obj);
  JSONDecoder::decode_json("attrs", attrs, obj);
}
-
// Metadata handler for the "bucket" section: bucket *entrypoint* objects
// (RGWBucketEntryPoint), as opposed to "bucket.instance" records.  Bridges
// the generic metadata backend to the bucket service (svc.bucket) and the
// bucket controller (ctl.bucket).
class RGWBucketMetadataHandler : public RGWBucketMetadataHandlerBase {
public:
  // Services used by this handler; wired up in init().
  struct Svc {
    RGWSI_Bucket *bucket{nullptr};
  } svc;

  // Controllers used by this handler; wired up in init().
  struct Ctl {
    RGWBucketCtl *bucket{nullptr};
  } ctl;

  RGWBucketMetadataHandler() {}

  void init(RGWSI_Bucket *bucket_svc,
            RGWBucketCtl *bucket_ctl) override {
    base_init(bucket_svc->ctx(),
              bucket_svc->get_ep_be_handler().get());
    svc.bucket = bucket_svc;
    ctl.bucket = bucket_ctl;
  }

  // Metadata section name this handler serves.
  string get_type() override { return "bucket"; }

  // Build a metadata object from its JSON representation; returns nullptr
  // on decode failure.
  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
    RGWBucketEntryPoint be;

    try {
      decode_json_obj(be, jo);
    } catch (JSONDecoder::err& e) {
      return nullptr;
    }

    return new RGWBucketEntryMetadataObject(be, objv, mtime);
  }

  // Read an entrypoint from the backend and wrap it (with its version,
  // mtime and attrs) in a metadata object owned by the caller.
  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
    RGWObjVersionTracker ot;
    RGWBucketEntryPoint be;

    real_time mtime;
    map<string, bufferlist> attrs;

    RGWSI_Bucket_EP_Ctx ctx(op->ctx());

    int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &ot, &mtime, &attrs, y, dpp);
    if (ret < 0)
      return ret;

    RGWBucketEntryMetadataObject *mdo = new RGWBucketEntryMetadataObject(be, ot.read_version, mtime, std::move(attrs));

    *obj = mdo;

    return 0;
  }

  // Implemented below via RGWMetadataHandlerPut_Bucket.
  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
             RGWMetadataObject *obj,
             RGWObjVersionTracker& objv_tracker,
             optional_yield y,
             const DoutPrefixProvider *dpp,
             RGWMDLogSyncType type, bool from_remote_zone) override;

  // Unlink the bucket from its owner, then remove the entrypoint.  Errors
  // are logged but the call is idempotent and always reports success.
  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
                optional_yield y, const DoutPrefixProvider *dpp) override {
    RGWBucketEntryPoint be;

    real_time orig_mtime;

    RGWSI_Bucket_EP_Ctx ctx(op->ctx());

    int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &objv_tracker, &orig_mtime, nullptr, y, dpp);
    if (ret < 0)
      return ret;

    /*
     * We're unlinking the bucket but we don't want to update the entrypoint here - we're removing
     * it immediately and don't want to invalidate our cached objv_version or the bucket obj removal
     * will incorrectly fail.
     */
    ret = ctl.bucket->unlink_bucket(be.owner, be.bucket, y, dpp, false);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
    }

    ret = svc.bucket->remove_bucket_entrypoint_info(ctx, entry, &objv_tracker, y, dpp);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "could not delete bucket=" << entry << dendl;
    }
    /* idempotent */
    return 0;
  }

  // Run 'f' inside a backend context with default context parameters.
  int call(std::function<int(RGWSI_Bucket_EP_Ctx& ctx)> f) {
    return call(nullopt, f);
  }

  // Run 'f' inside a backend context built from 'bectx_params'.
  int call(std::optional<RGWSI_MetaBackend_CtxParams> bectx_params,
           std::function<int(RGWSI_Bucket_EP_Ctx& ctx)> f) {
    return be_handler->call(bectx_params, [&](RGWSI_MetaBackend_Handler::Op *op) {
      RGWSI_Bucket_EP_Ctx ctx(op->ctx());
      return f(ctx);
    });
  }
};
-
// Put-operation state for bucket entrypoint metadata, driven by
// RGWBucketMetadataHandler::do_put() through the generic
// do_put_operate() machinery.
class RGWMetadataHandlerPut_Bucket : public RGWMetadataHandlerPut_SObj
{
  RGWBucketMetadataHandler *bhandler;
  RGWBucketEntryMetadataObject *obj;
public:
  // NOTE(review): the base-class initializer below passes the member 'obj',
  // which is not assigned from '_obj' until the constructor body runs — so
  // the base receives an indeterminate pointer value.  Confirm whether the
  // base ever uses that argument before encode_obj()/put_checked() run, or
  // whether '_obj' was intended here.
  RGWMetadataHandlerPut_Bucket(RGWBucketMetadataHandler *_handler,
                               RGWSI_MetaBackend_Handler::Op *op, string& entry,
                               RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
                              optional_yield y,
                               RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, op, entry, obj, objv_tracker, y, type, from_remote_zone),
                                                        bhandler(_handler) {
    obj = static_cast<RGWBucketEntryMetadataObject *>(_obj);
  }
  ~RGWMetadataHandlerPut_Bucket() {}

  // Serialize the entrypoint for storage in the metadata backend.
  void encode_obj(bufferlist *bl) override {
    obj->get_ep().encode(*bl);
  }

  int put_checked(const DoutPrefixProvider *dpp) override;
  int put_post(const DoutPrefixProvider *dpp) override;
};
-
-int RGWBucketMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
-                                     RGWMetadataObject *obj,
-                                     RGWObjVersionTracker& objv_tracker,
-                                    optional_yield y,
-                                     const DoutPrefixProvider *dpp,
-                                     RGWMDLogSyncType type, bool from_remote_zone)
-{
-  RGWMetadataHandlerPut_Bucket put_op(this, op, entry, obj, objv_tracker, y, type, from_remote_zone);
-  return do_put_operate(&put_op, dpp);
-}
-
// Store the bucket entrypoint.  If an older version of the object exists,
// carry its attributes forward so the put does not drop them.
int RGWMetadataHandlerPut_Bucket::put_checked(const DoutPrefixProvider *dpp)
{
  RGWBucketEntryMetadataObject *orig_obj = static_cast<RGWBucketEntryMetadataObject *>(old_obj);

  if (orig_obj) {
    obj->set_pattrs(&orig_obj->get_attrs());
  }

  auto& be = obj->get_ep();
  auto mtime = obj->get_mtime();
  auto pattrs = obj->get_pattrs();

  RGWSI_Bucket_EP_Ctx ctx(op->ctx());

  // exclusive=false: overwrite an existing entrypoint if present.
  return bhandler->svc.bucket->store_bucket_entrypoint_info(ctx, entry,
                                                           be,
                                                           false,
                                                           mtime,
                                                           pattrs,
                                                           &objv_tracker,
                                                          y,
                                                           dpp);
}
-
-int RGWMetadataHandlerPut_Bucket::put_post(const DoutPrefixProvider *dpp)
-{
-  auto& be = obj->get_ep();
-
-  int ret;
-
-  /* link bucket */
-  if (be.linked) {
-    ret = bhandler->ctl.bucket->link_bucket(be.owner, be.bucket, be.creation_time, y, dpp, false);
-  } else {
-    ret = bhandler->ctl.bucket->unlink_bucket(be.owner, be.bucket, y, dpp, false);
-  }
-
-  return ret;
-}
-
-static void get_md5_digest(const RGWBucketEntryPoint *be, string& md5_digest) {
-
-   char md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
-   unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
-   bufferlist bl;
-
-   Formatter *f = new JSONFormatter(false);
-   be->dump(f);
-   f->flush(bl);
-
-   MD5 hash;
-   // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
-   hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
-   hash.Update((const unsigned char *)bl.c_str(), bl.length());
-   hash.Final(m);
-
-   buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, md5);
-
-   delete f;
-
-   md5_digest = md5;
-}
-
#define ARCHIVE_META_ATTR RGW_ATTR_PREFIX "zone.archive.info" 

// Archive-zone bookkeeping stored as an xattr on the (renamed) bucket
// instance: remembers the original bucket identity so the "-deleted-"
// rename performed on the archive zone remains traceable.
struct archive_meta_info {
  rgw_bucket orig_bucket;

  // Load from the ARCHIVE_META_ATTR xattr; returns false if the attr is
  // absent or cannot be decoded.
  bool from_attrs(CephContext *cct, map<string, bufferlist>& attrs) {
    auto iter = attrs.find(ARCHIVE_META_ATTR);
    if (iter == attrs.end()) {
      return false;
    }

    auto bliter = iter->second.cbegin();
    try {
      decode(bliter);
    } catch (buffer::error& err) {
      ldout(cct, 0) << "ERROR: failed to decode archive meta info" << dendl;
      return false;
    }

    return true;
  }

  // Serialize into the ARCHIVE_META_ATTR slot of an attr map.
  void store_in_attrs(map<string, bufferlist>& attrs) const {
    encode(attrs[ARCHIVE_META_ATTR]);
  }

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(orig_bucket, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
     DECODE_START(1, bl);
     decode(orig_bucket, bl);
     DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER(archive_meta_info)
-
-class RGWArchiveBucketMetadataHandler : public RGWBucketMetadataHandler {
-public:
-  RGWArchiveBucketMetadataHandler() {}
-
-  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
-                optional_yield y, const DoutPrefixProvider *dpp) override {
-    auto cct = svc.bucket->ctx();
-
-    RGWSI_Bucket_EP_Ctx ctx(op->ctx());
-
-    ldpp_dout(dpp, 5) << "SKIP: bucket removal is not allowed on archive zone: bucket:" << entry << " ... proceeding to rename" << dendl;
-
-    string tenant_name, bucket_name;
-    parse_bucket(entry, &tenant_name, &bucket_name);
-    rgw_bucket entry_bucket;
-    entry_bucket.tenant = tenant_name;
-    entry_bucket.name = bucket_name;
-
-    real_time mtime;
-
-    /* read original entrypoint */
-
-    RGWBucketEntryPoint be;
-    map<string, bufferlist> attrs;
-    int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &objv_tracker, &mtime, &attrs, y, dpp);
-    if (ret < 0) {
-        return ret;
-    }
-
-    string bi_meta_name = RGWSI_Bucket::get_bi_meta_key(be.bucket);
-
-    /* read original bucket instance info */
-
-    map<string, bufferlist> attrs_m;
-    ceph::real_time orig_mtime;
-    RGWBucketInfo old_bi;
-
-    ret = ctl.bucket->read_bucket_instance_info(be.bucket, &old_bi, y, dpp, RGWBucketCtl::BucketInstance::GetParams()
-                                                                    .set_mtime(&orig_mtime)
-                                                                    .set_attrs(&attrs_m));
-    if (ret < 0) {
-        return ret;
-    }
-
-    archive_meta_info ami;
-
-    if (!ami.from_attrs(svc.bucket->ctx(), attrs_m)) {
-      ami.orig_bucket = old_bi.bucket;
-      ami.store_in_attrs(attrs_m);
-    }
-
-    /* generate a new bucket instance. We could have avoided this if we could just point a new
-     * bucket entry point to the old bucket instance, however, due to limitation in the way
-     * we index buckets under the user, bucket entrypoint and bucket instance of the same
-     * bucket need to have the same name, so we need to copy the old bucket instance into
-     * to a new entry with the new name
-     */
-
-    string new_bucket_name;
-
-    RGWBucketInfo new_bi = old_bi;
-    RGWBucketEntryPoint new_be = be;
-
-    string md5_digest;
-
-    get_md5_digest(&new_be, md5_digest);
-    new_bucket_name = ami.orig_bucket.name + "-deleted-" + md5_digest;
-
-    new_bi.bucket.name = new_bucket_name;
-    new_bi.objv_tracker.clear();
-
-    new_be.bucket.name = new_bucket_name;
-
-    ret = ctl.bucket->store_bucket_instance_info(be.bucket, new_bi, y, dpp, RGWBucketCtl::BucketInstance::PutParams()
-                                                                    .set_exclusive(false)
-                                                                    .set_mtime(orig_mtime)
-                                                                    .set_attrs(&attrs_m)
-                                                                    .set_orig_info(&old_bi));
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket instance info for bucket=" << new_bi.bucket << " ret=" << ret << dendl;
-      return ret;
-    }
-
-    /* store a new entrypoint */
-
-    RGWObjVersionTracker ot;
-    ot.generate_new_write_ver(cct);
-
-    ret = svc.bucket->store_bucket_entrypoint_info(ctx, RGWSI_Bucket::get_entrypoint_meta_key(new_be.bucket),
-                                                   new_be, true, mtime, &attrs, nullptr, y, dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl;
-      return ret;
-    }
-
-    /* link new bucket */
-
-    ret = ctl.bucket->link_bucket(new_be.owner, new_be.bucket, new_be.creation_time, y, dpp, false);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to link new bucket for bucket=" << new_be.bucket << " ret=" << ret << dendl;
-      return ret;
-    }
-
-    /* clean up old stuff */
-
-    ret = ctl.bucket->unlink_bucket(be.owner, entry_bucket, y, dpp, false);
-    if (ret < 0) {
-        ldpp_dout(dpp, -1) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
-    }
-
-    // if (ret == -ECANCELED) it means that there was a race here, and someone
-    // wrote to the bucket entrypoint just before we removed it. The question is
-    // whether it was a newly created bucket entrypoint ...  in which case we
-    // should ignore the error and move forward, or whether it is a higher version
-    // of the same bucket instance ... in which we should retry
-    ret = svc.bucket->remove_bucket_entrypoint_info(ctx,
-                                                    RGWSI_Bucket::get_entrypoint_meta_key(be.bucket),
-                                                    &objv_tracker,
-                                                    y,
-                                                    dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl;
-      return ret;
-    }
-
-    ret = ctl.bucket->remove_bucket_instance_info(be.bucket, old_bi, y, dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, -1) << "could not delete bucket=" << entry << dendl;
-    }
-
-
-    /* idempotent */
-
-    return 0;
-  }
-
-  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
-             RGWMetadataObject *obj,
-             RGWObjVersionTracker& objv_tracker,
-             optional_yield y, const DoutPrefixProvider *dpp,
-             RGWMDLogSyncType type, bool from_remote_zone) override {
-    if (entry.find("-deleted-") != string::npos) {
-      RGWObjVersionTracker ot;
-      RGWMetadataObject *robj;
-      int ret = do_get(op, entry, &robj, y, dpp);
-      if (ret != -ENOENT) {
-        if (ret < 0) {
-          return ret;
-        }
-        ot.read_version = robj->get_version();
-        delete robj;
-
-        ret = do_remove(op, entry, ot, y, dpp);
-        if (ret < 0) {
-          return ret;
-        }
-      }
-    }
-
-    return RGWBucketMetadataHandler::do_put(op, entry, obj,
-                                            objv_tracker, y, dpp, type, from_remote_zone);
-  }
-
-};
-
-class RGWBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandlerBase {
-  int read_bucket_instance_entry(RGWSI_Bucket_BI_Ctx& ctx,
-                                 const string& entry,
-                                 RGWBucketCompleteInfo *bi,
-                                 ceph::real_time *pmtime,
-                                 optional_yield y,
-                                 const DoutPrefixProvider *dpp) {
-    return svc.bucket->read_bucket_instance_info(ctx,
-                                                 entry,
-                                                 &bi->info,
-                                                 pmtime, &bi->attrs,
-                                                 y,
-                                                 dpp);
-  }
-
-public:
-  struct Svc {
-    RGWSI_Zone *zone{nullptr};
-    RGWSI_Bucket *bucket{nullptr};
-    RGWSI_BucketIndex *bi{nullptr};
-  } svc;
-
-  rgw::sal::Driver* driver;
-
-  RGWBucketInstanceMetadataHandler(rgw::sal::Driver* driver)
-    : driver(driver) {}
-
-  void init(RGWSI_Zone *zone_svc,
-           RGWSI_Bucket *bucket_svc,
-           RGWSI_BucketIndex *bi_svc) override {
-    base_init(bucket_svc->ctx(),
-              bucket_svc->get_bi_be_handler().get());
-    svc.zone = zone_svc;
-    svc.bucket = bucket_svc;
-    svc.bi = bi_svc;
-  }
-
-  string get_type() override { return "bucket.instance"; }
-
-  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
-    RGWBucketCompleteInfo bci;
-
-    try {
-      decode_json_obj(bci, jo);
-    } catch (JSONDecoder::err& e) {
-      return nullptr;
-    }
-
-    return new RGWBucketInstanceMetadataObject(bci, objv, mtime);
-  }
-
-  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
-    RGWBucketCompleteInfo bci;
-    real_time mtime;
-
-    RGWSI_Bucket_BI_Ctx ctx(op->ctx());
-
-    int ret = svc.bucket->read_bucket_instance_info(ctx, entry, &bci.info, &mtime, &bci.attrs, y, dpp);
-    if (ret < 0)
-      return ret;
-
-    RGWBucketInstanceMetadataObject *mdo = new RGWBucketInstanceMetadataObject(bci, bci.info.objv_tracker.read_version, mtime);
-
-    *obj = mdo;
-
-    return 0;
-  }
-
-  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
-             RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
-            optional_yield y, const DoutPrefixProvider *dpp,
-             RGWMDLogSyncType sync_type, bool from_remote_zone) override;
-
-  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
-                optional_yield y, const DoutPrefixProvider *dpp) override {
-    RGWBucketCompleteInfo bci;
-
-    RGWSI_Bucket_BI_Ctx ctx(op->ctx());
-
-    int ret = read_bucket_instance_entry(ctx, entry, &bci, nullptr, y, dpp);
-    if (ret < 0 && ret != -ENOENT)
-      return ret;
-
-    return svc.bucket->remove_bucket_instance_info(ctx, entry, bci.info, &bci.info.objv_tracker, y, dpp);
-  }
-
-  int call(std::function<int(RGWSI_Bucket_BI_Ctx& ctx)> f) {
-    return call(nullopt, f);
-  }
-
-  int call(std::optional<RGWSI_MetaBackend_CtxParams> bectx_params,
-           std::function<int(RGWSI_Bucket_BI_Ctx& ctx)> f) {
-    return be_handler->call(bectx_params, [&](RGWSI_MetaBackend_Handler::Op *op) {
-      RGWSI_Bucket_BI_Ctx ctx(op->ctx());
-      return f(ctx);
-    });
-  }
-};
-
-class RGWMetadataHandlerPut_BucketInstance : public RGWMetadataHandlerPut_SObj
-{
-  CephContext *cct;
-  RGWBucketInstanceMetadataHandler *bihandler;
-  RGWBucketInstanceMetadataObject *obj;
-public:
-  RGWMetadataHandlerPut_BucketInstance(CephContext *_cct,
-                                       RGWBucketInstanceMetadataHandler *_handler,
-                                       RGWSI_MetaBackend_Handler::Op *_op, string& entry,
-                                       RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
-                                      optional_yield y,
-                                       RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, _op, entry, obj, objv_tracker, y, type, from_remote_zone),
-                                       cct(_cct), bihandler(_handler) {
-    obj = static_cast<RGWBucketInstanceMetadataObject *>(_obj);
-
-    auto& bci = obj->get_bci();
-    obj->set_pattrs(&bci.attrs);
-  }
-
-  void encode_obj(bufferlist *bl) override {
-    obj->get_bucket_info().encode(*bl);
-  }
-
-  int put_check(const DoutPrefixProvider *dpp) override;
-  int put_checked(const DoutPrefixProvider *dpp) override;
-  int put_post(const DoutPrefixProvider *dpp) override;
-};
-
// Store a bucket.instance metadata entry by driving the generic put state
// machine (check -> store -> post) with a bucket-instance-specific operation.
int RGWBucketInstanceMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op,
                                             string& entry,
                                             RGWMetadataObject *obj,
                                             RGWObjVersionTracker& objv_tracker,
                                             optional_yield y,
                                             const DoutPrefixProvider *dpp,
                                             RGWMDLogSyncType type, bool from_remote_zone)
{
  RGWMetadataHandlerPut_BucketInstance put_op(svc.bucket->ctx(), this, op, entry, obj,
                                              objv_tracker, y, type, from_remote_zone);
  return do_put_operate(&put_op, dpp);
}
-
-void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
-                               const RGWZone& zone,
-                               std::optional<uint32_t> shards,
-                               std::optional<rgw::BucketIndexType> type) {
-  layout.current_index.gen = 0;
-  layout.current_index.layout.normal.hash_type = rgw::BucketHashType::Mod;
-
-  layout.current_index.layout.type =
-    type.value_or(rgw::BucketIndexType::Normal);
-
-  if (shards) {
-    layout.current_index.layout.normal.num_shards = *shards;
-  } else if (cct->_conf->rgw_override_bucket_index_max_shards > 0) {
-    layout.current_index.layout.normal.num_shards =
-      cct->_conf->rgw_override_bucket_index_max_shards;
-  } else {
-    layout.current_index.layout.normal.num_shards =
-      zone.bucket_index_max_shards;
-  }
-
-  if (layout.current_index.layout.type == rgw::BucketIndexType::Normal) {
-    layout.logs.push_back(log_layout_from_index(0, layout.current_index));
-  }
-}
-
// Pre-store validation/fixup for a bucket.instance put. Decides whether this
// is a new instance or an update, protects local-only state (index layout,
// placement) from being clobbered by remote-zone sync, and records versions.
int RGWMetadataHandlerPut_BucketInstance::put_check(const DoutPrefixProvider *dpp)
{
  int ret;

  RGWBucketCompleteInfo& bci = obj->get_bci();

  // previously stored object (if any), as read by the generic put flow
  RGWBucketInstanceMetadataObject *orig_obj = static_cast<RGWBucketInstanceMetadataObject *>(old_obj);

  RGWBucketCompleteInfo *old_bci = (orig_obj ? &orig_obj->get_bci() : nullptr);

  const bool exists = (!!orig_obj);

  if (from_remote_zone) {
    // don't sync bucket layout changes
    if (!exists) {
      // replace peer's layout with default-constructed, then apply our defaults
      bci.info.layout = rgw::BucketLayout{};
      init_default_bucket_layout(cct, bci.info.layout,
                                bihandler->svc.zone->get_zone(),
                                std::nullopt, std::nullopt);
    } else {
      // keep the locally stored layout for an existing instance
      bci.info.layout = old_bci->info.layout;
    }
  }

  if (!exists || old_bci->info.bucket.bucket_id != bci.info.bucket.bucket_id) {
    /* a new bucket, we need to select a new bucket placement for it */
    string tenant_name;
    string bucket_name;
    string bucket_instance;
    parse_bucket(entry, &tenant_name, &bucket_name, &bucket_instance);

    RGWZonePlacementInfo rule_info;
    bci.info.bucket.name = bucket_name;
    bci.info.bucket.bucket_id = bucket_instance;
    bci.info.bucket.tenant = tenant_name;
    // if the sync module never writes data, don't require the zone to specify all placement targets
    if (bihandler->svc.zone->sync_module_supports_writes()) {
      ret = bihandler->svc.zone->select_bucket_location_by_rule(dpp, bci.info.placement_rule, &rule_info, y);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "ERROR: select_bucket_placement() returned " << ret << dendl;
        return ret;
      }
    }
    // NOTE(review): when the sync module does not support writes, rule_info
    // stays default-constructed but its index_type is still applied here —
    // confirm the default index type is the intended behavior in that case.
    bci.info.layout.current_index.layout.type = rule_info.index_type;
  } else {
    /* existing bucket, keep its placement */
    bci.info.bucket.explicit_placement = old_bci->info.bucket.explicit_placement;
    bci.info.placement_rule = old_bci->info.placement_rule;
  }

  /* record the read version (if any), store the new version */
  bci.info.objv_tracker.read_version = objv_tracker.read_version;
  bci.info.objv_tracker.write_version = objv_tracker.write_version;

  return 0;
}
-
-int RGWMetadataHandlerPut_BucketInstance::put_checked(const DoutPrefixProvider *dpp)
-{
-  RGWBucketInstanceMetadataObject *orig_obj = static_cast<RGWBucketInstanceMetadataObject *>(old_obj);
-
-  RGWBucketInfo *orig_info = (orig_obj ? &orig_obj->get_bucket_info() : nullptr);
-
-  auto& info = obj->get_bucket_info();
-  auto mtime = obj->get_mtime();
-  auto pattrs = obj->get_pattrs();
-
-  RGWSI_Bucket_BI_Ctx ctx(op->ctx());
-
-  return bihandler->svc.bucket->store_bucket_instance_info(ctx,
-                                                         entry,
-                                                         info,
-                                                         orig_info,
-                                                         false,
-                                                         mtime,
-                                                         pattrs,
-                                                        y,
-                                                         dpp);
-}
-
-int RGWMetadataHandlerPut_BucketInstance::put_post(const DoutPrefixProvider *dpp)
-{
-  RGWBucketCompleteInfo& bci = obj->get_bci();
-
-  objv_tracker = bci.info.objv_tracker;
-
-  int ret = bihandler->svc.bi->init_index(dpp, bci.info, bci.info.layout.current_index);
-  if (ret < 0) {
-    return ret;
-  }
-
-  /* update lifecyle policy */
-  {
-    std::unique_ptr<rgw::sal::Bucket> bucket;
-    ret = bihandler->driver->get_bucket(nullptr, bci.info, &bucket);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << __func__ << " failed to get_bucket(...) for "
-                       << bci.info.bucket.name
-                       << dendl;
-      return ret;
-    }
-
-    auto lc = bihandler->driver->get_rgwlc();
-
-    auto lc_it = bci.attrs.find(RGW_ATTR_LC);
-    if (lc_it != bci.attrs.end()) {
-      ldpp_dout(dpp, 20) << "set lc config for " << bci.info.bucket.name << dendl;
-      ret = lc->set_bucket_config(bucket.get(), bci.attrs, nullptr);
-      if (ret < 0) {
-             ldpp_dout(dpp, 0) << __func__ << " failed to set lc config for "
-                       << bci.info.bucket.name
-                       << dendl;
-             return ret;
-      }
-
-    } else {
-      ldpp_dout(dpp, 20) << "remove lc config for " << bci.info.bucket.name << dendl;
-      ret = lc->remove_bucket_config(bucket.get(), bci.attrs, false /* cannot merge attrs */);
-      if (ret < 0) {
-             ldpp_dout(dpp, 0) << __func__ << " failed to remove lc config for "
-                       << bci.info.bucket.name
-                       << dendl;
-             return ret;
-      }
-    }
-  } /* update lc */
-
-  return STATUS_APPLIED;
-}
-
// Archive-zone variant of the bucket.instance handler: removal requests are
// logged and ignored so the instance entry is retained on the archive zone.
class RGWArchiveBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandler {
public:
  RGWArchiveBucketInstanceMetadataHandler(rgw::sal::Driver* driver)
    : RGWBucketInstanceMetadataHandler(driver) {}

  // N.B. replication of lifecycle policy relies on logic in RGWBucketInstanceMetadataHandler::do_put(...), override with caution

  // Intentionally a no-op (returns success without removing anything).
  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) override {
    ldpp_dout(dpp, 0) << "SKIP: bucket instance removal is not allowed on archive zone: bucket.instance:" << entry << dendl;
    return 0;
  }
};
-
// Wire up the service handles this controller delegates to; the CephContext
// is taken from the zone service.
RGWBucketCtl::RGWBucketCtl(RGWSI_Zone *zone_svc,
                           RGWSI_Bucket *bucket_svc,
                           RGWSI_Bucket_Sync *bucket_sync_svc,
                           RGWSI_BucketIndex *bi_svc,
                           RGWSI_User* user_svc)
  : cct(zone_svc->ctx())
{
  svc.zone = zone_svc;
  svc.bucket = bucket_svc;
  svc.bucket_sync = bucket_sync_svc;
  svc.bi = bi_svc;
  svc.user = user_svc;
}
-
-void RGWBucketCtl::init(RGWUserCtl *user_ctl,
-                        RGWBucketMetadataHandler *_bm_handler,
-                        RGWBucketInstanceMetadataHandler *_bmi_handler,
-                        RGWDataChangesLog *datalog,
-                        const DoutPrefixProvider *dpp)
-{
-  ctl.user = user_ctl;
-
-  bm_handler = _bm_handler;
-  bmi_handler = _bmi_handler;
-
-  bucket_be_handler = bm_handler->get_be_handler();
-  bi_be_handler = bmi_handler->get_be_handler();
-
-  datalog->set_bucket_filter(
-    [this](const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp) {
-      return bucket_exports_data(bucket, y, dpp);
-    });
-}
-
-int RGWBucketCtl::call(std::function<int(RGWSI_Bucket_X_Ctx& ctx)> f) {
-  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ep_ctx) {
-    return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& bi_ctx) {
-      RGWSI_Bucket_X_Ctx ctx{ep_ctx, bi_ctx};
-      return f(ctx);
-    });
-  });
-}
-
-int RGWBucketCtl::read_bucket_entrypoint_info(const rgw_bucket& bucket,
-                                              RGWBucketEntryPoint *info,
-                                              optional_yield y, const DoutPrefixProvider *dpp,
-                                              const Bucket::GetParams& params)
-{
-  return bm_handler->call(params.bectx_params, [&](RGWSI_Bucket_EP_Ctx& ctx) {
-    return svc.bucket->read_bucket_entrypoint_info(ctx,
-                                                   RGWSI_Bucket::get_entrypoint_meta_key(bucket),
-                                                   info,
-                                                   params.objv_tracker,
-                                                   params.mtime,
-                                                   params.attrs,
-                                                  y,
-                                                   dpp,
-                                                   params.cache_info,
-                                                   params.refresh_version);
-  });
-}
-
-int RGWBucketCtl::store_bucket_entrypoint_info(const rgw_bucket& bucket,
-                                               RGWBucketEntryPoint& info,
-                                               optional_yield y,
-                                               const DoutPrefixProvider *dpp,
-                                               const Bucket::PutParams& params)
-{
-  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
-    return svc.bucket->store_bucket_entrypoint_info(ctx,
-                                                    RGWSI_Bucket::get_entrypoint_meta_key(bucket),
-                                                    info,
-                                                    params.exclusive,
-                                                    params.mtime,
-                                                    params.attrs,
-                                                    params.objv_tracker,
-                                                    y,
-                                                    dpp);
-  });
-}
-
-int RGWBucketCtl::remove_bucket_entrypoint_info(const rgw_bucket& bucket,
-                                                optional_yield y,
-                                                const DoutPrefixProvider *dpp,
-                                                const Bucket::RemoveParams& params)
-{
-  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
-    return svc.bucket->remove_bucket_entrypoint_info(ctx,
-                                                     RGWSI_Bucket::get_entrypoint_meta_key(bucket),
-                                                     params.objv_tracker,
-                                                    y,
-                                                     dpp);
-  });
-}
-
-int RGWBucketCtl::read_bucket_instance_info(const rgw_bucket& bucket,
-                                            RGWBucketInfo *info,
-                                            optional_yield y,
-                                            const DoutPrefixProvider *dpp,
-                                            const BucketInstance::GetParams& params)
-{
-  int ret = bmi_handler->call(params.bectx_params, [&](RGWSI_Bucket_BI_Ctx& ctx) {
-    return svc.bucket->read_bucket_instance_info(ctx,
-                                                 RGWSI_Bucket::get_bi_meta_key(bucket),
-                                                 info,
-                                                 params.mtime,
-                                                 params.attrs,
-                                                y,
-                                                 dpp,
-                                                 params.cache_info,
-                                                 params.refresh_version);
-  });
-
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (params.objv_tracker) {
-    *params.objv_tracker = info->objv_tracker;
-  }
-
-  return 0;
-}
-
-int RGWBucketCtl::read_bucket_info(const rgw_bucket& bucket,
-                                   RGWBucketInfo *info,
-                                   optional_yield y,
-                                   const DoutPrefixProvider *dpp,
-                                   const BucketInstance::GetParams& params,
-                                   RGWObjVersionTracker *ep_objv_tracker)
-{
-  const rgw_bucket *b = &bucket;
-
-  std::optional<RGWBucketEntryPoint> ep;
-
-  if (b->bucket_id.empty()) {
-    ep.emplace();
-
-    int r = read_bucket_entrypoint_info(*b, &(*ep), y, dpp, RGWBucketCtl::Bucket::GetParams()
-                                                    .set_bectx_params(params.bectx_params)
-                                                    .set_objv_tracker(ep_objv_tracker));
-    if (r < 0) {
-      return r;
-    }
-
-    b = &ep->bucket;
-  }
-
-  int ret = bmi_handler->call(params.bectx_params, [&](RGWSI_Bucket_BI_Ctx& ctx) {
-    return svc.bucket->read_bucket_instance_info(ctx,
-                                                 RGWSI_Bucket::get_bi_meta_key(*b),
-                                                 info,
-                                                 params.mtime,
-                                                 params.attrs,
-                                                y, dpp,
-                                                 params.cache_info,
-                                                 params.refresh_version);
-  });
-
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (params.objv_tracker) {
-    *params.objv_tracker = info->objv_tracker;
-  }
-
-  return 0;
-}
-
-int RGWBucketCtl::do_store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
-                                                const rgw_bucket& bucket,
-                                                RGWBucketInfo& info,
-                                                optional_yield y,
-                                                const DoutPrefixProvider *dpp,
-                                                const BucketInstance::PutParams& params)
-{
-  if (params.objv_tracker) {
-    info.objv_tracker = *params.objv_tracker;
-  }
-
-  return svc.bucket->store_bucket_instance_info(ctx,
-                                                RGWSI_Bucket::get_bi_meta_key(bucket),
-                                                info,
-                                                params.orig_info,
-                                                params.exclusive,
-                                                params.mtime,
-                                                params.attrs,
-                                                y,
-                                                dpp);
-}
-
// Store a bucket-instance record, opening a BI backend context around the
// do_store_bucket_instance_info() worker.
int RGWBucketCtl::store_bucket_instance_info(const rgw_bucket& bucket,
                                            RGWBucketInfo& info,
                                            optional_yield y,
                                            const DoutPrefixProvider *dpp,
                                            const BucketInstance::PutParams& params)
{
  return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& ctx) {
    return do_store_bucket_instance_info(ctx, bucket, info, y, dpp, params);
  });
}
-
-int RGWBucketCtl::remove_bucket_instance_info(const rgw_bucket& bucket,
-                                              RGWBucketInfo& info,
-                                              optional_yield y,
-                                              const DoutPrefixProvider *dpp,
-                                              const BucketInstance::RemoveParams& params)
-{
-  if (params.objv_tracker) {
-    info.objv_tracker = *params.objv_tracker;
-  }
-
-  return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& ctx) {
-    return svc.bucket->remove_bucket_instance_info(ctx,
-                                                   RGWSI_Bucket::get_bi_meta_key(bucket),
-                                                   info,
-                                                   &info.objv_tracker,
-                                                   y,
-                                                   dpp);
-  });
-}
-
-int RGWBucketCtl::do_store_linked_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
-                                              RGWBucketInfo& info,
-                                              RGWBucketInfo *orig_info,
-                                              bool exclusive, real_time mtime,
-                                              obj_version *pep_objv,
-                                              map<string, bufferlist> *pattrs,
-                                              bool create_entry_point,
-                                             optional_yield y, const DoutPrefixProvider *dpp)
-{
-  bool create_head = !info.has_instance_obj || create_entry_point;
-
-  int ret = svc.bucket->store_bucket_instance_info(ctx.bi,
-                                                   RGWSI_Bucket::get_bi_meta_key(info.bucket),
-                                                   info,
-                                                   orig_info,
-                                                   exclusive,
-                                                   mtime, pattrs,
-                                                  y, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (!create_head)
-    return 0; /* done! */
-
-  RGWBucketEntryPoint entry_point;
-  entry_point.bucket = info.bucket;
-  entry_point.owner = info.owner;
-  entry_point.creation_time = info.creation_time;
-  entry_point.linked = true;
-  RGWObjVersionTracker ot;
-  if (pep_objv && !pep_objv->tag.empty()) {
-    ot.write_version = *pep_objv;
-  } else {
-    ot.generate_new_write_ver(cct);
-    if (pep_objv) {
-      *pep_objv = ot.write_version;
-    }
-  }
-  ret = svc.bucket->store_bucket_entrypoint_info(ctx.ep,
-                                                 RGWSI_Bucket::get_entrypoint_meta_key(info.bucket),
-                                                 entry_point,
-                                                 exclusive,
-                                                 mtime,
-                                                 pattrs,
-                                                 &ot,
-                                                y,
-                                                 dpp);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-int RGWBucketCtl::convert_old_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
-                                          const rgw_bucket& bucket,
-                                          optional_yield y,
-                                          const DoutPrefixProvider *dpp)
-{
-  RGWBucketEntryPoint entry_point;
-  real_time ep_mtime;
-  RGWObjVersionTracker ot;
-  map<string, bufferlist> attrs;
-  RGWBucketInfo info;
-  auto cct = svc.bucket->ctx();
-
-  ldpp_dout(dpp, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket << dendl;
-
-  int ret = svc.bucket->read_bucket_entrypoint_info(ctx.ep,
-                                                    RGWSI_Bucket::get_entrypoint_meta_key(bucket),
-                                                    &entry_point, &ot, &ep_mtime, &attrs, y, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket << dendl;
-    return ret;
-  }
-
-  if (!entry_point.has_bucket_info) {
-    /* already converted! */
-    return 0;
-  }
-
-  info = entry_point.old_bucket_info;
-
-  ot.generate_new_write_ver(cct);
-
-  ret = do_store_linked_bucket_info(ctx, info, nullptr, false, ep_mtime, &ot.write_version, &attrs, true, y, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWBucketCtl::set_bucket_instance_attrs(RGWBucketInfo& bucket_info,
-                                            map<string, bufferlist>& attrs,
-                                            RGWObjVersionTracker *objv_tracker,
-                                            optional_yield y,
-                                            const DoutPrefixProvider *dpp)
-{
-  return call([&](RGWSI_Bucket_X_Ctx& ctx) {
-    rgw_bucket& bucket = bucket_info.bucket;
-
-    if (!bucket_info.has_instance_obj) {
-      /* an old bucket object, need to convert it */
-        int ret = convert_old_bucket_info(ctx, bucket, y, dpp);
-        if (ret < 0) {
-          ldpp_dout(dpp, 0) << "ERROR: failed converting old bucket info: " << ret << dendl;
-          return ret;
-        }
-    }
-
-    return do_store_bucket_instance_info(ctx.bi,
-                                         bucket,
-                                         bucket_info,
-                                         y,
-                                         dpp,
-                                         BucketInstance::PutParams().set_attrs(&attrs)
-                                                                    .set_objv_tracker(objv_tracker)
-                                                                    .set_orig_info(&bucket_info));
-    });
-}
-
-
// Link a bucket to a user (add to the user's bucket list and, optionally,
// update the entrypoint), inside an EP backend context.
int RGWBucketCtl::link_bucket(const rgw_user& user_id,
                              const rgw_bucket& bucket,
                              ceph::real_time creation_time,
                             optional_yield y,
                              const DoutPrefixProvider *dpp,
                              bool update_entrypoint,
                              rgw_ep_info *pinfo)
{
  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
    return do_link_bucket(ctx, user_id, bucket, creation_time,
                          update_entrypoint, pinfo, y, dpp);
  });
}
-
-int RGWBucketCtl::do_link_bucket(RGWSI_Bucket_EP_Ctx& ctx,
-                                 const rgw_user& user_id,
-                                 const rgw_bucket& bucket,
-                                 ceph::real_time creation_time,
-                                 bool update_entrypoint,
-                                 rgw_ep_info *pinfo,
-                                optional_yield y,
-                                 const DoutPrefixProvider *dpp)
-{
-  int ret;
-
-  RGWBucketEntryPoint ep;
-  RGWObjVersionTracker ot;
-  RGWObjVersionTracker& rot = (pinfo) ? pinfo->ep_objv : ot;
-  map<string, bufferlist> attrs, *pattrs = nullptr;
-  string meta_key;
-
-  if (update_entrypoint) {
-    meta_key = RGWSI_Bucket::get_entrypoint_meta_key(bucket);
-    if (pinfo) {
-      ep = pinfo->ep;
-      pattrs = &pinfo->attrs;
-    } else {
-      ret = svc.bucket->read_bucket_entrypoint_info(ctx,
-                                                    meta_key,
-                                                    &ep, &rot,
-                                                    nullptr, &attrs,
-                                                    y, dpp);
-      if (ret < 0 && ret != -ENOENT) {
-        ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() returned: "
-                      << cpp_strerror(-ret) << dendl;
-      }
-      pattrs = &attrs;
-    }
-  }
-
-  ret = svc.user->add_bucket(dpp, user_id, bucket, creation_time, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: error adding bucket to user directory:"
-                 << " user=" << user_id
-                  << " bucket=" << bucket
-                 << " err=" << cpp_strerror(-ret)
-                 << dendl;
-    goto done_err;
-  }
-
-  if (!update_entrypoint)
-    return 0;
-
-  ep.linked = true;
-  ep.owner = user_id;
-  ep.bucket = bucket;
-  ret = svc.bucket->store_bucket_entrypoint_info(
-    ctx, meta_key, ep, false, real_time(), pattrs, &rot, y, dpp);
-  if (ret < 0)
-    goto done_err;
-
-  return 0;
-
-done_err:
-  int r = do_unlink_bucket(ctx, user_id, bucket, true, y, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed unlinking bucket on error cleanup: "
-                           << cpp_strerror(-r) << dendl;
-  }
-  return ret;
-}
-
-int RGWBucketCtl::unlink_bucket(const rgw_user& user_id, const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp, bool update_entrypoint)
-{
-  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
-    return do_unlink_bucket(ctx, user_id, bucket, update_entrypoint, y, dpp);
-  });
-}
-
-int RGWBucketCtl::do_unlink_bucket(RGWSI_Bucket_EP_Ctx& ctx,
-                                   const rgw_user& user_id,
-                                   const rgw_bucket& bucket,
-                                   bool update_entrypoint,
-                                  optional_yield y,
-                                   const DoutPrefixProvider *dpp)
-{
-  int ret = svc.user->remove_bucket(dpp, user_id, bucket, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: error removing bucket from directory: "
-        << cpp_strerror(-ret)<< dendl;
-  }
-
-  if (!update_entrypoint)
-    return 0;
-
-  RGWBucketEntryPoint ep;
-  RGWObjVersionTracker ot;
-  map<string, bufferlist> attrs;
-  string meta_key = RGWSI_Bucket::get_entrypoint_meta_key(bucket);
-  ret = svc.bucket->read_bucket_entrypoint_info(ctx, meta_key, &ep, &ot, nullptr, &attrs, y, dpp);
-  if (ret == -ENOENT)
-    return 0;
-  if (ret < 0)
-    return ret;
-
-  if (!ep.linked)
-    return 0;
-
-  if (ep.owner != user_id) {
-    ldpp_dout(dpp, 0) << "bucket entry point user mismatch, can't unlink bucket: " << ep.owner << " != " << user_id << dendl;
-    return -EINVAL;
-  }
-
-  ep.linked = false;
-  return svc.bucket->store_bucket_entrypoint_info(ctx, meta_key, ep, false, real_time(), &attrs, &ot, y, dpp);
-}
-
-// TODO: remove RGWRados dependency for bucket listing
-int RGWBucketCtl::chown(rgw::sal::Driver* driver, rgw::sal::Bucket* bucket,
-                        const rgw_user& user_id, const std::string& display_name,
-                        const std::string& marker, optional_yield y, const DoutPrefixProvider *dpp)
-{
-  map<string, bool> common_prefixes;
-
-  rgw::sal::Bucket::ListParams params;
-  rgw::sal::Bucket::ListResults results;
-
-  params.list_versions = true;
-  params.allow_unordered = true;
-  params.marker = marker;
-
-  int count = 0;
-  int max_entries = 1000;
-
-  //Loop through objects and update object acls to point to bucket owner
-
-  do {
-    RGWObjectCtx obj_ctx(driver);
-    results.objs.clear();
-    int ret = bucket->list(dpp, params, max_entries, results, y);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: list objects failed: " << cpp_strerror(-ret) << dendl;
-      return ret;
-    }
-
-    params.marker = results.next_marker;
-    count += results.objs.size();
-
-    for (const auto& obj : results.objs) {
-      std::unique_ptr<rgw::sal::Object> r_obj = bucket->get_object(obj.key);
-
-      ret = r_obj->get_obj_attrs(y, dpp);
-      if (ret < 0){
-        ldpp_dout(dpp, 0) << "ERROR: failed to read object " << obj.key.name << cpp_strerror(-ret) << dendl;
-        continue;
-      }
-      const auto& aiter = r_obj->get_attrs().find(RGW_ATTR_ACL);
-      if (aiter == r_obj->get_attrs().end()) {
-        ldpp_dout(dpp, 0) << "ERROR: no acls found for object " << obj.key.name << " .Continuing with next object." << dendl;
-        continue;
-      } else {
-        bufferlist& bl = aiter->second;
-        RGWAccessControlPolicy policy(driver->ctx());
-        ACLOwner owner;
-        try {
-          decode(policy, bl);
-          owner = policy.get_owner();
-        } catch (buffer::error& err) {
-          ldpp_dout(dpp, 0) << "ERROR: decode policy failed" << err.what()
-                                << dendl;
-          return -EIO;
-        }
-
-        //Get the ACL from the policy
-        RGWAccessControlList& acl = policy.get_acl();
-
-        //Remove grant that is set to old owner
-        acl.remove_canon_user_grant(owner.get_id());
-
-        //Create a grant and add grant
-        ACLGrant grant;
-        grant.set_canon(user_id, display_name, RGW_PERM_FULL_CONTROL);
-        acl.add_grant(&grant);
-
-        //Update the ACL owner to the new user
-        owner.set_id(user_id);
-        owner.set_name(display_name);
-        policy.set_owner(owner);
-
-        bl.clear();
-        encode(policy, bl);
-
-       r_obj->set_atomic();
-       map<string, bufferlist> attrs;
-       attrs[RGW_ATTR_ACL] = bl;
-       ret = r_obj->set_obj_attrs(dpp, &attrs, nullptr, y);
-        if (ret < 0) {
-          ldpp_dout(dpp, 0) << "ERROR: modify attr failed " << cpp_strerror(-ret) << dendl;
-          return ret;
-        }
-      }
-    }
-    cerr << count << " objects processed in " << bucket
-        << ". Next marker " << params.marker.name << std::endl;
-  } while(results.is_truncated);
-  return 0;
-}
-
-int RGWBucketCtl::read_bucket_stats(const rgw_bucket& bucket,
-                                    RGWBucketEnt *result,
-                                    optional_yield y,
-                                    const DoutPrefixProvider *dpp)
-{
-  return call([&](RGWSI_Bucket_X_Ctx& ctx) {
-    return svc.bucket->read_bucket_stats(ctx, bucket, result, y, dpp);
-  });
-}
-
-int RGWBucketCtl::read_buckets_stats(map<string, RGWBucketEnt>& m,
-                                     optional_yield y, const DoutPrefixProvider *dpp)
-{
-  return call([&](RGWSI_Bucket_X_Ctx& ctx) {
-    return svc.bucket->read_buckets_stats(ctx, m, y, dpp);
-  });
-}
-
-int RGWBucketCtl::sync_user_stats(const DoutPrefixProvider *dpp, 
-                                  const rgw_user& user_id,
-                                  const RGWBucketInfo& bucket_info,
-                                 optional_yield y,
-                                  RGWBucketEnt* pent)
-{
-  RGWBucketEnt ent;
-  if (!pent) {
-    pent = &ent;
-  }
-  int r = svc.bi->read_stats(dpp, bucket_info, pent, null_yield);
-  if (r < 0) {
-    ldpp_dout(dpp, 20) << __func__ << "(): failed to read bucket stats (r=" << r << ")" << dendl;
-    return r;
-  }
-
-  return svc.user->flush_bucket_stats(dpp, user_id, *pent, y);
-}
-
-int RGWBucketCtl::get_sync_policy_handler(std::optional<rgw_zone_id> zone,
-                                          std::optional<rgw_bucket> bucket,
-                                          RGWBucketSyncPolicyHandlerRef *phandler,
-                                          optional_yield y,
-                                          const DoutPrefixProvider *dpp)
-{
-  int r = call([&](RGWSI_Bucket_X_Ctx& ctx) {
-    return svc.bucket_sync->get_policy_handler(ctx, zone, bucket, phandler, y, dpp);
-  });
-  if (r < 0) {
-    ldpp_dout(dpp, 20) << __func__ << "(): failed to get policy handler for bucket=" << bucket << " (r=" << r << ")" << dendl;
-    return r;
-  }
-  return 0;
-}
-
-int RGWBucketCtl::bucket_exports_data(const rgw_bucket& bucket,
-                                      optional_yield y,
-                                      const DoutPrefixProvider *dpp)
-{
-
-  RGWBucketSyncPolicyHandlerRef handler;
-
-  int r = get_sync_policy_handler(std::nullopt, bucket, &handler, y, dpp);
-  if (r < 0) {
-    return r;
-  }
-
-  return handler->bucket_exports_data();
-}
-
-int RGWBucketCtl::bucket_imports_data(const rgw_bucket& bucket,
-                                      optional_yield y, const DoutPrefixProvider *dpp)
-{
-
-  RGWBucketSyncPolicyHandlerRef handler;
-
-  int r = get_sync_policy_handler(std::nullopt, bucket, &handler, y, dpp);
-  if (r < 0) {
-    return r;
-  }
-
-  return handler->bucket_imports_data();
-}
-
-RGWBucketMetadataHandlerBase* RGWBucketMetaHandlerAllocator::alloc()
-{
-  return new RGWBucketMetadataHandler();
-}
-
-RGWBucketInstanceMetadataHandlerBase* RGWBucketInstanceMetaHandlerAllocator::alloc(rgw::sal::Driver* driver)
-{
-  return new RGWBucketInstanceMetadataHandler(driver);
-}
-
-RGWBucketMetadataHandlerBase* RGWArchiveBucketMetaHandlerAllocator::alloc()
-{
-  return new RGWArchiveBucketMetadataHandler();
-}
-
-RGWBucketInstanceMetadataHandlerBase* RGWArchiveBucketInstanceMetaHandlerAllocator::alloc(rgw::sal::Driver* driver)
-{
-  return new RGWArchiveBucketInstanceMetadataHandler(driver);
-}
-
-
-void RGWBucketEntryPoint::generate_test_instances(list<RGWBucketEntryPoint*>& o)
-{
-  RGWBucketEntryPoint *bp = new RGWBucketEntryPoint();
-  init_bucket(&bp->bucket, "tenant", "bucket", "pool", ".index.pool", "marker", "10");
-  bp->owner = "owner";
-  bp->creation_time = ceph::real_clock::from_ceph_timespec({ceph_le32(2), ceph_le32(3)});
-
-  o.push_back(bp);
-  o.push_back(new RGWBucketEntryPoint);
-}
-
-void RGWBucketEntryPoint::dump(Formatter *f) const
-{
-  encode_json("bucket", bucket, f);
-  encode_json("owner", owner, f);
-  utime_t ut(creation_time);
-  encode_json("creation_time", ut, f);
-  encode_json("linked", linked, f);
-  encode_json("has_bucket_info", has_bucket_info, f);
-  if (has_bucket_info) {
-    encode_json("old_bucket_info", old_bucket_info, f);
-  }
-}
-
-void RGWBucketEntryPoint::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("bucket", bucket, obj);
-  JSONDecoder::decode_json("owner", owner, obj);
-  utime_t ut;
-  JSONDecoder::decode_json("creation_time", ut, obj);
-  creation_time = ut.to_real_time();
-  JSONDecoder::decode_json("linked", linked, obj);
-  JSONDecoder::decode_json("has_bucket_info", has_bucket_info, obj);
-  if (has_bucket_info) {
-    JSONDecoder::decode_json("old_bucket_info", old_bucket_info, obj);
-  }
-}
-
diff --git a/src/rgw/store/rados/rgw_bucket.h b/src/rgw/store/rados/rgw_bucket.h
deleted file mode 100644 (file)
index 636a1f2..0000000
+++ /dev/null
@@ -1,765 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-#include <string>
-#include <memory>
-#include <variant>
-
-#include <boost/container/flat_map.hpp>
-#include <boost/container/flat_set.hpp>
-
-#include "include/types.h"
-#include "rgw_common.h"
-#include "rgw_tools.h"
-#include "rgw_metadata.h"
-
-#include "rgw_string.h"
-#include "rgw_sal.h"
-
-#include "common/Formatter.h"
-#include "common/lru_map.h"
-#include "common/ceph_time.h"
-
-#include "rgw_formats.h"
-
-#include "services/svc_bucket_types.h"
-#include "services/svc_bucket_sync.h"
-
-// define as static when RGWBucket implementation completes
-extern void rgw_get_buckets_obj(const rgw_user& user_id, std::string& buckets_obj_id);
-
-class RGWSI_Meta;
-class RGWBucketMetadataHandler;
-class RGWBucketInstanceMetadataHandler;
-class RGWUserCtl;
-class RGWBucketCtl;
-class RGWZone;
-struct RGWZoneParams;
-
-extern void init_bucket(rgw_bucket *b, const char *t, const char *n, const char *dp, const char *ip, const char *m, const char *id);
-extern int rgw_bucket_parse_bucket_key(CephContext *cct, const std::string& key,
-                                       rgw_bucket* bucket, int *shard_id);
-
-extern std::string rgw_make_bucket_entry_name(const std::string& tenant_name,
-                                              const std::string& bucket_name);
-
-extern void rgw_parse_url_bucket(const std::string& bucket,
-                                 const std::string& auth_tenant,
-                                 std::string &tenant_name, std::string &bucket_name);
-
-// this is used as a filter to RGWRados::cls_bucket_list_ordered; it
-// conforms to the type RGWBucketListNameFilter
-extern bool rgw_bucket_object_check_filter(const std::string& oid);
-
-void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
-                               const RGWZone& zone,
-                               std::optional<uint32_t> shards,
-                               std::optional<rgw::BucketIndexType> type);
-
-struct RGWBucketCompleteInfo {
-  RGWBucketInfo info;
-  std::map<std::string, bufferlist> attrs;
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-
-class RGWBucketEntryMetadataObject : public RGWMetadataObject {
-  RGWBucketEntryPoint ep;
-  std::map<std::string, bufferlist> attrs;
-public:
-  RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, const obj_version& v, real_time m) : ep(_ep) {
-    objv = v;
-    mtime = m;
-    set_pattrs (&attrs);
-  }
-  RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, const obj_version& v, real_time m, std::map<std::string, bufferlist>&& _attrs) :
-    ep(_ep), attrs(std::move(_attrs)) {
-    objv = v;
-    mtime = m;
-    set_pattrs (&attrs);
-  }
-
-  void dump(Formatter *f) const override {
-    ep.dump(f);
-  }
-
-  RGWBucketEntryPoint& get_ep() {
-    return ep;
-  }
-
-  std::map<std::string, bufferlist>& get_attrs() {
-    return attrs;
-  }
-};
-
-class RGWBucketInstanceMetadataObject : public RGWMetadataObject {
-  RGWBucketCompleteInfo info;
-public:
-  RGWBucketInstanceMetadataObject() {}
-  RGWBucketInstanceMetadataObject(RGWBucketCompleteInfo& i, const obj_version& v, real_time m) : info(i) {
-    objv = v;
-    mtime = m;
-  }
-
-  void dump(Formatter *f) const override {
-    info.dump(f);
-  }
-
-  void decode_json(JSONObj *obj) {
-    info.decode_json(obj);
-  }
-
-  RGWBucketCompleteInfo& get_bci() {
-    return info;
-  }
-  RGWBucketInfo& get_bucket_info() {
-    return info.info;
-  }
-};
-
-/**
- * store a list of the user's buckets, with associated functinos.
- */
-class RGWUserBuckets {
-  std::map<std::string, RGWBucketEnt> buckets;
-
-public:
-  RGWUserBuckets() = default;
-  RGWUserBuckets(RGWUserBuckets&&) = default;
-
-  RGWUserBuckets& operator=(const RGWUserBuckets&) = default;
-
-  void encode(bufferlist& bl) const {
-    using ceph::encode;
-    encode(buckets, bl);
-  }
-  void decode(bufferlist::const_iterator& bl) {
-    using ceph::decode;
-    decode(buckets, bl);
-  }
-  /**
-   * Check if the user owns a bucket by the given name.
-   */
-  bool owns(std::string& name) {
-    std::map<std::string, RGWBucketEnt>::iterator iter;
-    iter = buckets.find(name);
-    return (iter != buckets.end());
-  }
-
-  /**
-   * Add a (created) bucket to the user's bucket list.
-   */
-  void add(const RGWBucketEnt& bucket) {
-    buckets[bucket.bucket.name] = bucket;
-  }
-
-  /**
-   * Remove a bucket from the user's list by name.
-   */
-  void remove(const std::string& name) {
-    std::map<std::string, RGWBucketEnt>::iterator iter;
-    iter = buckets.find(name);
-    if (iter != buckets.end()) {
-      buckets.erase(iter);
-    }
-  }
-
-  /**
-   * Get the user's buckets as a map.
-   */
-  std::map<std::string, RGWBucketEnt>& get_buckets() { return buckets; }
-
-  /**
-   * Cleanup data structure
-   */
-  void clear() { buckets.clear(); }
-
-  size_t count() { return buckets.size(); }
-};
-WRITE_CLASS_ENCODER(RGWUserBuckets)
-
-class RGWBucketMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
-public:
-  virtual ~RGWBucketMetadataHandlerBase() {}
-  virtual void init(RGWSI_Bucket *bucket_svc,
-                    RGWBucketCtl *bucket_ctl) = 0;
-
-};
-
-class RGWBucketInstanceMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
-public:
-  virtual ~RGWBucketInstanceMetadataHandlerBase() {}
-  virtual void init(RGWSI_Zone *zone_svc,
-                    RGWSI_Bucket *bucket_svc,
-                    RGWSI_BucketIndex *bi_svc) = 0;
-};
-
-class RGWBucketMetaHandlerAllocator {
-public:
-  static RGWBucketMetadataHandlerBase *alloc();
-};
-
-class RGWBucketInstanceMetaHandlerAllocator {
-public:
-  static RGWBucketInstanceMetadataHandlerBase *alloc(rgw::sal::Driver* driver);
-};
-
-class RGWArchiveBucketMetaHandlerAllocator {
-public:
-  static RGWBucketMetadataHandlerBase *alloc();
-};
-
-class RGWArchiveBucketInstanceMetaHandlerAllocator {
-public:
-  static RGWBucketInstanceMetadataHandlerBase *alloc(rgw::sal::Driver* driver);
-};
-
-extern int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, rgw_obj_key& key);
-
-extern int rgw_object_get_attr(rgw::sal::Driver* driver, rgw::sal::Object* obj,
-                              const char* attr_name, bufferlist& out_bl,
-                              optional_yield y);
-
-extern void check_bad_user_bucket_mapping(rgw::sal::Driver* driver, rgw::sal::User* user, bool fix, optional_yield y, const DoutPrefixProvider *dpp);
-
-struct RGWBucketAdminOpState {
-  rgw_user uid;
-  std::string display_name;
-  std::string bucket_name;
-  std::string bucket_id;
-  std::string object_name;
-  std::string new_bucket_name;
-
-  bool list_buckets;
-  bool stat_buckets;
-  bool check_objects;
-  bool fix_index;
-  bool delete_child_objects;
-  bool bucket_stored;
-  bool sync_bucket;
-  int max_aio = 0;
-
-  std::unique_ptr<rgw::sal::Bucket>  bucket;
-
-  RGWQuotaInfo quota;
-  RGWRateLimitInfo ratelimit_info;
-
-  void set_fetch_stats(bool value) { stat_buckets = value; }
-  void set_check_objects(bool value) { check_objects = value; }
-  void set_fix_index(bool value) { fix_index = value; }
-  void set_delete_children(bool value) { delete_child_objects = value; }
-
-  void set_max_aio(int value) { max_aio = value; }
-
-  void set_user_id(const rgw_user& user_id) {
-    if (!user_id.empty())
-      uid = user_id;
-  }
-  void set_tenant(const std::string& tenant_str) {
-    uid.tenant = tenant_str;
-  }
-  void set_bucket_name(const std::string& bucket_str) {
-    bucket_name = bucket_str; 
-  }
-  void set_object(std::string& object_str) {
-    object_name = object_str;
-  }
-  void set_new_bucket_name(std::string& new_bucket_str) {
-    new_bucket_name = new_bucket_str;
-  }
-  void set_quota(RGWQuotaInfo& value) {
-    quota = value;
-  }
-  void set_bucket_ratelimit(RGWRateLimitInfo& value) {
-    ratelimit_info = value;
-  }
-
-
-  void set_sync_bucket(bool value) { sync_bucket = value; }
-
-  rgw_user& get_user_id() { return uid; }
-  std::string& get_user_display_name() { return display_name; }
-  std::string& get_bucket_name() { return bucket_name; }
-  std::string& get_object_name() { return object_name; }
-  std::string& get_tenant() { return uid.tenant; }
-
-  rgw::sal::Bucket* get_bucket() { return bucket.get(); }
-  void set_bucket(std::unique_ptr<rgw::sal::Bucket> _bucket) {
-    bucket = std::move(_bucket);
-    bucket_stored = true;
-  }
-
-  void set_bucket_id(const std::string& bi) {
-    bucket_id = bi;
-  }
-  const std::string& get_bucket_id() { return bucket_id; }
-
-  bool will_fetch_stats() { return stat_buckets; }
-  bool will_fix_index() { return fix_index; }
-  bool will_delete_children() { return delete_child_objects; }
-  bool will_check_objects() { return check_objects; }
-  bool is_user_op() { return !uid.empty(); }
-  bool is_system_op() { return uid.empty(); }
-  bool has_bucket_stored() { return bucket_stored; }
-  int get_max_aio() { return max_aio; }
-  bool will_sync_bucket() { return sync_bucket; }
-
-  RGWBucketAdminOpState() : list_buckets(false), stat_buckets(false), check_objects(false), 
-                            fix_index(false), delete_child_objects(false),
-                            bucket_stored(false), sync_bucket(true)  {}
-};
-
-
-/*
- * A simple wrapper class for administrative bucket operations
- */
-class RGWBucket {
-  RGWUserBuckets buckets;
-  rgw::sal::Driver* driver;
-  RGWAccessHandle handle;
-
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  std::unique_ptr<rgw::sal::User> user;
-
-  bool failure;
-
-  RGWObjVersionTracker ep_objv; // entrypoint object version
-
-public:
-  RGWBucket() : driver(NULL), handle(NULL), failure(false) {}
-  int init(rgw::sal::Driver* storage, RGWBucketAdminOpState& op_state, optional_yield y,
-             const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
-
-  int check_bad_index_multipart(RGWBucketAdminOpState& op_state,
-              RGWFormatterFlusher& flusher,
-              const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
-
-  int check_object_index(const DoutPrefixProvider *dpp, 
-                         RGWBucketAdminOpState& op_state,
-                         RGWFormatterFlusher& flusher,
-                         optional_yield y,
-                         std::string *err_msg = NULL);
-
-  int check_index(const DoutPrefixProvider *dpp,
-          RGWBucketAdminOpState& op_state,
-          std::map<RGWObjCategory, RGWStorageStats>& existing_stats,
-          std::map<RGWObjCategory, RGWStorageStats>& calculated_stats,
-          std::string *err_msg = NULL);
-
-  int chown(RGWBucketAdminOpState& op_state, const std::string& marker,
-            optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
-  int set_quota(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
-
-  int remove_object(const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
-  int policy_bl_to_stream(bufferlist& bl, std::ostream& o);
-  int get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy, optional_yield y, const DoutPrefixProvider *dpp);
-  int sync(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
-
-  void clear_failure() { failure = false; }
-
-  const RGWBucketInfo& get_bucket_info() const { return bucket->get_info(); }
-};
-
-class RGWBucketAdminOp {
-public:
-  static int get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                  RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
-  static int get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                  RGWAccessControlPolicy& policy, const DoutPrefixProvider *dpp);
-  static int dump_s3_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                  std::ostream& os, const DoutPrefixProvider *dpp);
-
-  static int unlink(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
-  static int link(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
-  static int chown(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const std::string& marker, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
-
-  static int check_index(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                  RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp);
-
-  static int remove_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, optional_yield y,
-                          const DoutPrefixProvider *dpp, bool bypass_gc = false, bool keep_index_consistent = true);
-  static int remove_object(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
-  static int info(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp);
-  static int limit_check(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                        const std::list<std::string>& user_ids,
-                        RGWFormatterFlusher& flusher, optional_yield y,
-                         const DoutPrefixProvider *dpp,
-                        bool warnings_only = false);
-  static int set_quota(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
-
-  static int list_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                                 RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
-
-  static int clear_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                                  RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
-  static int fix_lc_shards(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                           RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
-  static int fix_obj_expiry(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
-                           RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp, bool dry_run = false);
-
-  static int sync_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
-};
-
-struct rgw_ep_info {
-  RGWBucketEntryPoint &ep;
-  std::map<std::string, buffer::list>& attrs;
-  RGWObjVersionTracker ep_objv;
-  rgw_ep_info(RGWBucketEntryPoint &ep, std::map<std::string, bufferlist>& attrs)
-    : ep(ep), attrs(attrs) {}
-};
-
-class RGWBucketCtl {
-  CephContext *cct;
-
-  struct Svc {
-    RGWSI_Zone *zone{nullptr};
-    RGWSI_Bucket *bucket{nullptr};
-    RGWSI_Bucket_Sync *bucket_sync{nullptr};
-    RGWSI_BucketIndex *bi{nullptr};
-    RGWSI_User* user = nullptr;
-  } svc;
-
-  struct Ctl {
-    RGWUserCtl *user{nullptr};
-  } ctl;
-
-  RGWBucketMetadataHandler *bm_handler;
-  RGWBucketInstanceMetadataHandler *bmi_handler;
-
-  RGWSI_Bucket_BE_Handler bucket_be_handler; /* bucket backend handler */
-  RGWSI_BucketInstance_BE_Handler bi_be_handler; /* bucket instance backend handler */
-
-  int call(std::function<int(RGWSI_Bucket_X_Ctx& ctx)> f);
-
-public:
-  RGWBucketCtl(RGWSI_Zone *zone_svc,
-               RGWSI_Bucket *bucket_svc,
-               RGWSI_Bucket_Sync *bucket_sync_svc,
-               RGWSI_BucketIndex *bi_svc,
-               RGWSI_User* user_svc);
-
-  void init(RGWUserCtl *user_ctl,
-            RGWBucketMetadataHandler *_bm_handler,
-            RGWBucketInstanceMetadataHandler *_bmi_handler,
-            RGWDataChangesLog *datalog,
-            const DoutPrefixProvider *dpp);
-
-  struct Bucket {
-    struct GetParams {
-      RGWObjVersionTracker *objv_tracker{nullptr};
-      real_time *mtime{nullptr};
-      std::map<std::string, bufferlist> *attrs{nullptr};
-      rgw_cache_entry_info *cache_info{nullptr};
-      boost::optional<obj_version> refresh_version;
-      std::optional<RGWSI_MetaBackend_CtxParams> bectx_params;
-
-      GetParams() {}
-
-      GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-        objv_tracker = _objv_tracker;
-        return *this;
-      }
-
-      GetParams& set_mtime(ceph::real_time *_mtime) {
-        mtime = _mtime;
-        return *this;
-      }
-
-      GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
-        attrs = _attrs;
-        return *this;
-      }
-
-      GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
-        cache_info = _cache_info;
-        return *this;
-      }
-
-      GetParams& set_refresh_version(const obj_version& _refresh_version) {
-        refresh_version = _refresh_version;
-        return *this;
-      }
-
-      GetParams& set_bectx_params(std::optional<RGWSI_MetaBackend_CtxParams> _bectx_params) {
-        bectx_params = _bectx_params;
-        return *this;
-      }
-    };
-
-    struct PutParams {
-      RGWObjVersionTracker *objv_tracker{nullptr};
-      ceph::real_time mtime;
-      bool exclusive{false};
-      std::map<std::string, bufferlist> *attrs{nullptr};
-
-      PutParams() {}
-
-      PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-        objv_tracker = _objv_tracker;
-        return *this;
-      }
-
-      PutParams& set_mtime(const ceph::real_time& _mtime) {
-        mtime = _mtime;
-        return *this;
-      }
-
-      PutParams& set_exclusive(bool _exclusive) {
-        exclusive = _exclusive;
-        return *this;
-      }
-
-      PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
-        attrs = _attrs;
-        return *this;
-      }
-    };
-
-    struct RemoveParams {
-      RGWObjVersionTracker *objv_tracker{nullptr};
-
-      RemoveParams() {}
-
-      RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-        objv_tracker = _objv_tracker;
-        return *this;
-      }
-    };
-  };
-
-  struct BucketInstance {
-    struct GetParams {
-      real_time *mtime{nullptr};
-      std::map<std::string, bufferlist> *attrs{nullptr};
-      rgw_cache_entry_info *cache_info{nullptr};
-      boost::optional<obj_version> refresh_version;
-      RGWObjVersionTracker *objv_tracker{nullptr};
-      std::optional<RGWSI_MetaBackend_CtxParams> bectx_params;
-
-      GetParams() {}
-
-      GetParams& set_mtime(ceph::real_time *_mtime) {
-        mtime = _mtime;
-        return *this;
-      }
-
-      GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
-        attrs = _attrs;
-        return *this;
-      }
-
-      GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
-        cache_info = _cache_info;
-        return *this;
-      }
-
-      GetParams& set_refresh_version(const obj_version& _refresh_version) {
-        refresh_version = _refresh_version;
-        return *this;
-      }
-
-      GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-        objv_tracker = _objv_tracker;
-        return *this;
-      }
-
-      GetParams& set_bectx_params(std::optional<RGWSI_MetaBackend_CtxParams> _bectx_params) {
-        bectx_params = _bectx_params;
-        return *this;
-      }
-    };
-
-    struct PutParams {
-      std::optional<RGWBucketInfo *> orig_info; /* nullopt: orig_info was not fetched,
-                                                   nullptr: orig_info was not found (new bucket instance */
-      ceph::real_time mtime;
-      bool exclusive{false};
-      std::map<std::string, bufferlist> *attrs{nullptr};
-      RGWObjVersionTracker *objv_tracker{nullptr};
-
-      PutParams() {}
-
-      PutParams& set_orig_info(RGWBucketInfo *pinfo) {
-        orig_info = pinfo;
-        return *this;
-      }
-
-      PutParams& set_mtime(const ceph::real_time& _mtime) {
-        mtime = _mtime;
-        return *this;
-      }
-
-      PutParams& set_exclusive(bool _exclusive) {
-        exclusive = _exclusive;
-        return *this;
-      }
-
-      PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
-        attrs = _attrs;
-        return *this;
-      }
-
-      PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-        objv_tracker = _objv_tracker;
-        return *this;
-      }
-    };
-
-    struct RemoveParams {
-      RGWObjVersionTracker *objv_tracker{nullptr};
-
-      RemoveParams() {}
-
-      RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-        objv_tracker = _objv_tracker;
-        return *this;
-      }
-    };
-  };
-
-  /* bucket entrypoint */
-  int read_bucket_entrypoint_info(const rgw_bucket& bucket,
-                                  RGWBucketEntryPoint *info,
-                                  optional_yield y,
-                                  const DoutPrefixProvider *dpp,
-                                  const Bucket::GetParams& params = {});
-  int store_bucket_entrypoint_info(const rgw_bucket& bucket,
-                                   RGWBucketEntryPoint& info,
-                                   optional_yield y,
-                                   const DoutPrefixProvider *dpp,
-                                   const Bucket::PutParams& params = {});
-  int remove_bucket_entrypoint_info(const rgw_bucket& bucket,
-                                    optional_yield y,
-                                    const DoutPrefixProvider *dpp,
-                                    const Bucket::RemoveParams& params = {});
-
-  /* bucket instance */
-  int read_bucket_instance_info(const rgw_bucket& bucket,
-                                  RGWBucketInfo *info,
-                                  optional_yield y,
-                                  const DoutPrefixProvider *dpp,
-                                  const BucketInstance::GetParams& params = {});
-  int store_bucket_instance_info(const rgw_bucket& bucket,
-                                 RGWBucketInfo& info,
-                                 optional_yield y,
-                                 const DoutPrefixProvider *dpp,
-                                 const BucketInstance::PutParams& params = {});
-  int remove_bucket_instance_info(const rgw_bucket& bucket,
-                                  RGWBucketInfo& info,
-                                  optional_yield y,
-                                  const DoutPrefixProvider *dpp,
-                                  const BucketInstance::RemoveParams& params = {});
-
-  /*
-   * bucket_id may or may not be provided
-   *
-   * ep_objv_tracker might not be populated even if provided. Will only be set if entrypoint is read
-   * (that is: if bucket_id is empty).
-   */
-  int read_bucket_info(const rgw_bucket& bucket,
-                       RGWBucketInfo *info,
-                       optional_yield y,
-                       const DoutPrefixProvider *dpp,
-                       const BucketInstance::GetParams& params = {},
-                      RGWObjVersionTracker *ep_objv_tracker = nullptr);
-
-
-  int set_bucket_instance_attrs(RGWBucketInfo& bucket_info,
-                                std::map<std::string, bufferlist>& attrs,
-                                RGWObjVersionTracker *objv_tracker,
-                                optional_yield y,
-                                const DoutPrefixProvider *dpp);
-
-  /* user/bucket */
-  int link_bucket(const rgw_user& user_id,
-                  const rgw_bucket& bucket,
-                  ceph::real_time creation_time,
-                 optional_yield y,
-                  const DoutPrefixProvider *dpp,
-                  bool update_entrypoint = true,
-                  rgw_ep_info *pinfo = nullptr);
-
-  int unlink_bucket(const rgw_user& user_id,
-                    const rgw_bucket& bucket,
-                   optional_yield y,
-                    const DoutPrefixProvider *dpp,
-                    bool update_entrypoint = true);
-
-  int chown(rgw::sal::Driver* driver, rgw::sal::Bucket* bucket,
-            const rgw_user& user_id, const std::string& display_name,
-            const std::string& marker, optional_yield y, const DoutPrefixProvider *dpp);
-
-  int read_buckets_stats(std::map<std::string, RGWBucketEnt>& m,
-                         optional_yield y,
-                         const DoutPrefixProvider *dpp);
-
-  int read_bucket_stats(const rgw_bucket& bucket,
-                        RGWBucketEnt *result,
-                        optional_yield y,
-                        const DoutPrefixProvider *dpp);
-
-  /* quota related */
-  int sync_user_stats(const DoutPrefixProvider *dpp, 
-                      const rgw_user& user_id, const RGWBucketInfo& bucket_info,
-                     optional_yield y,
-                      RGWBucketEnt* pent);
-
-  /* bucket sync */
-  int get_sync_policy_handler(std::optional<rgw_zone_id> zone,
-                              std::optional<rgw_bucket> bucket,
-                             RGWBucketSyncPolicyHandlerRef *phandler,
-                             optional_yield y,
-                              const DoutPrefixProvider *dpp);
-  int bucket_exports_data(const rgw_bucket& bucket,
-                          optional_yield y,
-                          const DoutPrefixProvider *dpp);
-  int bucket_imports_data(const rgw_bucket& bucket,
-                          optional_yield y,
-                          const DoutPrefixProvider *dpp);
-
-private:
-  int convert_old_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
-                              const rgw_bucket& bucket,
-                              optional_yield y,
-                              const DoutPrefixProvider *dpp);
-
-  int do_store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
-                                    const rgw_bucket& bucket,
-                                    RGWBucketInfo& info,
-                                    optional_yield y,
-                                    const DoutPrefixProvider *dpp,
-                                    const BucketInstance::PutParams& params);
-
-  int do_store_linked_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
-                                  RGWBucketInfo& info,
-                                  RGWBucketInfo *orig_info,
-                                  bool exclusive, real_time mtime,
-                                  obj_version *pep_objv,
-                                  std::map<std::string, bufferlist> *pattrs,
-                                  bool create_entry_point,
-                                 optional_yield,
-                                  const DoutPrefixProvider *dpp);
-
-  int do_link_bucket(RGWSI_Bucket_EP_Ctx& ctx,
-                     const rgw_user& user,
-                     const rgw_bucket& bucket,
-                     ceph::real_time creation_time,
-                     bool update_entrypoint,
-                     rgw_ep_info *pinfo,
-                    optional_yield y,
-                     const DoutPrefixProvider *dpp);
-
-  int do_unlink_bucket(RGWSI_Bucket_EP_Ctx& ctx,
-                       const rgw_user& user_id,
-                       const rgw_bucket& bucket,
-                       bool update_entrypoint,
-                      optional_yield y,
-                       const DoutPrefixProvider *dpp);
-
-};
-
-bool rgw_find_bucket_by_id(const DoutPrefixProvider *dpp, CephContext *cct, rgw::sal::Driver* driver, const std::string& marker,
-                           const std::string& bucket_id, rgw_bucket* bucket_out);
diff --git a/src/rgw/store/rados/rgw_bucket_sync.cc b/src/rgw/store/rados/rgw_bucket_sync.cc
deleted file mode 100644 (file)
index 5fd81c5..0000000
+++ /dev/null
@@ -1,941 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_common.h"
-#include "rgw_bucket_sync.h"
-#include "rgw_data_sync.h"
-#include "rgw_zone.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_bucket_sync.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-ostream& operator<<(ostream& os, const rgw_sync_bucket_entity& e) {
-  os << "{b=" << rgw_sync_bucket_entities::bucket_key(e.bucket) << ",z=" << e.zone.value_or(rgw_zone_id()) << ",az=" << (int)e.all_zones << "}";
-  return os;
-}
-
-ostream& operator<<(ostream& os, const rgw_sync_bucket_pipe& pipe) {
-  os << "{s=" << pipe.source << ",d=" << pipe.dest << "}";
-  return os;
-}
-
-ostream& operator<<(ostream& os, const rgw_sync_bucket_entities& e) {
-  os << "{b=" << rgw_sync_bucket_entities::bucket_key(e.bucket) << ",z=" << e.zones.value_or(std::set<rgw_zone_id>()) << "}";
-  return os;
-}
-
-ostream& operator<<(ostream& os, const rgw_sync_bucket_pipes& pipe) {
-  os << "{id=" << pipe.id << ",s=" << pipe.source << ",d=" << pipe.dest << "}";
-  return os;
-}
-
-static std::vector<rgw_sync_bucket_pipe> filter_relevant_pipes(const std::vector<rgw_sync_bucket_pipes>& pipes,
-                                                               const rgw_zone_id& source_zone,
-                                                               const rgw_zone_id& dest_zone)
-{
-  std::vector<rgw_sync_bucket_pipe> relevant_pipes;
-  for (auto& p : pipes) {
-    if (p.source.match_zone(source_zone) &&
-        p.dest.match_zone(dest_zone)) {
-      for (auto pipe : p.expand()) {
-        pipe.source.apply_zone(source_zone);
-        pipe.dest.apply_zone(dest_zone);
-        relevant_pipes.push_back(pipe);
-      }
-    }
-  }
-
-  return relevant_pipes;
-}
-
-static bool is_wildcard_bucket(const rgw_bucket& bucket)
-{
-  return bucket.name.empty();
-}
-
-void rgw_sync_group_pipe_map::dump(ceph::Formatter *f) const
-{
-  encode_json("zone", zone.id, f);
-  encode_json("buckets", rgw_sync_bucket_entities::bucket_key(bucket), f);
-  encode_json("sources", sources, f);
-  encode_json("dests", dests, f);
-}
-
-
-template <typename CB1, typename CB2>
-void rgw_sync_group_pipe_map::try_add_to_pipe_map(const rgw_zone_id& source_zone,
-                                                  const rgw_zone_id& dest_zone,
-                                                  const std::vector<rgw_sync_bucket_pipes>& pipes,
-                                                  zb_pipe_map_t *pipe_map,
-                                                  CB1 filter_cb,
-                                                  CB2 call_filter_cb)
-{
-  if (!filter_cb(source_zone, nullopt, dest_zone, nullopt)) {
-    return;
-  }
-  auto relevant_pipes = filter_relevant_pipes(pipes, source_zone, dest_zone);
-
-  for (auto& pipe : relevant_pipes) {
-    rgw_sync_bucket_entity zb;
-    if (!call_filter_cb(pipe, &zb)) {
-      continue;
-    }
-    pipe_map->insert(make_pair(zb, pipe));
-  }
-}
-          
-template <typename CB>
-void rgw_sync_group_pipe_map::try_add_source(const rgw_zone_id& source_zone,
-                  const rgw_zone_id& dest_zone,
-                  const std::vector<rgw_sync_bucket_pipes>& pipes,
-                  CB filter_cb)
-{
-  return try_add_to_pipe_map(source_zone, dest_zone, pipes,
-                             &sources,
-                             filter_cb,
-                             [&](const rgw_sync_bucket_pipe& pipe, rgw_sync_bucket_entity *zb) {
-                             *zb = rgw_sync_bucket_entity{source_zone, pipe.source.get_bucket()};
-                             return filter_cb(source_zone, zb->bucket, dest_zone, pipe.dest.get_bucket());
-                             });
-}
-
-template <typename CB>
-void rgw_sync_group_pipe_map::try_add_dest(const rgw_zone_id& source_zone,
-                                           const rgw_zone_id& dest_zone,
-                                           const std::vector<rgw_sync_bucket_pipes>& pipes,
-                                           CB filter_cb)
-{
-  return try_add_to_pipe_map(source_zone, dest_zone, pipes,
-                             &dests,
-                             filter_cb,
-                             [&](const rgw_sync_bucket_pipe& pipe, rgw_sync_bucket_entity *zb) {
-                             *zb = rgw_sync_bucket_entity{dest_zone, pipe.dest.get_bucket()};
-                             return filter_cb(source_zone, pipe.source.get_bucket(), dest_zone, zb->bucket);
-                             });
-}
-
-using zb_pipe_map_t = rgw_sync_group_pipe_map::zb_pipe_map_t;
-
-pair<zb_pipe_map_t::const_iterator, zb_pipe_map_t::const_iterator> rgw_sync_group_pipe_map::find_pipes(const zb_pipe_map_t& m,
-                                                                                                       const rgw_zone_id& zone,
-                                                                                                       std::optional<rgw_bucket> b) const
-{
-  if (!b) {
-    return m.equal_range(rgw_sync_bucket_entity{zone, rgw_bucket()});
-  }
-
-  auto zb = rgw_sync_bucket_entity{zone, *b};
-
-  auto range = m.equal_range(zb);
-  if (range.first == range.second &&
-      !is_wildcard_bucket(*b)) {
-    /* couldn't find the specific bucket, try to find by wildcard */
-    zb.bucket = rgw_bucket();
-    range = m.equal_range(zb);
-  }
-
-  return range;
-}
-
-
-template <typename CB>
-void rgw_sync_group_pipe_map::init(const DoutPrefixProvider *dpp,
-                                   CephContext *cct,
-                                   const rgw_zone_id& _zone,
-                                   std::optional<rgw_bucket> _bucket,
-                                   const rgw_sync_policy_group& group,
-                                   rgw_sync_data_flow_group *_default_flow,
-                                   std::set<rgw_zone_id> *_pall_zones,
-                                   CB filter_cb) {
-  zone = _zone;
-  bucket = _bucket;
-  default_flow = _default_flow;
-  pall_zones = _pall_zones;
-
-  rgw_sync_bucket_entity zb(zone, bucket);
-
-  status = group.status;
-
-  std::vector<rgw_sync_bucket_pipes> zone_pipes;
-
-  string bucket_key = (bucket ? bucket->get_key() : "*");
-
-  /* only look at pipes that touch the specific zone and bucket */
-  for (auto& pipe : group.pipes) {
-    if (pipe.contains_zone_bucket(zone, bucket)) {
-      ldpp_dout(dpp, 20) << __func__ << "(): pipe_map (zone=" << zone << " bucket=" << bucket_key << "): adding potential pipe: " << pipe << dendl;
-      zone_pipes.push_back(pipe);
-    }
-  }
-
-  const rgw_sync_data_flow_group *pflow;
-
-  if (!group.data_flow.empty()) {
-    pflow = &group.data_flow;
-  } else {
-    if (!default_flow) {
-      return;
-    }
-    pflow = default_flow;
-  }
-
-  auto& flow = *pflow;
-
-  pall_zones->insert(zone);
-
-  /* symmetrical */
-  for (auto& symmetrical_group : flow.symmetrical) {
-    if (symmetrical_group.zones.find(zone) != symmetrical_group.zones.end()) {
-      for (auto& z : symmetrical_group.zones) {
-        if (z != zone) {
-          pall_zones->insert(z);
-          try_add_source(z, zone, zone_pipes, filter_cb);
-          try_add_dest(zone, z, zone_pipes, filter_cb);
-        }
-      }
-    }
-  }
-
-  /* directional */
-  for (auto& rule : flow.directional) {
-    if (rule.source_zone == zone) {
-      pall_zones->insert(rule.dest_zone);
-      try_add_dest(zone, rule.dest_zone, zone_pipes, filter_cb);
-    } else if (rule.dest_zone == zone) {
-      pall_zones->insert(rule.source_zone);
-      try_add_source(rule.source_zone, zone, zone_pipes, filter_cb);
-    }
-  }
-}
-
-/*
- * find all relevant pipes in our zone that match {dest_bucket} <- {source_zone, source_bucket}
- */
-vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_source_pipes(const rgw_zone_id& source_zone,
-                                                                        std::optional<rgw_bucket> source_bucket,
-                                                                        std::optional<rgw_bucket> dest_bucket) const {
-  vector<rgw_sync_bucket_pipe> result;
-
-  auto range = find_pipes(sources, source_zone, source_bucket);
-
-  for (auto iter = range.first; iter != range.second; ++iter) {
-    auto pipe = iter->second;
-    if (pipe.dest.match_bucket(dest_bucket)) {
-      result.push_back(pipe);
-    }
-  }
-  return result;
-}
-
-/*
- * find all relevant pipes in other zones that pull from a specific
- * source bucket in out zone {source_bucket} -> {dest_zone, dest_bucket}
- */
-vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_dest_pipes(std::optional<rgw_bucket> source_bucket,
-                                                                      const rgw_zone_id& dest_zone,
-                                                                      std::optional<rgw_bucket> dest_bucket) const {
-  vector<rgw_sync_bucket_pipe> result;
-
-  auto range = find_pipes(dests, dest_zone, dest_bucket);
-
-  for (auto iter = range.first; iter != range.second; ++iter) {
-    auto pipe = iter->second;
-    if (pipe.source.match_bucket(source_bucket)) {
-      result.push_back(pipe);
-    }
-  }
-
-  return result;
-}
-
-/*
- * find all relevant pipes from {source_zone, source_bucket} -> {dest_zone, dest_bucket}
- */
-vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_pipes(const rgw_zone_id& source_zone,
-                                                                 std::optional<rgw_bucket> source_bucket,
-                                                                 const rgw_zone_id& dest_zone,
-                                                                 std::optional<rgw_bucket> dest_bucket) const {
-  if (dest_zone == zone) {
-    return find_source_pipes(source_zone, source_bucket, dest_bucket);
-  }
-
-  if (source_zone == zone) {
-    return find_dest_pipes(source_bucket, dest_zone, dest_bucket);
-  }
-
-  return vector<rgw_sync_bucket_pipe>();
-}
-
-void RGWBucketSyncFlowManager::pipe_rules::insert(const rgw_sync_bucket_pipe& pipe)
-{
-  pipes.push_back(pipe);
-
-  auto ppipe = &pipes.back();
-  auto prefix = ppipe->params.source.filter.prefix.value_or(string());
-
-  prefix_refs.insert(make_pair(prefix, ppipe));
-
-  for (auto& t : ppipe->params.source.filter.tags) {
-    string tag = t.key + "=" + t.value;
-    auto titer = tag_refs.find(tag);
-    if (titer != tag_refs.end() &&
-        ppipe->params.priority > titer->second->params.priority) {
-      titer->second = ppipe;
-    } else {
-      tag_refs[tag] = ppipe;
-    }
-  }
-}
-
-bool RGWBucketSyncFlowManager::pipe_rules::find_basic_info_without_tags(const rgw_obj_key& key,
-                                                                        std::optional<rgw_user> *user,
-                                                                        std::optional<rgw_user> *acl_translation_owner,
-                                                                        std::optional<string> *storage_class,
-                                                                        rgw_sync_pipe_params::Mode *mode,
-                                                                        bool *need_more_info) const
-{
-  std::optional<string> owner;
-
-  *need_more_info = false;
-
-  if (prefix_refs.empty()) {
-    return false;
-  }
-
-  auto end = prefix_refs.upper_bound(key.name);
-  auto iter = end;
-  if (iter != prefix_refs.begin()) {
-    --iter;
-  }
-  if (iter == prefix_refs.end()) {
-    return false;
-  }
-
-  if (iter != prefix_refs.begin()) {
-    iter = prefix_refs.find(iter->first); /* prefix_refs is multimap, find first element
-                                             holding that key */
-  }
-
-  std::vector<decltype(iter)> iters;
-
-  std::optional<int> priority;
-
-  for (; iter != end; ++iter) {
-    auto& prefix = iter->first;
-    if (!boost::starts_with(key.name, prefix)) {
-      continue;
-    }
-
-    auto& rule_params = iter->second->params;
-    auto& filter = rule_params.source.filter;
-
-    if (rule_params.priority > priority) {
-      priority = rule_params.priority;
-
-      if (!filter.has_tags()) {
-        iters.clear();
-      }
-      iters.push_back(iter);
-
-      *need_more_info = filter.has_tags(); /* if highest priority filter has tags, then
-                                              we can't be sure if it would be used.
-                                              We need to first read the info from the source object */
-    }
-  }
-
-  if (iters.empty()) {
-    return false;
-  }
-
-  std::optional<rgw_user> _user;
-  std::optional<rgw_sync_pipe_acl_translation> _acl_translation;
-  std::optional<string> _storage_class;
-  rgw_sync_pipe_params::Mode _mode{rgw_sync_pipe_params::Mode::MODE_SYSTEM};
-
-  // make sure all params are the same by saving the first one
-  // encountered and comparing all subsequent to it
-  bool first_iter = true;
-  for (auto& iter : iters) {
-    const rgw_sync_pipe_params& rule_params = iter->second->params;
-    if (first_iter) {
-      _user = rule_params.user;
-      _acl_translation = rule_params.dest.acl_translation;
-      _storage_class = rule_params.dest.storage_class;
-      _mode = rule_params.mode;
-      first_iter = false;
-    } else {
-      // note: three of these == operators are comparing std::optional
-      // against std::optional; as one would expect they are equal a)
-      // if both do not contain values or b) if both do and those
-      // contained values are the same
-      const bool conflict =
-       !(_user == rule_params.user &&
-         _acl_translation == rule_params.dest.acl_translation &&
-         _storage_class == rule_params.dest.storage_class &&
-         _mode == rule_params.mode);
-      if (conflict) {
-       *need_more_info = true;
-       return false;
-      }
-    }
-  }
-
-  *user = _user;
-  if (_acl_translation) {
-    *acl_translation_owner = _acl_translation->owner;
-  }
-  *storage_class = _storage_class;
-  *mode = _mode;
-
-  return true;
-}
-
-bool RGWBucketSyncFlowManager::pipe_rules::find_obj_params(const rgw_obj_key& key,
-                                                           const RGWObjTags::tag_map_t& tags,
-                                                           rgw_sync_pipe_params *params) const
-{
-  if (prefix_refs.empty()) {
-    return false;
-  }
-
-  auto iter = prefix_refs.upper_bound(key.name);
-  if (iter != prefix_refs.begin()) {
-    --iter;
-  }
-  if (iter == prefix_refs.end()) {
-    return false;
-  }
-
-  auto end = prefix_refs.upper_bound(key.name);
-  auto max = end;
-
-  std::optional<int> priority;
-
-  for (; iter != end; ++iter) {
-    /* NOTE: this is not the most efficient way to do it,
-     * a trie data structure would be better
-     */
-    auto& prefix = iter->first;
-    if (!boost::starts_with(key.name, prefix)) {
-      continue;
-    }
-
-    auto& rule_params = iter->second->params;
-    auto& filter = rule_params.source.filter;
-
-    if (!filter.check_tags(tags)) {
-      continue;
-    }
-
-    if (rule_params.priority > priority) {
-      priority = rule_params.priority;
-      max = iter;
-    }
-  }
-
-  if (max == end) {
-    return false;
-  }
-
-  *params = max->second->params;
-  return true;
-}
-
-/*
- * return either the current prefix for s, or the next one if s is not within a prefix
- */
-
-RGWBucketSyncFlowManager::pipe_rules::prefix_map_t::const_iterator RGWBucketSyncFlowManager::pipe_rules::prefix_search(const std::string& s) const
-{
-  if (prefix_refs.empty()) {
-    return prefix_refs.end();
-  }
-  auto next = prefix_refs.upper_bound(s);
-  auto iter = next;
-  if (iter != prefix_refs.begin()) {
-    --iter;
-  }
-  if (!boost::starts_with(s, iter->first)) {
-    return next;
-  }
-
-  return iter;
-}
-
-void RGWBucketSyncFlowManager::pipe_set::insert(const rgw_sync_bucket_pipe& pipe) {
-  pipe_map.insert(make_pair(pipe.id, pipe));
-
-  auto& rules_ref = rules[endpoints_pair(pipe)];
-
-  if (!rules_ref) {
-    rules_ref = make_shared<RGWBucketSyncFlowManager::pipe_rules>();
-  }
-
-  rules_ref->insert(pipe);
-
-  pipe_handler h(rules_ref, pipe);
-
-  handlers.insert(h);
-}
-
-void RGWBucketSyncFlowManager::pipe_set::dump(ceph::Formatter *f) const
-{
-  encode_json("pipes", pipe_map, f);
-}
-
-bool RGWBucketSyncFlowManager::allowed_data_flow(const rgw_zone_id& source_zone,
-                                                 std::optional<rgw_bucket> source_bucket,
-                                                 const rgw_zone_id& dest_zone,
-                                                 std::optional<rgw_bucket> dest_bucket,
-                                                 bool check_activated) const
-{
-  bool found = false;
-  bool found_activated = false;
-
-  for (auto m : flow_groups) {
-    auto& fm = m.second;
-    auto pipes = fm.find_pipes(source_zone, source_bucket,
-                               dest_zone, dest_bucket);
-
-    bool is_found = !pipes.empty();
-
-    if (is_found) {
-      switch (fm.status) {
-        case rgw_sync_policy_group::Status::FORBIDDEN:
-          return false;
-        case rgw_sync_policy_group::Status::ENABLED:
-          found = true;
-          found_activated = true;
-          break;
-        case rgw_sync_policy_group::Status::ALLOWED:
-          found = true;
-          break;
-        default:
-          break; /* unknown -- ignore */
-      }
-    }
-  }
-
-  if (check_activated && found_activated) {
-    return true;
-  }
-
-  return found;
-}
-
-void RGWBucketSyncFlowManager::init(const DoutPrefixProvider *dpp, const rgw_sync_policy_info& sync_policy) {
-  std::optional<rgw_sync_data_flow_group> default_flow;
-  if (parent) {
-    default_flow.emplace();
-    default_flow->init_default(parent->all_zones);
-  }
-
-  for (auto& item : sync_policy.groups) {
-    auto& group = item.second;
-    auto& flow_group_map = flow_groups[group.id];
-
-    flow_group_map.init(dpp, cct, zone_id, bucket, group,
-                        (default_flow ? &(*default_flow) : nullptr),
-                        &all_zones,
-                        [&](const rgw_zone_id& source_zone,
-                            std::optional<rgw_bucket> source_bucket,
-                            const rgw_zone_id& dest_zone,
-                            std::optional<rgw_bucket> dest_bucket) {
-                        if (!parent) {
-                          return true;
-                        }
-                        return parent->allowed_data_flow(source_zone,
-                                                         source_bucket,
-                                                         dest_zone,
-                                                         dest_bucket,
-                                                         false); /* just check that it's not disabled */
-                        });
-  }
-}
-
-void RGWBucketSyncFlowManager::reflect(const DoutPrefixProvider *dpp,
-                                       std::optional<rgw_bucket> effective_bucket,
-                                       RGWBucketSyncFlowManager::pipe_set *source_pipes,
-                                       RGWBucketSyncFlowManager::pipe_set *dest_pipes,
-                                       bool only_enabled) const
-
-{
-  string effective_bucket_key;
-  if (effective_bucket) {
-    effective_bucket_key = effective_bucket->get_key();
-  }
-  if (parent) {
-    parent->reflect(dpp, effective_bucket, source_pipes, dest_pipes, only_enabled);
-  }
-
-  for (auto& item : flow_groups) {
-    auto& flow_group_map = item.second;
-
-    /* only return enabled groups */
-    if (flow_group_map.status != rgw_sync_policy_group::Status::ENABLED &&
-        (only_enabled || flow_group_map.status != rgw_sync_policy_group::Status::ALLOWED)) {
-      continue;
-    }
-
-    for (auto& entry : flow_group_map.sources) {
-      rgw_sync_bucket_pipe pipe = entry.second;
-      if (!pipe.dest.match_bucket(effective_bucket)) {
-        continue;
-      }
-
-      pipe.source.apply_bucket(effective_bucket);
-      pipe.dest.apply_bucket(effective_bucket);
-
-      ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): adding source pipe: " << pipe << dendl;
-      source_pipes->insert(pipe);
-    }
-
-    for (auto& entry : flow_group_map.dests) {
-      rgw_sync_bucket_pipe pipe = entry.second;
-
-      if (!pipe.source.match_bucket(effective_bucket)) {
-        continue;
-      }
-
-      pipe.source.apply_bucket(effective_bucket);
-      pipe.dest.apply_bucket(effective_bucket);
-
-      ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): adding dest pipe: " << pipe << dendl;
-      dest_pipes->insert(pipe);
-    }
-  }
-}
-
-
-RGWBucketSyncFlowManager::RGWBucketSyncFlowManager(CephContext *_cct,
-                                                   const rgw_zone_id& _zone_id,
-                                                   std::optional<rgw_bucket> _bucket,
-                                                   const RGWBucketSyncFlowManager *_parent) : cct(_cct),
-                                                                                              zone_id(_zone_id),
-                                                                                              bucket(_bucket),
-                                                                                              parent(_parent) {}
-
-
-void RGWSyncPolicyCompat::convert_old_sync_config(RGWSI_Zone *zone_svc,
-                                                  RGWSI_SyncModules *sync_modules_svc,
-                                                  rgw_sync_policy_info *ppolicy)
-{
-  bool found = false;
-
-  rgw_sync_policy_info policy;
-
-  auto& group = policy.groups["default"];
-  auto& zonegroup = zone_svc->get_zonegroup();
-
-  for (const auto& ziter1 : zonegroup.zones) {
-    auto& id1 = ziter1.first;
-    const RGWZone& z1 = ziter1.second;
-
-    for (const auto& ziter2 : zonegroup.zones) {
-      auto& id2 = ziter2.first;
-      const RGWZone& z2 = ziter2.second;
-
-      if (id1 == id2) {
-        continue;
-      }
-
-      if (z1.syncs_from(z2.name)) {
-        found = true;
-        rgw_sync_directional_rule *rule;
-        group.data_flow.find_or_create_directional(id2,
-                                                   id1,
-                                                   &rule);
-      }
-    }
-  }
-
-  if (!found) { /* nothing syncs */
-    return;
-  }
-
-  rgw_sync_bucket_pipes pipes;
-  pipes.id = "all";
-  pipes.source.all_zones = true;
-  pipes.dest.all_zones = true;
-
-  group.pipes.emplace_back(std::move(pipes));
-
-
-  group.status = rgw_sync_policy_group::Status::ENABLED;
-
-  *ppolicy = std::move(policy);
-}
-
-RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(RGWSI_Zone *_zone_svc,
-                                                       RGWSI_SyncModules *sync_modules_svc,
-                                                      RGWSI_Bucket_Sync *_bucket_sync_svc,
-                                                       std::optional<rgw_zone_id> effective_zone) : zone_svc(_zone_svc) ,
-                                                                                                    bucket_sync_svc(_bucket_sync_svc) {
-  zone_id = effective_zone.value_or(zone_svc->zone_id());
-  flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
-                                              zone_id,
-                                              nullopt,
-                                              nullptr));
-  sync_policy = zone_svc->get_zonegroup().sync_policy;
-
-  if (sync_policy.empty()) {
-    RGWSyncPolicyCompat::convert_old_sync_config(zone_svc, sync_modules_svc, &sync_policy);
-    legacy_config = true;
-  }
-}
-
-RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
-                                                       const RGWBucketInfo& _bucket_info,
-                                                       map<string, bufferlist>&& _bucket_attrs) : parent(_parent),
-                                                                                                       bucket_info(_bucket_info),
-                                                                                                       bucket_attrs(std::move(_bucket_attrs)) {
-  if (_bucket_info.sync_policy) {
-    sync_policy = *_bucket_info.sync_policy;
-
-    for (auto& entry : sync_policy.groups) {
-      for (auto& pipe : entry.second.pipes) {
-        if (pipe.params.mode == rgw_sync_pipe_params::MODE_USER &&
-            pipe.params.user.empty()) {
-          pipe.params.user = _bucket_info.owner;
-        }
-      }
-    }
-  }
-  legacy_config = parent->legacy_config;
-  bucket = _bucket_info.bucket;
-  zone_svc = parent->zone_svc;
-  bucket_sync_svc = parent->bucket_sync_svc;
-  flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
-                                              parent->zone_id,
-                                              _bucket_info.bucket,
-                                              parent->flow_mgr.get()));
-}
-
-RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
-                                                       const rgw_bucket& _bucket,
-                                                       std::optional<rgw_sync_policy_info> _sync_policy) : parent(_parent) {
-  if (_sync_policy) {
-    sync_policy = *_sync_policy;
-  }
-  legacy_config = parent->legacy_config;
-  bucket = _bucket;
-  zone_svc = parent->zone_svc;
-  bucket_sync_svc = parent->bucket_sync_svc;
-  flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
-                                              parent->zone_id,
-                                              _bucket,
-                                              parent->flow_mgr.get()));
-}
-
-RGWBucketSyncPolicyHandler *RGWBucketSyncPolicyHandler::alloc_child(const RGWBucketInfo& bucket_info,
-                                                                    map<string, bufferlist>&& bucket_attrs) const
-{
-  return new RGWBucketSyncPolicyHandler(this, bucket_info, std::move(bucket_attrs));
-}
-
-RGWBucketSyncPolicyHandler *RGWBucketSyncPolicyHandler::alloc_child(const rgw_bucket& bucket,
-                                                                    std::optional<rgw_sync_policy_info> sync_policy) const
-{
-  return new RGWBucketSyncPolicyHandler(this, bucket, sync_policy);
-}
-
-int RGWBucketSyncPolicyHandler::init(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  int r = bucket_sync_svc->get_bucket_sync_hints(dpp, bucket.value_or(rgw_bucket()),
-                                                &source_hints,
-                                                &target_hints,
-                                                y);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to initialize bucket sync policy handler: get_bucket_sync_hints() on bucket="
-      << bucket << " returned r=" << r << dendl;
-    return r;
-  }
-
-  flow_mgr->init(dpp, sync_policy);
-
-  reflect(dpp, &source_pipes,
-          &target_pipes,
-          &sources,
-          &targets,
-          &source_zones,
-          &target_zones,
-          true);
-
-  return 0;
-}
-
-void RGWBucketSyncPolicyHandler::reflect(const DoutPrefixProvider *dpp, RGWBucketSyncFlowManager::pipe_set *psource_pipes,
-                                         RGWBucketSyncFlowManager::pipe_set *ptarget_pipes,
-                                         map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *psources,
-                                         map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *ptargets,
-                                         std::set<rgw_zone_id> *psource_zones,
-                                         std::set<rgw_zone_id> *ptarget_zones,
-                                         bool only_enabled) const
-{
-  RGWBucketSyncFlowManager::pipe_set _source_pipes;
-  RGWBucketSyncFlowManager::pipe_set _target_pipes;
-  map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> _sources;
-  map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> _targets;
-  std::set<rgw_zone_id> _source_zones;
-  std::set<rgw_zone_id> _target_zones;
-
-  flow_mgr->reflect(dpp, bucket, &_source_pipes, &_target_pipes, only_enabled);
-
-  for (auto& entry : _source_pipes.pipe_map) {
-    auto& pipe = entry.second;
-    if (!pipe.source.zone) {
-      continue;
-    }
-    _source_zones.insert(*pipe.source.zone);
-    _sources[*pipe.source.zone].insert(pipe);
-  }
-
-  for (auto& entry : _target_pipes.pipe_map) {
-    auto& pipe = entry.second;
-    if (!pipe.dest.zone) {
-      continue;
-    }
-    _target_zones.insert(*pipe.dest.zone);
-    _targets[*pipe.dest.zone].insert(pipe);
-  }
-
-  if (psource_pipes) {
-    *psource_pipes = std::move(_source_pipes);
-  }
-  if (ptarget_pipes) {
-    *ptarget_pipes = std::move(_target_pipes);
-  }
-  if (psources) {
-    *psources = std::move(_sources);
-  }
-  if (ptargets) {
-    *ptargets = std::move(_targets);
-  }
-  if (psource_zones) {
-    *psource_zones = std::move(_source_zones);
-  }
-  if (ptarget_zones) {
-    *ptarget_zones = std::move(_target_zones);
-  }
-}
-
-multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_sources() const
-{
-  multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
-
-  for (auto& source_entry : sources) {
-    auto& zone_id = source_entry.first;
-
-    auto& pipes = source_entry.second.pipe_map;
-
-    for (auto& entry : pipes) {
-      auto& pipe = entry.second;
-      m.insert(make_pair(zone_id, pipe));
-    }
-  }
-
-  for (auto& pipe : resolved_sources) {
-    if (!pipe.source.zone) {
-      continue;
-    }
-
-    m.insert(make_pair(*pipe.source.zone, pipe));
-  }
-
-  return m;
-}
-
-multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_dests() const
-{
-  multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
-
-  for (auto& dest_entry : targets) {
-    auto& zone_id = dest_entry.first;
-
-    auto& pipes = dest_entry.second.pipe_map;
-
-    for (auto& entry : pipes) {
-      auto& pipe = entry.second;
-      m.insert(make_pair(zone_id, pipe));
-    }
-  }
-
-  for (auto& pipe : resolved_dests) {
-    if (!pipe.dest.zone) {
-      continue;
-    }
-
-    m.insert(make_pair(*pipe.dest.zone, pipe));
-  }
-
-  return m;
-}
-
-multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_dests_in_zone(const rgw_zone_id& zone_id) const
-{
-  multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
-
-  auto iter = targets.find(zone_id);
-  if (iter != targets.end()) {
-    auto& pipes = iter->second.pipe_map;
-
-    for (auto& entry : pipes) {
-      auto& pipe = entry.second;
-      m.insert(make_pair(zone_id, pipe));
-    }
-  }
-
-  for (auto& pipe : resolved_dests) {
-    if (!pipe.dest.zone ||
-        *pipe.dest.zone != zone_id) {
-      continue;
-    }
-
-    m.insert(make_pair(*pipe.dest.zone, pipe));
-  }
-
-  return m;
-}
-
-void RGWBucketSyncPolicyHandler::get_pipes(std::set<rgw_sync_bucket_pipe> *_sources, std::set<rgw_sync_bucket_pipe> *_targets,
-                                           std::optional<rgw_sync_bucket_entity> filter_peer) { /* return raw pipes */
-  for (auto& entry : source_pipes.pipe_map) {
-    auto& source_pipe = entry.second;
-    if (!filter_peer ||
-        source_pipe.source.match(*filter_peer)) {
-      _sources->insert(source_pipe);
-    }
-  }
-
-  for (auto& entry : target_pipes.pipe_map) {
-    auto& target_pipe = entry.second;
-    if (!filter_peer ||
-        target_pipe.dest.match(*filter_peer)) {
-      _targets->insert(target_pipe);
-    }
-  }
-}
-
-bool RGWBucketSyncPolicyHandler::bucket_exports_data() const
-{
-  if (!bucket) {
-    return false;
-  }
-
-  if (bucket_is_sync_source()) {
-    return true;
-  }
-
-  return (zone_svc->need_to_log_data() &&
-          bucket_info->datasync_flag_enabled());
-}
-
-bool RGWBucketSyncPolicyHandler::bucket_imports_data() const
-{
-  return bucket_is_sync_target();
-}
-
diff --git a/src/rgw/store/rados/rgw_bucket_sync.h b/src/rgw/store/rados/rgw_bucket_sync.h
deleted file mode 100644 (file)
index 7614377..0000000
+++ /dev/null
@@ -1,412 +0,0 @@
-
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2018 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include "rgw_common.h"
-#include "rgw_sync_policy.h"
-
-class RGWSI_Zone;
-class RGWSI_SyncModules;
-class RGWSI_Bucket_Sync;
-
-struct rgw_sync_group_pipe_map;
-struct rgw_sync_bucket_pipes;
-struct rgw_sync_policy_info;
-
-struct rgw_sync_group_pipe_map {
-  rgw_zone_id zone;
-  std::optional<rgw_bucket> bucket;
-
-  rgw_sync_policy_group::Status status{rgw_sync_policy_group::Status::FORBIDDEN};
-
-  using zb_pipe_map_t = std::multimap<rgw_sync_bucket_entity, rgw_sync_bucket_pipe>;
-
-  zb_pipe_map_t sources; /* all the pipes where zone is pulling from */
-  zb_pipe_map_t dests; /* all the pipes that pull from zone */
-
-  std::set<rgw_zone_id> *pall_zones{nullptr};
-  rgw_sync_data_flow_group *default_flow{nullptr}; /* flow to use if policy doesn't define it,
-                                                      used in the case of bucket sync policy, not at the
-                                                      zonegroup level */
-
-  void dump(ceph::Formatter *f) const;
-
-  template <typename CB1, typename CB2>
-  void try_add_to_pipe_map(const rgw_zone_id& source_zone,
-                           const rgw_zone_id& dest_zone,
-                           const std::vector<rgw_sync_bucket_pipes>& pipes,
-                           zb_pipe_map_t *pipe_map,
-                           CB1 filter_cb,
-                           CB2 call_filter_cb);
-          
-  template <typename CB>
-  void try_add_source(const rgw_zone_id& source_zone,
-                      const rgw_zone_id& dest_zone,
-                      const std::vector<rgw_sync_bucket_pipes>& pipes,
-                      CB filter_cb);
-          
-  template <typename CB>
-  void try_add_dest(const rgw_zone_id& source_zone,
-                  const rgw_zone_id& dest_zone,
-                  const std::vector<rgw_sync_bucket_pipes>& pipes,
-                  CB filter_cb);
-          
-  std::pair<zb_pipe_map_t::const_iterator, zb_pipe_map_t::const_iterator> find_pipes(const zb_pipe_map_t& m,
-                                                                                const rgw_zone_id& zone,
-                                                                                std::optional<rgw_bucket> b) const;
-
-  template <typename CB>
-  void init(const DoutPrefixProvider *dpp, CephContext *cct,
-            const rgw_zone_id& _zone,
-            std::optional<rgw_bucket> _bucket,
-            const rgw_sync_policy_group& group,
-            rgw_sync_data_flow_group *_default_flow,
-            std::set<rgw_zone_id> *_pall_zones,
-            CB filter_cb);
-
-  /*
-   * find all relevant pipes in our zone that match {dest_bucket} <- {source_zone, source_bucket}
-   */
-  std::vector<rgw_sync_bucket_pipe> find_source_pipes(const rgw_zone_id& source_zone,
-                                                 std::optional<rgw_bucket> source_bucket,
-                                                 std::optional<rgw_bucket> dest_bucket) const;
-
-  /*
-   * find all relevant pipes in other zones that pull from a specific
-   * source bucket in out zone {source_bucket} -> {dest_zone, dest_bucket}
-   */
-  std::vector<rgw_sync_bucket_pipe> find_dest_pipes(std::optional<rgw_bucket> source_bucket,
-                                               const rgw_zone_id& dest_zone,
-                                               std::optional<rgw_bucket> dest_bucket) const;
-
-  /*
-   * find all relevant pipes from {source_zone, source_bucket} -> {dest_zone, dest_bucket}
-   */
-  std::vector<rgw_sync_bucket_pipe> find_pipes(const rgw_zone_id& source_zone,
-                                          std::optional<rgw_bucket> source_bucket,
-                                          const rgw_zone_id& dest_zone,
-                                          std::optional<rgw_bucket> dest_bucket) const;
-};
-
-class RGWSyncPolicyCompat {
-public:
-  static void convert_old_sync_config(RGWSI_Zone *zone_svc,
-                                      RGWSI_SyncModules *sync_modules_svc,
-                                      rgw_sync_policy_info *ppolicy);
-};
-
-class RGWBucketSyncFlowManager {
-  friend class RGWBucketSyncPolicyHandler;
-public:
-  struct endpoints_pair {
-    rgw_sync_bucket_entity source;
-    rgw_sync_bucket_entity dest;
-
-    endpoints_pair() {}
-    endpoints_pair(const rgw_sync_bucket_pipe& pipe) {
-      source = pipe.source;
-      dest = pipe.dest;
-    }
-
-    bool operator<(const endpoints_pair& e) const {
-      if (source < e.source) {
-        return true;
-      }
-      if (e.source < source) {
-        return false;
-      }
-      return (dest < e.dest);
-    }
-  };
-
-  /*
-   * pipe_rules: deal with a set of pipes that have common endpoints_pair
-   */
-  class pipe_rules {
-    std::list<rgw_sync_bucket_pipe> pipes;
-
-  public:
-    using prefix_map_t = std::multimap<std::string, rgw_sync_bucket_pipe *>;
-
-    std::map<std::string, rgw_sync_bucket_pipe *> tag_refs;
-    prefix_map_t prefix_refs;
-
-    void insert(const rgw_sync_bucket_pipe& pipe);
-
-    bool find_basic_info_without_tags(const rgw_obj_key& key,
-                                      std::optional<rgw_user> *user,
-                                      std::optional<rgw_user> *acl_translation,
-                                      std::optional<std::string> *storage_class,
-                                      rgw_sync_pipe_params::Mode *mode,
-                                      bool *need_more_info) const;
-    bool find_obj_params(const rgw_obj_key& key, 
-                         const RGWObjTags::tag_map_t& tags,
-                         rgw_sync_pipe_params *params) const;
-
-    void scan_prefixes(std::vector<std::string> *prefixes) const;
-
-    prefix_map_t::const_iterator prefix_begin() const {
-      return prefix_refs.begin();
-    }
-    prefix_map_t::const_iterator prefix_search(const std::string& s) const;
-    prefix_map_t::const_iterator prefix_end() const {
-      return prefix_refs.end();
-    }
-  };
-
-  using pipe_rules_ref = std::shared_ptr<pipe_rules>;
-
-  /*
-   * pipe_handler: extends endpoints_rule to point at the corresponding rules handler
-   */
-  struct pipe_handler : public endpoints_pair {
-    pipe_rules_ref rules;
-
-    pipe_handler() {}
-    pipe_handler(pipe_rules_ref& _rules,
-                 const rgw_sync_bucket_pipe& _pipe) : endpoints_pair(_pipe),
-                                                      rules(_rules) {}
-    bool specific() const {
-      return source.specific() && dest.specific();
-    }
-    
-    bool find_basic_info_without_tags(const rgw_obj_key& key,
-                                      std::optional<rgw_user> *user,
-                                      std::optional<rgw_user> *acl_translation,
-                                      std::optional<std::string> *storage_class,
-                                      rgw_sync_pipe_params::Mode *mode,
-                                      bool *need_more_info) const {
-      if (!rules) {
-        return false;
-      }
-      return rules->find_basic_info_without_tags(key, user, acl_translation, storage_class, mode, need_more_info);
-    }
-
-    bool find_obj_params(const rgw_obj_key& key,
-                         const RGWObjTags::tag_map_t& tags,
-                         rgw_sync_pipe_params *params) const {
-      if (!rules) {
-        return false;
-      }
-      return rules->find_obj_params(key, tags, params);
-    }
-  };
-
-  struct pipe_set {
-    std::map<endpoints_pair, pipe_rules_ref> rules;
-    std::multimap<std::string, rgw_sync_bucket_pipe> pipe_map;
-
-    std::set<pipe_handler> handlers;
-
-    using iterator = std::set<pipe_handler>::iterator;
-
-    void clear() {
-      rules.clear();
-      pipe_map.clear();
-      handlers.clear();
-    }
-
-    void insert(const rgw_sync_bucket_pipe& pipe);
-
-    iterator begin() const {
-      return handlers.begin();
-    }
-
-    iterator end() const {
-      return handlers.end();
-    }
-
-    void dump(ceph::Formatter *f) const;
-  };
-
-private:
-
-  CephContext *cct;
-
-  rgw_zone_id zone_id;
-  std::optional<rgw_bucket> bucket;
-
-  const RGWBucketSyncFlowManager *parent{nullptr};
-
-  std::map<std::string, rgw_sync_group_pipe_map> flow_groups;
-
-  std::set<rgw_zone_id> all_zones;
-
-  bool allowed_data_flow(const rgw_zone_id& source_zone,
-                         std::optional<rgw_bucket> source_bucket,
-                         const rgw_zone_id& dest_zone,
-                         std::optional<rgw_bucket> dest_bucket,
-                         bool check_activated) const;
-
-  /*
-   * find all the matching flows om a flow map for a specific bucket
-   */
-  void update_flow_maps(const rgw_sync_bucket_pipes& pipe);
-
-  void init(const DoutPrefixProvider *dpp, const rgw_sync_policy_info& sync_policy);
-
-public:
-
-  RGWBucketSyncFlowManager(CephContext *_cct,
-                           const rgw_zone_id& _zone_id,
-                           std::optional<rgw_bucket> _bucket,
-                           const RGWBucketSyncFlowManager *_parent);
-
-  void reflect(const DoutPrefixProvider *dpp, std::optional<rgw_bucket> effective_bucket,
-               pipe_set *flow_by_source,
-               pipe_set *flow_by_dest,  
-               bool only_enabled) const;
-
-};
-
-static inline std::ostream& operator<<(std::ostream& os, const RGWBucketSyncFlowManager::endpoints_pair& e) {
-  os << e.dest << " -> " << e.source;
-  return os;
-}
-
-class RGWBucketSyncPolicyHandler {
-  bool legacy_config{false};
-  const RGWBucketSyncPolicyHandler *parent{nullptr};
-  RGWSI_Zone *zone_svc;
-  RGWSI_Bucket_Sync *bucket_sync_svc;
-  rgw_zone_id zone_id;
-  std::optional<RGWBucketInfo> bucket_info;
-  std::optional<std::map<std::string, bufferlist> > bucket_attrs;
-  std::optional<rgw_bucket> bucket;
-  std::unique_ptr<RGWBucketSyncFlowManager> flow_mgr;
-  rgw_sync_policy_info sync_policy;
-
-  RGWBucketSyncFlowManager::pipe_set source_pipes;
-  RGWBucketSyncFlowManager::pipe_set target_pipes;
-
-  std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> sources; /* source pipes by source zone id */
-  std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> targets; /* target pipes by target zone id */
-
-  std::set<rgw_zone_id> source_zones;
-  std::set<rgw_zone_id> target_zones;
-
-  std::set<rgw_bucket> source_hints;
-  std::set<rgw_bucket> target_hints;
-  std::set<rgw_sync_bucket_pipe> resolved_sources;
-  std::set<rgw_sync_bucket_pipe> resolved_dests;
-
-
-  bool bucket_is_sync_source() const {
-    return !targets.empty() || !resolved_dests.empty();
-  }
-
-  bool bucket_is_sync_target() const {
-    return !sources.empty() || !resolved_sources.empty();
-  }
-
-  RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
-                             const RGWBucketInfo& _bucket_info,
-                             std::map<std::string, bufferlist>&& _bucket_attrs);
-
-  RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
-                             const rgw_bucket& _bucket,
-                             std::optional<rgw_sync_policy_info> _sync_policy);
-public:
-  RGWBucketSyncPolicyHandler(RGWSI_Zone *_zone_svc,
-                             RGWSI_SyncModules *sync_modules_svc,
-                            RGWSI_Bucket_Sync *bucket_sync_svc,
-                             std::optional<rgw_zone_id> effective_zone = std::nullopt);
-
-  RGWBucketSyncPolicyHandler *alloc_child(const RGWBucketInfo& bucket_info,
-                                          std::map<std::string, bufferlist>&& bucket_attrs) const;
-  RGWBucketSyncPolicyHandler *alloc_child(const rgw_bucket& bucket,
-                                          std::optional<rgw_sync_policy_info> sync_policy) const;
-
-  int init(const DoutPrefixProvider *dpp, optional_yield y);
-
-  void reflect(const DoutPrefixProvider *dpp, RGWBucketSyncFlowManager::pipe_set *psource_pipes,
-               RGWBucketSyncFlowManager::pipe_set *ptarget_pipes,
-               std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *psources,
-               std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *ptargets,
-               std::set<rgw_zone_id> *psource_zones,
-               std::set<rgw_zone_id> *ptarget_zones,
-               bool only_enabled) const;
-
-  void set_resolved_hints(std::set<rgw_sync_bucket_pipe>&& _resolved_sources,
-                          std::set<rgw_sync_bucket_pipe>&& _resolved_dests) {
-    resolved_sources = std::move(_resolved_sources);
-    resolved_dests = std::move(_resolved_dests);
-  }
-
-  const std::set<rgw_sync_bucket_pipe>& get_resolved_source_hints() {
-    return resolved_sources;
-  }
-
-  const std::set<rgw_sync_bucket_pipe>& get_resolved_dest_hints() {
-    return resolved_dests;
-  }
-
-  const std::set<rgw_zone_id>& get_source_zones() const {
-    return source_zones;
-  }
-
-  const std::set<rgw_zone_id>& get_target_zones() const {
-    return target_zones;
-  }
-
-  const  std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& get_sources() {
-    return sources;
-  }
-
-  std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_sources() const;
-  std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_dests() const;
-  std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_dests_in_zone(const rgw_zone_id& zone_id) const;
-
-  const std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& get_targets() {
-    return targets;
-  }
-
-  const std::optional<RGWBucketInfo>& get_bucket_info() const {
-    return bucket_info;
-  }
-
-  const std::optional<std::map<std::string, bufferlist> >& get_bucket_attrs() const {
-    return bucket_attrs;
-  }
-
-  void get_pipes(RGWBucketSyncFlowManager::pipe_set **_sources, RGWBucketSyncFlowManager::pipe_set **_targets) { /* return raw pipes (with zone name) */
-    *_sources = &source_pipes;
-    *_targets = &target_pipes;
-  }
-  void get_pipes(std::set<rgw_sync_bucket_pipe> *sources, std::set<rgw_sync_bucket_pipe> *targets,
-                 std::optional<rgw_sync_bucket_entity> filter_peer);
-
-  const std::set<rgw_bucket>& get_source_hints() const {
-    return source_hints;
-  }
-
-  const std::set<rgw_bucket>& get_target_hints() const {
-    return target_hints;
-  }
-
-  bool bucket_exports_data() const;
-  bool bucket_imports_data() const;
-
-  const rgw_sync_policy_info& get_sync_policy() const {
-    return sync_policy;
-  }
-
-  bool is_legacy_config() const {
-    return legacy_config;
-  }
-};
-
diff --git a/src/rgw/store/rados/rgw_cr_rados.cc b/src/rgw/store/rados/rgw_cr_rados.cc
deleted file mode 100644 (file)
index 0507972..0000000
+++ /dev/null
@@ -1,1138 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "include/compat.h"
-#include "rgw_sal.h"
-#include "rgw_zone.h"
-#include "rgw_coroutine.h"
-#include "rgw_cr_rados.h"
-#include "rgw_sync_counters.h"
-#include "rgw_bucket.h"
-#include "rgw_datalog_notify.h"
-#include "rgw_cr_rest.h"
-#include "rgw_rest_conn.h"
-#include "rgw_rados.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_zone_utils.h"
-#include "services/svc_sys_obj.h"
-#include "services/svc_cls.h"
-
-#include "cls/lock/cls_lock_client.h"
-#include "cls/rgw/cls_rgw_client.h"
-
-#include <boost/asio/yield.hpp>
-#include <boost/container/flat_set.hpp>
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-bool RGWAsyncRadosProcessor::RGWWQ::_enqueue(RGWAsyncRadosRequest *req) {
-  if (processor->is_going_down()) {
-    return false;
-  }
-  req->get();
-  processor->m_req_queue.push_back(req);
-  dout(20) << "enqueued request req=" << hex << req << dec << dendl;
-  _dump_queue();
-  return true;
-}
-
-bool RGWAsyncRadosProcessor::RGWWQ::_empty() {
-  return processor->m_req_queue.empty();
-}
-
-RGWAsyncRadosRequest *RGWAsyncRadosProcessor::RGWWQ::_dequeue() {
-  if (processor->m_req_queue.empty())
-    return NULL;
-  RGWAsyncRadosRequest *req = processor->m_req_queue.front();
-  processor->m_req_queue.pop_front();
-  dout(20) << "dequeued request req=" << hex << req << dec << dendl;
-  _dump_queue();
-  return req;
-}
-
-void RGWAsyncRadosProcessor::RGWWQ::_process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) {
-  processor->handle_request(this, req);
-  processor->req_throttle.put(1);
-}
-
-void RGWAsyncRadosProcessor::RGWWQ::_dump_queue() {
-  if (!g_conf()->subsys.should_gather<ceph_subsys_rgw, 20>()) {
-    return;
-  }
-  deque<RGWAsyncRadosRequest *>::iterator iter;
-  if (processor->m_req_queue.empty()) {
-    dout(20) << "RGWWQ: empty" << dendl;
-    return;
-  }
-  dout(20) << "RGWWQ:" << dendl;
-  for (iter = processor->m_req_queue.begin(); iter != processor->m_req_queue.end(); ++iter) {
-    dout(20) << "req: " << hex << *iter << dec << dendl;
-  }
-}
-
-RGWAsyncRadosProcessor::RGWAsyncRadosProcessor(CephContext *_cct, int num_threads)
-  : cct(_cct), m_tp(cct, "RGWAsyncRadosProcessor::m_tp", "rados_async", num_threads),
-    req_throttle(_cct, "rgw_async_rados_ops", num_threads * 2),
-    req_wq(this,
-          ceph::make_timespan(g_conf()->rgw_op_thread_timeout),
-          ceph::make_timespan(g_conf()->rgw_op_thread_suicide_timeout),
-          &m_tp) {
-}
-
-void RGWAsyncRadosProcessor::start() {
-  m_tp.start();
-}
-
-void RGWAsyncRadosProcessor::stop() {
-  going_down = true;
-  m_tp.drain(&req_wq);
-  m_tp.stop();
-  for (auto iter = m_req_queue.begin(); iter != m_req_queue.end(); ++iter) {
-    (*iter)->put();
-  }
-}
-
-void RGWAsyncRadosProcessor::handle_request(const DoutPrefixProvider *dpp, RGWAsyncRadosRequest *req) {
-  req->send_request(dpp);
-  req->put();
-}
-
-void RGWAsyncRadosProcessor::queue(RGWAsyncRadosRequest *req) {
-  req_throttle.get(1);
-  req_wq.queue(req);
-}
-
-int RGWAsyncGetSystemObj::_send_request(const DoutPrefixProvider *dpp)
-{
-  map<string, bufferlist> *pattrs = want_attrs ? &attrs : nullptr;
-
-  auto sysobj = svc_sysobj->get_obj(obj);
-  return sysobj.rop()
-               .set_objv_tracker(&objv_tracker)
-               .set_attrs(pattrs)
-              .set_raw_attrs(raw_attrs)
-               .read(dpp, &bl, null_yield);
-}
-
-RGWAsyncGetSystemObj::RGWAsyncGetSystemObj(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
-                       RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
-                       bool want_attrs, bool raw_attrs)
-  : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc_sysobj(_svc),
-    obj(_obj), want_attrs(want_attrs), raw_attrs(raw_attrs)
-{
-  if (_objv_tracker) {
-    objv_tracker = *_objv_tracker;
-  }
-}
-
-int RGWSimpleRadosReadAttrsCR::send_request(const DoutPrefixProvider *dpp)
-{
-  req = new RGWAsyncGetSystemObj(dpp, this, stack->create_completion_notifier(),
-                                svc, objv_tracker, obj, true, raw_attrs);
-  async_rados->queue(req);
-  return 0;
-}
-
-int RGWSimpleRadosReadAttrsCR::request_complete()
-{
-  if (pattrs) {
-    *pattrs = std::move(req->attrs);
-  }
-  if (objv_tracker) {
-    *objv_tracker = req->objv_tracker;
-  }
-  return req->get_ret_status();
-}
-
-int RGWAsyncPutSystemObj::_send_request(const DoutPrefixProvider *dpp)
-{
-  auto sysobj = svc->get_obj(obj);
-  return sysobj.wop()
-               .set_objv_tracker(&objv_tracker)
-               .set_exclusive(exclusive)
-               .write_data(dpp, bl, null_yield);
-}
-
-RGWAsyncPutSystemObj::RGWAsyncPutSystemObj(const DoutPrefixProvider *_dpp, 
-                     RGWCoroutine *caller, 
-                     RGWAioCompletionNotifier *cn,
-                     RGWSI_SysObj *_svc,
-                     RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
-                     bool _exclusive, bufferlist _bl)
-  : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc(_svc),
-    obj(_obj), exclusive(_exclusive), bl(std::move(_bl))
-{
-  if (_objv_tracker) {
-    objv_tracker = *_objv_tracker;
-  }
-}
-
-int RGWAsyncPutSystemObjAttrs::_send_request(const DoutPrefixProvider *dpp)
-{
-  auto sysobj = svc->get_obj(obj);
-  return sysobj.wop()
-               .set_objv_tracker(&objv_tracker)
-               .set_exclusive(exclusive)
-               .set_attrs(attrs)
-               .write_attrs(dpp, null_yield);
-}
-
-RGWAsyncPutSystemObjAttrs::RGWAsyncPutSystemObjAttrs(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
-                     RGWSI_SysObj *_svc,
-                     RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
-                     map<string, bufferlist> _attrs, bool exclusive)
-  : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc(_svc),
-    obj(_obj), attrs(std::move(_attrs)), exclusive(exclusive)
-{
-  if (_objv_tracker) {
-    objv_tracker = *_objv_tracker;
-  }
-}
-
-
-RGWOmapAppend::RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
-                             uint64_t _window_size)
-                      : RGWConsumerCR<string>(_store->ctx()), async_rados(_async_rados),
-                        store(_store), obj(_obj), going_down(false), num_pending_entries(0), window_size(_window_size), total_entries(0)
-{
-}
-
-int RGWAsyncLockSystemObj::_send_request(const DoutPrefixProvider *dpp)
-{
-  rgw_rados_ref ref;
-  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
-    return r;
-  }
-
-  rados::cls::lock::Lock l(lock_name);
-  utime_t duration(duration_secs, 0);
-  l.set_duration(duration);
-  l.set_cookie(cookie);
-  l.set_may_renew(true);
-
-  return l.lock_exclusive(&ref.pool.ioctx(), ref.obj.oid);
-}
-
-RGWAsyncLockSystemObj::RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
-                      RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
-                       const string& _name, const string& _cookie, uint32_t _duration_secs) : RGWAsyncRadosRequest(caller, cn), store(_store),
-                                                              obj(_obj),
-                                                              lock_name(_name),
-                                                              cookie(_cookie),
-                                                              duration_secs(_duration_secs)
-{
-}
-
-int RGWAsyncUnlockSystemObj::_send_request(const DoutPrefixProvider *dpp)
-{
-  rgw_rados_ref ref;
-  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
-    return r;
-  }
-
-  rados::cls::lock::Lock l(lock_name);
-
-  l.set_cookie(cookie);
-
-  return l.unlock(&ref.pool.ioctx(), ref.obj.oid);
-}
-
-RGWAsyncUnlockSystemObj::RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
-                                                 RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
-                                                 const string& _name, const string& _cookie) : RGWAsyncRadosRequest(caller, cn), store(_store),
-  obj(_obj),
-  lock_name(_name), cookie(_cookie)
-{
-}
-
-RGWRadosSetOmapKeysCR::RGWRadosSetOmapKeysCR(rgw::sal::RadosStore* _store,
-                      const rgw_raw_obj& _obj,
-                      map<string, bufferlist>& _entries) : RGWSimpleCoroutine(_store->ctx()),
-                                                store(_store),
-                                                entries(_entries),
-                                                obj(_obj), cn(NULL)
-{
-  stringstream& s = set_description();
-  s << "set omap keys dest=" << obj << " keys=[" << s.str() << "]";
-  for (auto i = entries.begin(); i != entries.end(); ++i) {
-    if (i != entries.begin()) {
-      s << ", ";
-    }
-    s << i->first;
-  }
-  s << "]";
-}
-
-int RGWRadosSetOmapKeysCR::send_request(const DoutPrefixProvider *dpp)
-{
-  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
-    return r;
-  }
-
-  set_status() << "sending request";
-
-  librados::ObjectWriteOperation op;
-  op.omap_set(entries);
-
-  cn = stack->create_completion_notifier();
-  return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
-}
-
-int RGWRadosSetOmapKeysCR::request_complete()
-{
-  int r = cn->completion()->get_return_value();
-
-  set_status() << "request complete; ret=" << r;
-
-  return r;
-}
-
-RGWRadosGetOmapKeysCR::RGWRadosGetOmapKeysCR(rgw::sal::RadosStore* _store,
-                      const rgw_raw_obj& _obj,
-                      const string& _marker,
-                      int _max_entries,
-                      ResultPtr _result)
-  : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj),
-    marker(_marker), max_entries(_max_entries),
-    result(std::move(_result))
-{
-  ceph_assert(result); // must be allocated
-  set_description() << "get omap keys dest=" << obj << " marker=" << marker;
-}
-
-int RGWRadosGetOmapKeysCR::send_request(const DoutPrefixProvider *dpp) {
-  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &result->ref);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
-    return r;
-  }
-
-  set_status() << "send request";
-
-  librados::ObjectReadOperation op;
-  op.omap_get_keys2(marker, max_entries, &result->entries, &result->more, nullptr);
-
-  cn = stack->create_completion_notifier(result);
-  return result->ref.pool.ioctx().aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL);
-}
-
-int RGWRadosGetOmapKeysCR::request_complete()
-{
-  int r = cn->completion()->get_return_value();
-
-  set_status() << "request complete; ret=" << r;
-
-  return r;
-}
-
-RGWRadosGetOmapValsCR::RGWRadosGetOmapValsCR(rgw::sal::RadosStore* _store,
-                      const rgw_raw_obj& _obj,
-                      const string& _marker,
-                      int _max_entries,
-                      ResultPtr _result)
-  : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj),
-    marker(_marker), max_entries(_max_entries),
-    result(std::move(_result))
-{
-  ceph_assert(result); // must be allocated
-  set_description() << "get omap keys dest=" << obj << " marker=" << marker;
-}
-
-int RGWRadosGetOmapValsCR::send_request(const DoutPrefixProvider *dpp) {
-  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &result->ref);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
-    return r;
-  }
-
-  set_status() << "send request";
-
-  librados::ObjectReadOperation op;
-  op.omap_get_vals2(marker, max_entries, &result->entries, &result->more, nullptr);
-
-  cn = stack->create_completion_notifier(result);
-  return result->ref.pool.ioctx().aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL);
-}
-
-int RGWRadosGetOmapValsCR::request_complete()
-{
-  int r = cn->completion()->get_return_value();
-
-  set_status() << "request complete; ret=" << r;
-
-  return r;
-}
-
-RGWRadosRemoveOmapKeysCR::RGWRadosRemoveOmapKeysCR(rgw::sal::RadosStore* _store,
-                      const rgw_raw_obj& _obj,
-                      const set<string>& _keys) : RGWSimpleCoroutine(_store->ctx()),
-                                                store(_store),
-                                                keys(_keys),
-                                                obj(_obj), cn(NULL)
-{
-  set_description() << "remove omap keys dest=" << obj << " keys=" << keys;
-}
-
-int RGWRadosRemoveOmapKeysCR::send_request(const DoutPrefixProvider *dpp) {
-  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
-    return r;
-  }
-
-  set_status() << "send request";
-
-  librados::ObjectWriteOperation op;
-  op.omap_rm_keys(keys);
-
-  cn = stack->create_completion_notifier();
-  return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
-}
-
-int RGWRadosRemoveOmapKeysCR::request_complete()
-{
-  int r = cn->completion()->get_return_value();
-
-  set_status() << "request complete; ret=" << r;
-
-  return r;
-}
-
-RGWRadosRemoveCR::RGWRadosRemoveCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
-                                   RGWObjVersionTracker* objv_tracker)
-  : RGWSimpleCoroutine(store->ctx()),
-    store(store), obj(obj), objv_tracker(objv_tracker)
-{
-  set_description() << "remove dest=" << obj;
-}
-
-int RGWRadosRemoveCR::send_request(const DoutPrefixProvider *dpp)
-{
-  auto rados = store->getRados()->get_rados_handle();
-  int r = rados->ioctx_create(obj.pool.name.c_str(), ioctx);
-  if (r < 0) {
-    lderr(cct) << "ERROR: failed to open pool (" << obj.pool.name << ") ret=" << r << dendl;
-    return r;
-  }
-  ioctx.locator_set_key(obj.loc);
-
-  set_status() << "send request";
-
-  librados::ObjectWriteOperation op;
-  if (objv_tracker) {
-    objv_tracker->prepare_op_for_write(&op);
-  }
-  op.remove();
-
-  cn = stack->create_completion_notifier();
-  return ioctx.aio_operate(obj.oid, cn->completion(), &op);
-}
-
-int RGWRadosRemoveCR::request_complete()
-{
-  int r = cn->completion()->get_return_value();
-
-  set_status() << "request complete; ret=" << r;
-
-  return r;
-}
-
-RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
-                                        librados::IoCtx&& ioctx,
-                                        std::string_view oid,
-                                        RGWObjVersionTracker* objv_tracker)
-  : RGWSimpleCoroutine(store->ctx()), ioctx(std::move(ioctx)),
-    oid(std::string(oid)), objv_tracker(objv_tracker)
-{
-  set_description() << "remove dest=" << oid;
-}
-
-RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
-                                        RGWSI_RADOS::Obj& obj,
-                                        RGWObjVersionTracker* objv_tracker)
-  : RGWSimpleCoroutine(store->ctx()),
-    ioctx(librados::IoCtx(obj.get_ref().pool.ioctx())),
-    oid(obj.get_ref().obj.oid),
-    objv_tracker(objv_tracker)
-{
-  set_description() << "remove dest=" << oid;
-}
-
-RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
-                                        RGWSI_RADOS::Obj&& obj,
-                                        RGWObjVersionTracker* objv_tracker)
-  : RGWSimpleCoroutine(store->ctx()),
-    ioctx(std::move(obj.get_ref().pool.ioctx())),
-    oid(std::move(obj.get_ref().obj.oid)),
-    objv_tracker(objv_tracker)
-{
-  set_description() << "remove dest=" << oid;
-}
-
-int RGWRadosRemoveOidCR::send_request(const DoutPrefixProvider *dpp)
-{
-  librados::ObjectWriteOperation op;
-  if (objv_tracker) {
-    objv_tracker->prepare_op_for_write(&op);
-  }
-  op.remove();
-
-  cn = stack->create_completion_notifier();
-  return ioctx.aio_operate(oid, cn->completion(), &op);
-}
-
-int RGWRadosRemoveOidCR::request_complete()
-{
-  int r = cn->completion()->get_return_value();
-
-  set_status() << "request complete; ret=" << r;
-
-  return r;
-}
-
-RGWSimpleRadosLockCR::RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
-                      const rgw_raw_obj& _obj,
-                      const string& _lock_name,
-                      const string& _cookie,
-                      uint32_t _duration) : RGWSimpleCoroutine(_store->ctx()),
-                                                async_rados(_async_rados),
-                                                store(_store),
-                                                lock_name(_lock_name),
-                                                cookie(_cookie),
-                                                duration(_duration),
-                                                obj(_obj),
-                                                req(NULL)
-{
-  set_description() << "rados lock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie << " duration=" << duration;
-}
-
-void RGWSimpleRadosLockCR::request_cleanup()
-{
-  if (req) {
-    req->finish();
-    req = NULL;
-  }
-}
-
-int RGWSimpleRadosLockCR::send_request(const DoutPrefixProvider *dpp)
-{
-  set_status() << "sending request";
-  req = new RGWAsyncLockSystemObj(this, stack->create_completion_notifier(),
-                                 store, NULL, obj, lock_name, cookie, duration);
-  async_rados->queue(req);
-  return 0;
-}
-
-int RGWSimpleRadosLockCR::request_complete()
-{
-  set_status() << "request complete; ret=" << req->get_ret_status();
-  return req->get_ret_status();
-}
-
-RGWSimpleRadosUnlockCR::RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
-                      const rgw_raw_obj& _obj,
-                      const string& _lock_name,
-                      const string& _cookie) : RGWSimpleCoroutine(_store->ctx()),
-                                                async_rados(_async_rados),
-                                                store(_store),
-                                                lock_name(_lock_name),
-                                                cookie(_cookie),
-                                                obj(_obj),
-                                                req(NULL)
-{
-  set_description() << "rados unlock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie;
-}
-
-void RGWSimpleRadosUnlockCR::request_cleanup()
-{
-  if (req) {
-    req->finish();
-    req = NULL;
-  }
-}
-
-int RGWSimpleRadosUnlockCR::send_request(const DoutPrefixProvider *dpp)
-{
-  set_status() << "sending request";
-
-  req = new RGWAsyncUnlockSystemObj(this, stack->create_completion_notifier(),
-                                 store, NULL, obj, lock_name, cookie);
-  async_rados->queue(req);
-  return 0;
-}
-
-int RGWSimpleRadosUnlockCR::request_complete()
-{
-  set_status() << "request complete; ret=" << req->get_ret_status();
-  return req->get_ret_status();
-}
-
-int RGWOmapAppend::operate(const DoutPrefixProvider *dpp) {
-  reenter(this) {
-    for (;;) {
-      if (!has_product() && going_down) {
-        set_status() << "going down";
-        break;
-      }
-      set_status() << "waiting for product";
-      yield wait_for_product();
-      yield {
-        string entry;
-        while (consume(&entry)) {
-          set_status() << "adding entry: " << entry;
-          entries[entry] = bufferlist();
-          if (entries.size() >= window_size) {
-            break;
-          }
-        }
-        if (entries.size() >= window_size || going_down) {
-          set_status() << "flushing to omap";
-          call(new RGWRadosSetOmapKeysCR(store, obj, entries));
-          entries.clear();
-        }
-      }
-      if (get_ret_status() < 0) {
-        ldout(cct, 0) << "ERROR: failed to store entries in omap" << dendl;
-        return set_state(RGWCoroutine_Error);
-      }
-    }
-    /* done with coroutine */
-    return set_state(RGWCoroutine_Done);
-  }
-  return 0;
-}
-
-void RGWOmapAppend::flush_pending() {
-  receive(pending_entries);
-  num_pending_entries = 0;
-}
-
-bool RGWOmapAppend::append(const string& s) {
-  if (is_done()) {
-    return false;
-  }
-  ++total_entries;
-  pending_entries.push_back(s);
-  if (++num_pending_entries >= (int)window_size) {
-    flush_pending();
-  }
-  return true;
-}
-
-bool RGWOmapAppend::finish() {
-  going_down = true;
-  flush_pending();
-  set_sleeping(false);
-  return (!is_done());
-}
-
-int RGWAsyncGetBucketInstanceInfo::_send_request(const DoutPrefixProvider *dpp)
-{
-  int r;
-  if (!bucket.bucket_id.empty()) {
-    r = store->getRados()->get_bucket_instance_info(bucket, bucket_info, nullptr, &attrs, null_yield, dpp);
-  } else {
-    r = store->ctl()->bucket->read_bucket_info(bucket, &bucket_info, null_yield, dpp,
-                                               RGWBucketCtl::BucketInstance::GetParams().set_attrs(&attrs));
-  }
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to get bucket instance info for "
-        << bucket << dendl;
-    return r;
-  }
-
-  return 0;
-}
-
-int RGWAsyncPutBucketInstanceInfo::_send_request(const DoutPrefixProvider *dpp)
-{
-  auto r = store->getRados()->put_bucket_instance_info(bucket_info, exclusive,
-                                                      mtime, attrs, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to put bucket instance info for "
-                     << bucket_info.bucket << dendl;
-    return r;
-  }
-
-  return 0;
-}
-
-RGWRadosBILogTrimCR::RGWRadosBILogTrimCR(
-  const DoutPrefixProvider *dpp,
-  rgw::sal::RadosStore* store,
-  const RGWBucketInfo& bucket_info,
-  int shard_id,
-  const rgw::bucket_index_layout_generation& generation,
-  const std::string& start_marker,
-  const std::string& end_marker)
-  : RGWSimpleCoroutine(store->ctx()), bucket_info(bucket_info),
-    shard_id(shard_id), generation(generation), bs(store->getRados()),
-    start_marker(BucketIndexShardsManager::get_shard_marker(start_marker)),
-    end_marker(BucketIndexShardsManager::get_shard_marker(end_marker))
-{
-}
-
-int RGWRadosBILogTrimCR::send_request(const DoutPrefixProvider *dpp)
-{
-  int r = bs.init(dpp, bucket_info, generation, shard_id);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: bucket shard init failed ret=" << r << dendl;
-    return r;
-  }
-
-  bufferlist in;
-  cls_rgw_bi_log_trim_op call;
-  call.start_marker = std::move(start_marker);
-  call.end_marker = std::move(end_marker);
-  encode(call, in);
-
-  librados::ObjectWriteOperation op;
-  op.exec(RGW_CLASS, RGW_BI_LOG_TRIM, in);
-
-  cn = stack->create_completion_notifier();
-  return bs.bucket_obj.aio_operate(cn->completion(), &op);
-}
-
-int RGWRadosBILogTrimCR::request_complete()
-{
-  int r = cn->completion()->get_return_value();
-  set_status() << "request complete; ret=" << r;
-  return r;
-}
-
-int RGWAsyncFetchRemoteObj::_send_request(const DoutPrefixProvider *dpp)
-{
-  RGWObjectCtx obj_ctx(store);
-
-  char buf[16];
-  snprintf(buf, sizeof(buf), ".%lld", (long long)store->getRados()->instance_id());
-  rgw::sal::Attrs attrs;
-
-  rgw::sal::RadosBucket bucket(store, src_bucket);
-  rgw::sal::RadosObject src_obj(store, key, &bucket);
-  rgw::sal::RadosBucket dest_bucket(store, dest_bucket_info);
-  rgw::sal::RadosObject dest_obj(store, dest_key.value_or(key), &dest_bucket);
-    
-  std::string etag;
-
-  std::optional<uint64_t> bytes_transferred;
-  int r = store->getRados()->fetch_remote_obj(obj_ctx,
-                       user_id.value_or(rgw_user()),
-                       NULL, /* req_info */
-                       source_zone,
-                       &dest_obj,
-                       &src_obj,
-                       &dest_bucket, /* dest */
-                       nullptr, /* source */
-                       dest_placement_rule,
-                       nullptr, /* real_time* src_mtime, */
-                       NULL, /* real_time* mtime, */
-                       NULL, /* const real_time* mod_ptr, */
-                       NULL, /* const real_time* unmod_ptr, */
-                       false, /* high precision time */
-                       NULL, /* const char *if_match, */
-                       NULL, /* const char *if_nomatch, */
-                       RGWRados::ATTRSMOD_NONE,
-                       copy_if_newer,
-                       attrs,
-                       RGWObjCategory::Main,
-                       versioned_epoch,
-                       real_time(), /* delete_at */
-                       NULL, /* string *ptag, */
-                       &etag, /* string *petag, */
-                       NULL, /* void (*progress_cb)(off_t, void *), */
-                       NULL, /* void *progress_data*); */
-                       dpp,
-                       filter.get(),
-                       &zones_trace,
-                       &bytes_transferred);
-
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "store->fetch_remote_obj() returned r=" << r << dendl;
-    if (counters) {
-      counters->inc(sync_counters::l_fetch_err, 1);
-    }
-  } else {
-      // r >= 0
-      if (bytes_transferred) {
-        // send notification that object was succesfully synced
-        std::string user_id = "rgw sync";
-        std::string req_id = "0";
-                       
-        RGWObjTags obj_tags;
-        auto iter = attrs.find(RGW_ATTR_TAGS);
-        if (iter != attrs.end()) {
-          try {
-            auto it = iter->second.cbegin();
-            obj_tags.decode(it);
-          } catch (buffer::error &err) {
-            ldpp_dout(dpp, 1) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
-          }
-        }
-
-        // NOTE: we create a mutable copy of bucket.get_tenant as the get_notification function expects a std::string&, not const
-        std::string tenant(dest_bucket.get_tenant());
-
-        std::unique_ptr<rgw::sal::Notification> notify 
-                 = store->get_notification(dpp, &dest_obj, nullptr, rgw::notify::ObjectSyncedCreate,
-                  &dest_bucket, user_id,
-                  tenant,
-                  req_id, null_yield);
-
-        auto notify_res = static_cast<rgw::sal::RadosNotification*>(notify.get())->get_reservation();
-        int ret = rgw::notify::publish_reserve(dpp, rgw::notify::ObjectSyncedCreate, notify_res, &obj_tags);
-        if (ret < 0) {
-          ldpp_dout(dpp, 1) << "ERROR: reserving notification failed, with error: " << ret << dendl;
-          // no need to return, the sync already happened
-        } else {
-          ret = rgw::notify::publish_commit(&dest_obj, dest_obj.get_obj_size(), ceph::real_clock::now(), etag, dest_obj.get_instance(), rgw::notify::ObjectSyncedCreate, notify_res, dpp);
-          if (ret < 0) {
-            ldpp_dout(dpp, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
-          }
-        }
-      }
-      
-      if (counters) {
-        if (bytes_transferred) {
-          counters->inc(sync_counters::l_fetch, *bytes_transferred);
-        } else {
-          counters->inc(sync_counters::l_fetch_not_modified);
-        }
-      }
-  }
-  return r;
-}
-
-int RGWAsyncStatRemoteObj::_send_request(const DoutPrefixProvider *dpp)
-{
-  RGWObjectCtx obj_ctx(store);
-
-  string user_id;
-  char buf[16];
-  snprintf(buf, sizeof(buf), ".%lld", (long long)store->getRados()->instance_id());
-
-  rgw::sal::RadosBucket bucket(store, src_bucket);
-  rgw::sal::RadosObject src_obj(store, key, &bucket);
-
-  int r = store->getRados()->stat_remote_obj(dpp,
-                       obj_ctx,
-                       rgw_user(user_id),
-                       nullptr, /* req_info */
-                       source_zone,
-                       &src_obj,
-                       nullptr, /* source */
-                       pmtime, /* real_time* src_mtime, */
-                       psize, /* uint64_t * */
-                       nullptr, /* const real_time* mod_ptr, */
-                       nullptr, /* const real_time* unmod_ptr, */
-                       true, /* high precision time */
-                       nullptr, /* const char *if_match, */
-                       nullptr, /* const char *if_nomatch, */
-                       pattrs,
-                       pheaders,
-                       nullptr,
-                       nullptr, /* string *ptag, */
-                       petag); /* string *petag, */
-
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "store->stat_remote_obj() returned r=" << r << dendl;
-  }
-  return r;
-}
-
-
-int RGWAsyncRemoveObj::_send_request(const DoutPrefixProvider *dpp)
-{
-  ldpp_dout(dpp, 0) << __func__ << "(): deleting obj=" << obj << dendl;
-
-  obj->set_atomic();
-
-  RGWObjState *state;
-
-  int ret = obj->get_obj_state(dpp, &state, null_yield);
-  if (ret < 0) {
-    ldpp_dout(dpp, 20) << __func__ << "(): get_obj_state() obj=" << obj << " returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  /* has there been any racing object write? */
-  if (del_if_older && (state->mtime > timestamp)) {
-    ldpp_dout(dpp, 20) << __func__ << "(): skipping object removal obj=" << obj << " (obj mtime=" << state->mtime << ", request timestamp=" << timestamp << ")" << dendl;
-    return 0;
-  }
-
-  RGWAccessControlPolicy policy;
-
-  /* decode policy */
-  map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_ACL);
-  if (iter != state->attrset.end()) {
-    auto bliter = iter->second.cbegin();
-    try {
-      policy.decode(bliter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
-      return -EIO;
-    }
-  }
-
-  std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = obj->get_delete_op();
-
-  del_op->params.bucket_owner = bucket->get_info().owner;
-  del_op->params.obj_owner = policy.get_owner();
-  if (del_if_older) {
-    del_op->params.unmod_since = timestamp;
-  }
-  if (versioned) {
-    del_op->params.versioning_status = BUCKET_VERSIONED;
-  }
-  del_op->params.olh_epoch = versioned_epoch;
-  del_op->params.marker_version_id = marker_version_id;
-  del_op->params.obj_owner.set_id(rgw_user(owner));
-  del_op->params.obj_owner.set_name(owner_display_name);
-  del_op->params.mtime = timestamp;
-  del_op->params.high_precision_time = true;
-  del_op->params.zones_trace = &zones_trace;
-
-  ret = del_op->delete_obj(dpp, null_yield);
-  if (ret < 0) {
-    ldpp_dout(dpp, 20) << __func__ << "(): delete_obj() obj=" << obj << " returned ret=" << ret << dendl;
-  }
-  return ret;
-}
-
-int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp)
-{
-  if (aborted) {
-    caller->set_sleeping(false);
-    return set_cr_done();
-  }
-  reenter(this) {
-    last_renew_try_time = ceph::coarse_mono_clock::now();
-    while (!going_down) {
-      yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval));
-      current_time = ceph::coarse_mono_clock::now();
-      if (current_time - last_renew_try_time > interval_tolerance) {
-        // renewal should happen between 50%-90% of interval
-        ldout(store->ctx(), 1) << *this << ": WARNING: did not renew lock " << obj << ":" << lock_name << ": within 90\% of interval. " << 
-          (current_time - last_renew_try_time) << " > " << interval_tolerance << dendl;
-      }
-      last_renew_try_time = current_time;
-
-      caller->set_sleeping(false); /* will only be relevant when we return, that's why we can do it early */
-      if (retcode < 0) {
-        set_locked(false);
-        ldout(store->ctx(), 20) << *this << ": couldn't lock " << obj << ":" << lock_name << ": retcode=" << retcode << dendl;
-        return set_state(RGWCoroutine_Error, retcode);
-      }
-      ldout(store->ctx(), 20) << *this << ": successfully locked " << obj << ":" << lock_name << dendl;
-      set_locked(true);
-      yield wait(utime_t(interval / 2, 0));
-    }
-    set_locked(false); /* moot at this point anyway */
-    yield call(new RGWSimpleRadosUnlockCR(async_rados, store, obj, lock_name, cookie));
-    return set_state(RGWCoroutine_Done);
-  }
-  return 0;
-}
-
-RGWRadosTimelogAddCR::RGWRadosTimelogAddCR(const DoutPrefixProvider *_dpp, rgw::sal::RadosStore* _store, const string& _oid,
-                      const cls_log_entry& entry) : RGWSimpleCoroutine(_store->ctx()),
-                                                dpp(_dpp),
-                                                store(_store),
-                                                oid(_oid), cn(NULL)
-{
-  stringstream& s = set_description();
-  s << "timelog add entry oid=" <<  oid << "entry={id=" << entry.id << ", section=" << entry.section << ", name=" << entry.name << "}";
-  entries.push_back(entry);
-}
-
-int RGWRadosTimelogAddCR::send_request(const DoutPrefixProvider *dpp)
-{
-  set_status() << "sending request";
-
-  cn = stack->create_completion_notifier();
-  return store->svc()->cls->timelog.add(dpp, oid, entries, cn->completion(), true, null_yield);
-}
-
-int RGWRadosTimelogAddCR::request_complete()
-{
-  int r = cn->completion()->get_return_value();
-
-  set_status() << "request complete; ret=" << r;
-
-  return r;
-}
-
-RGWRadosTimelogTrimCR::RGWRadosTimelogTrimCR(const DoutPrefixProvider *dpp,
-                                             rgw::sal::RadosStore* store,
-                                             const std::string& oid,
-                                             const real_time& start_time,
-                                             const real_time& end_time,
-                                             const std::string& from_marker,
-                                             const std::string& to_marker)
-  : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), oid(oid),
-    start_time(start_time), end_time(end_time),
-    from_marker(from_marker), to_marker(to_marker)
-{
-  set_description() << "timelog trim oid=" <<  oid
-      << " start_time=" << start_time << " end_time=" << end_time
-      << " from_marker=" << from_marker << " to_marker=" << to_marker;
-}
-
-int RGWRadosTimelogTrimCR::send_request(const DoutPrefixProvider *dpp)
-{
-  set_status() << "sending request";
-
-  cn = stack->create_completion_notifier();
-  return store->svc()->cls->timelog.trim(dpp, oid, start_time, end_time, from_marker,
-                                      to_marker, cn->completion(),
-                                      null_yield);
-}
-
-int RGWRadosTimelogTrimCR::request_complete()
-{
-  int r = cn->completion()->get_return_value();
-
-  set_status() << "request complete; ret=" << r;
-
-  return r;
-}
-
-
-RGWSyncLogTrimCR::RGWSyncLogTrimCR(const DoutPrefixProvider *dpp,
-                                   rgw::sal::RadosStore* store, const std::string& oid,
-                                   const std::string& to_marker,
-                                   std::string *last_trim_marker)
-  : RGWRadosTimelogTrimCR(dpp, store, oid, real_time{}, real_time{},
-                          std::string{}, to_marker),
-    cct(store->ctx()), last_trim_marker(last_trim_marker)
-{
-}
-
-int RGWSyncLogTrimCR::request_complete()
-{
-  int r = RGWRadosTimelogTrimCR::request_complete();
-  if (r != -ENODATA) {
-    return r;
-  }
-  // nothing left to trim, update last_trim_marker
-  if (*last_trim_marker < to_marker && to_marker != max_marker) {
-    *last_trim_marker = to_marker;
-  }
-  return 0;
-}
-
-
-int RGWAsyncStatObj::_send_request(const DoutPrefixProvider *dpp)
-{
-  rgw_raw_obj raw_obj;
-  store->getRados()->obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
-  return store->getRados()->raw_obj_stat(dpp, raw_obj, psize, pmtime, pepoch,
-                             nullptr, nullptr, objv_tracker, null_yield);
-}
-
-RGWStatObjCR::RGWStatObjCR(const DoutPrefixProvider *dpp,
-                           RGWAsyncRadosProcessor *async_rados, rgw::sal::RadosStore* store,
-                           const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize,
-                           real_time* pmtime, uint64_t *pepoch,
-                           RGWObjVersionTracker *objv_tracker)
-  : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), async_rados(async_rados),
-    bucket_info(_bucket_info), obj(obj), psize(psize), pmtime(pmtime), pepoch(pepoch),
-    objv_tracker(objv_tracker)
-{
-}
-
-void RGWStatObjCR::request_cleanup()
-{
-  if (req) {
-    req->finish();
-    req = NULL;
-  }
-}
-
-int RGWStatObjCR::send_request(const DoutPrefixProvider *dpp)
-{
-  req = new RGWAsyncStatObj(dpp, this, stack->create_completion_notifier(),
-                            store, bucket_info, obj, psize, pmtime, pepoch, objv_tracker);
-  async_rados->queue(req);
-  return 0;
-}
-
-int RGWStatObjCR::request_complete()
-{
-  return req->get_ret_status();
-}
-
-RGWRadosNotifyCR::RGWRadosNotifyCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
-                                   bufferlist& request, uint64_t timeout_ms,
-                                   bufferlist *response)
-  : RGWSimpleCoroutine(store->ctx()), store(store), obj(obj),
-    request(request), timeout_ms(timeout_ms), response(response)
-{
-  set_description() << "notify dest=" << obj;
-}
-
-int RGWRadosNotifyCR::send_request(const DoutPrefixProvider *dpp)
-{
-  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
-    return r;
-  }
-
-  set_status() << "sending request";
-
-  cn = stack->create_completion_notifier();
-  return ref.pool.ioctx().aio_notify(ref.obj.oid, cn->completion(), request,
-                              timeout_ms, response);
-}
-
-int RGWRadosNotifyCR::request_complete()
-{
-  int r = cn->completion()->get_return_value();
-
-  set_status() << "request complete; ret=" << r;
-
-  return r;
-}
-
-
-int RGWDataPostNotifyCR::operate(const DoutPrefixProvider* dpp)
-{
-  reenter(this) {
-    using PostNotify2 = RGWPostRESTResourceCR<bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>, int>;
-    yield {
-      rgw_http_param_pair pairs[] = { { "type", "data" },
-                                      { "notify2", NULL },
-                                      { "source-zone", source_zone },
-                                      { NULL, NULL } };
-      call(new PostNotify2(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, nullptr));
-    }
-    if (retcode == -ERR_METHOD_NOT_ALLOWED) {
-      using PostNotify1 = RGWPostRESTResourceCR<rgw_data_notify_v1_encoder, int>;
-      yield {
-        rgw_http_param_pair pairs[] = { { "type", "data" },
-                                        { "notify", NULL },
-                                        { "source-zone", source_zone },
-                                        { NULL, NULL } };
-        auto encoder = rgw_data_notify_v1_encoder{shards};
-        call(new PostNotify1(store->ctx(), conn, &http_manager, "/admin/log", pairs, encoder, nullptr));
-      }
-    }
-    if (retcode < 0) {
-      return set_cr_error(retcode);
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
diff --git a/src/rgw/store/rados/rgw_cr_rados.h b/src/rgw/store/rados/rgw_cr_rados.h
deleted file mode 100644 (file)
index 03c5303..0000000
+++ /dev/null
@@ -1,1595 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_CR_RADOS_H
-#define CEPH_RGW_CR_RADOS_H
-
-#include <boost/intrusive_ptr.hpp>
-#include "include/ceph_assert.h"
-#include "rgw_coroutine.h"
-#include "rgw_sal.h"
-#include "rgw_sal_rados.h"
-#include "common/WorkQueue.h"
-#include "common/Throttle.h"
-
-#include <atomic>
-#include "common/ceph_time.h"
-
-#include "services/svc_sys_obj.h"
-#include "services/svc_bucket.h"
-
-struct rgw_http_param_pair;
-class RGWRESTConn;
-
-class RGWAsyncRadosRequest : public RefCountedObject {
-  RGWCoroutine *caller;
-  RGWAioCompletionNotifier *notifier;
-
-  int retcode;
-
-  ceph::mutex lock = ceph::make_mutex("RGWAsyncRadosRequest::lock");
-
-protected:
-  virtual int _send_request(const DoutPrefixProvider *dpp) = 0;
-public:
-  RGWAsyncRadosRequest(RGWCoroutine *_caller, RGWAioCompletionNotifier *_cn)
-    : caller(_caller), notifier(_cn), retcode(0) {
-  }
-  ~RGWAsyncRadosRequest() override {
-    if (notifier) {
-      notifier->put();
-    }
-  }
-
-  void send_request(const DoutPrefixProvider *dpp) {
-    get();
-    retcode = _send_request(dpp);
-    {
-      std::lock_guard l{lock};
-      if (notifier) {
-        notifier->cb(); // drops its own ref
-        notifier = nullptr;
-      }
-    }
-    put();
-  }
-
-  int get_ret_status() { return retcode; }
-
-  void finish() {
-    {
-      std::lock_guard l{lock};
-      if (notifier) {
-        // we won't call notifier->cb() to drop its ref, so drop it here
-        notifier->put();
-        notifier = nullptr;
-      }
-    }
-    put();
-  }
-};
-
-
-class RGWAsyncRadosProcessor {
-  std::deque<RGWAsyncRadosRequest *> m_req_queue;
-  std::atomic<bool> going_down = { false };
-protected:
-  CephContext *cct;
-  ThreadPool m_tp;
-  Throttle req_throttle;
-
-  struct RGWWQ : public DoutPrefixProvider, public ThreadPool::WorkQueue<RGWAsyncRadosRequest> {
-    RGWAsyncRadosProcessor *processor;
-    RGWWQ(RGWAsyncRadosProcessor *p,
-         ceph::timespan timeout, ceph::timespan suicide_timeout,
-         ThreadPool *tp)
-      : ThreadPool::WorkQueue<RGWAsyncRadosRequest>("RGWWQ", timeout, suicide_timeout, tp), processor(p) {}
-
-    bool _enqueue(RGWAsyncRadosRequest *req) override;
-    void _dequeue(RGWAsyncRadosRequest *req) override {
-      ceph_abort();
-    }
-    bool _empty() override;
-    RGWAsyncRadosRequest *_dequeue() override;
-    using ThreadPool::WorkQueue<RGWAsyncRadosRequest>::_process;
-    void _process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) override;
-    void _dump_queue();
-    void _clear() override {
-      ceph_assert(processor->m_req_queue.empty());
-    }
-
-  CephContext *get_cct() const { return processor->cct; }
-  unsigned get_subsys() const { return ceph_subsys_rgw; }
-  std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw async rados processor: ";}
-
-  } req_wq;
-
-public:
-  RGWAsyncRadosProcessor(CephContext *_cct, int num_threads);
-  ~RGWAsyncRadosProcessor() {}
-  void start();
-  void stop();
-  void handle_request(const DoutPrefixProvider *dpp, RGWAsyncRadosRequest *req);
-  void queue(RGWAsyncRadosRequest *req);
-
-  bool is_going_down() {
-    return going_down;
-  }
-
-};
-
-template <class P>
-class RGWSimpleWriteOnlyAsyncCR : public RGWSimpleCoroutine {
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-
-  P params;
-  const DoutPrefixProvider *dpp;
-
-  class Request : public RGWAsyncRadosRequest {
-    rgw::sal::RadosStore* store;
-    P params;
-    const DoutPrefixProvider *dpp;
-  protected:
-    int _send_request(const DoutPrefixProvider *dpp) override;
-  public:
-    Request(RGWCoroutine *caller,
-            RGWAioCompletionNotifier *cn,
-            rgw::sal::RadosStore* store,
-            const P& _params,
-            const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn),
-                                store(store),
-                                params(_params),
-                                dpp(dpp) {}
-  } *req{nullptr};
-
- public:
-  RGWSimpleWriteOnlyAsyncCR(RGWAsyncRadosProcessor *_async_rados,
-                           rgw::sal::RadosStore* _store,
-                           const P& _params,
-                            const DoutPrefixProvider *_dpp) : RGWSimpleCoroutine(_store->ctx()),
-                                                async_rados(_async_rados),
-                                                store(_store),
-                                               params(_params),
-                                                dpp(_dpp) {}
-
-  ~RGWSimpleWriteOnlyAsyncCR() override {
-    request_cleanup();
-  }
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new Request(this,
-                      stack->create_completion_notifier(),
-                      store,
-                      params,
-                      dpp);
-
-    async_rados->queue(req);
-    return 0;
-  }
-  int request_complete() override {
-    return req->get_ret_status();
-  }
-};
-
-
-template <class P, class R>
-class RGWSimpleAsyncCR : public RGWSimpleCoroutine {
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-
-  P params;
-  std::shared_ptr<R> result;
-  const DoutPrefixProvider *dpp;
-
-  class Request : public RGWAsyncRadosRequest {
-    rgw::sal::RadosStore* store;
-    P params;
-    std::shared_ptr<R> result;
-    const DoutPrefixProvider *dpp;
-  protected:
-    int _send_request(const DoutPrefixProvider *dpp) override;
-  public:
-    Request(const DoutPrefixProvider *dpp,
-            RGWCoroutine *caller,
-            RGWAioCompletionNotifier *cn,
-            rgw::sal::RadosStore* _store,
-            const P& _params,
-            std::shared_ptr<R>& _result,
-            const DoutPrefixProvider *_dpp) : RGWAsyncRadosRequest(caller, cn),
-                                           store(_store),
-                                           params(_params),
-                                           result(_result),
-                                           dpp(_dpp) {}
-  } *req{nullptr};
-
- public:
-  RGWSimpleAsyncCR(RGWAsyncRadosProcessor *_async_rados,
-                   rgw::sal::RadosStore* _store,
-                   const P& _params,
-                   std::shared_ptr<R>& _result,
-                   const DoutPrefixProvider *_dpp) : RGWSimpleCoroutine(_store->ctx()),
-                                                  async_rados(_async_rados),
-                                                  store(_store),
-                                                  params(_params),
-                                                  result(_result),
-                                                  dpp(_dpp) {}
-
-  ~RGWSimpleAsyncCR() override {
-    request_cleanup();
-  }
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new Request(dpp,
-                      this,
-                      stack->create_completion_notifier(),
-                      store,
-                      params,
-                      result,
-                      dpp);
-
-    async_rados->queue(req);
-    return 0;
-  }
-  int request_complete() override {
-    return req->get_ret_status();
-  }
-};
-
-class RGWGenericAsyncCR : public RGWSimpleCoroutine {
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-
-
-public:
-  class Action {
-  public:
-    virtual ~Action() {}
-    virtual int operate() = 0;
-  };
-
-private:
-  std::shared_ptr<Action> action;
-
-  class Request : public RGWAsyncRadosRequest {
-    std::shared_ptr<Action> action;
-  protected:
-    int _send_request(const DoutPrefixProvider *dpp) override {
-      if (!action) {
-       return 0;
-      }
-      return action->operate();
-    }
-  public:
-    Request(const DoutPrefixProvider *dpp,
-            RGWCoroutine *caller,
-            RGWAioCompletionNotifier *cn,
-            std::shared_ptr<Action>& _action) : RGWAsyncRadosRequest(caller, cn),
-                                           action(_action) {}
-  } *req{nullptr};
-
- public:
-  RGWGenericAsyncCR(CephContext *_cct,
-                   RGWAsyncRadosProcessor *_async_rados,
-                   std::shared_ptr<Action>& _action) : RGWSimpleCoroutine(_cct),
-                                                  async_rados(_async_rados),
-                                                  action(_action) {}
-  template<typename T>
-  RGWGenericAsyncCR(CephContext *_cct,
-                   RGWAsyncRadosProcessor *_async_rados,
-                   std::shared_ptr<T>& _action) : RGWSimpleCoroutine(_cct),
-                                                  async_rados(_async_rados),
-                                                  action(std::static_pointer_cast<Action>(_action)) {}
-
-  ~RGWGenericAsyncCR() override {
-    request_cleanup();
-  }
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new Request(dpp, this,
-                      stack->create_completion_notifier(),
-                      action);
-
-    async_rados->queue(req);
-    return 0;
-  }
-  int request_complete() override {
-    return req->get_ret_status();
-  }
-};
-
-
-class RGWAsyncGetSystemObj : public RGWAsyncRadosRequest {
-  const DoutPrefixProvider *dpp;
-  RGWSI_SysObj* svc_sysobj;
-  rgw_raw_obj obj;
-  const bool want_attrs;
-  const bool raw_attrs;
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncGetSystemObj(const DoutPrefixProvider *dpp, 
-                       RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
-                       RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
-                       bool want_attrs, bool raw_attrs);
-
-  bufferlist bl;
-  std::map<std::string, bufferlist> attrs;
-  RGWObjVersionTracker objv_tracker;
-};
-
-class RGWAsyncPutSystemObj : public RGWAsyncRadosRequest {
-  const DoutPrefixProvider *dpp;
-  RGWSI_SysObj *svc;
-  rgw_raw_obj obj;
-  bool exclusive;
-  bufferlist bl;
-
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncPutSystemObj(const DoutPrefixProvider *dpp, RGWCoroutine *caller, 
-                       RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
-                       RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
-                       bool _exclusive, bufferlist _bl);
-
-  RGWObjVersionTracker objv_tracker;
-};
-
-class RGWAsyncPutSystemObjAttrs : public RGWAsyncRadosRequest {
-  const DoutPrefixProvider *dpp;
-  RGWSI_SysObj *svc;
-  rgw_raw_obj obj;
-  std::map<std::string, bufferlist> attrs;
-  bool exclusive;
-
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncPutSystemObjAttrs(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
-                           RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
-                           std::map<std::string, bufferlist> _attrs, bool exclusive);
-
-  RGWObjVersionTracker objv_tracker;
-};
-
-class RGWAsyncLockSystemObj : public RGWAsyncRadosRequest {
-  rgw::sal::RadosStore* store;
-  rgw_raw_obj obj;
-  std::string lock_name;
-  std::string cookie;
-  uint32_t duration_secs;
-
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
-                        RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
-                       const std::string& _name, const std::string& _cookie, uint32_t _duration_secs);
-};
-
-class RGWAsyncUnlockSystemObj : public RGWAsyncRadosRequest {
-  rgw::sal::RadosStore* store;
-  rgw_raw_obj obj;
-  std::string lock_name;
-  std::string cookie;
-
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
-                        RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
-                       const std::string& _name, const std::string& _cookie);
-};
-
-template <class T>
-class RGWSimpleRadosReadCR : public RGWSimpleCoroutine {
-  const DoutPrefixProvider *dpp;
-  RGWAsyncRadosProcessor *async_rados;
-  RGWSI_SysObj *svc;
-
-  rgw_raw_obj obj;
-  T *result;
-  /// on ENOENT, call handle_data() with an empty object instead of failing
-  const bool empty_on_enoent;
-  RGWObjVersionTracker *objv_tracker;
-  RGWAsyncGetSystemObj *req{nullptr};
-
-public:
-  RGWSimpleRadosReadCR(const DoutPrefixProvider *_dpp, 
-                      RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc,
-                     const rgw_raw_obj& _obj,
-                     T *_result, bool empty_on_enoent = true,
-                     RGWObjVersionTracker *objv_tracker = nullptr)
-    : RGWSimpleCoroutine(_svc->ctx()), dpp(_dpp), async_rados(_async_rados), svc(_svc),
-      obj(_obj), result(_result),
-      empty_on_enoent(empty_on_enoent), objv_tracker(objv_tracker) {}
-  ~RGWSimpleRadosReadCR() override {
-    request_cleanup();
-  }
-
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-
-  virtual int handle_data(T& data) {
-    return 0;
-  }
-};
-
-template <class T>
-int RGWSimpleRadosReadCR<T>::send_request(const DoutPrefixProvider *dpp)
-{
-  req = new RGWAsyncGetSystemObj(dpp, this, stack->create_completion_notifier(), svc,
-                                objv_tracker, obj, false, false);
-  async_rados->queue(req);
-  return 0;
-}
-
-template <class T>
-int RGWSimpleRadosReadCR<T>::request_complete()
-{
-  int ret = req->get_ret_status();
-  retcode = ret;
-  if (ret == -ENOENT && empty_on_enoent) {
-    *result = T();
-  } else {
-    if (ret < 0) {
-      return ret;
-    }
-    if (objv_tracker) { // copy the updated version
-      *objv_tracker = req->objv_tracker;
-    }
-    try {
-      auto iter = req->bl.cbegin();
-      if (iter.end()) {
-        // allow successful reads with empty buffers. ReadSyncStatus coroutines
-        // depend on this to be able to read without locking, because the
-        // cls lock from InitSyncStatus will create an empty object if it didn't
-        // exist
-        *result = T();
-      } else {
-        decode(*result, iter);
-      }
-    } catch (buffer::error& err) {
-      return -EIO;
-    }
-  }
-
-  return handle_data(*result);
-}
-
-class RGWSimpleRadosReadAttrsCR : public RGWSimpleCoroutine {
-  const DoutPrefixProvider *dpp;
-  RGWAsyncRadosProcessor *async_rados;
-  RGWSI_SysObj *svc;
-
-  rgw_raw_obj obj;
-  std::map<std::string, bufferlist> *pattrs;
-  bool raw_attrs;
-  RGWObjVersionTracker* objv_tracker;
-  RGWAsyncGetSystemObj *req = nullptr;
-
-public:
-  RGWSimpleRadosReadAttrsCR(const DoutPrefixProvider *_dpp, RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc,
-                            const rgw_raw_obj& _obj, std::map<std::string, bufferlist> *_pattrs,
-                            bool _raw_attrs, RGWObjVersionTracker* objv_tracker = nullptr)
-    : RGWSimpleCoroutine(_svc->ctx()),
-      dpp(_dpp),
-      async_rados(_async_rados), svc(_svc),
-      obj(_obj),
-      pattrs(_pattrs),
-      raw_attrs(_raw_attrs),
-      objv_tracker(objv_tracker)
-  {}
-  ~RGWSimpleRadosReadAttrsCR() override {
-    request_cleanup();
-  }
-                                                         
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-};
-
-template <class T>
-class RGWSimpleRadosWriteCR : public RGWSimpleCoroutine {
-  const DoutPrefixProvider *dpp;
-  RGWAsyncRadosProcessor *async_rados;
-  RGWSI_SysObj *svc;
-  bufferlist bl;
-  rgw_raw_obj obj;
-  RGWObjVersionTracker *objv_tracker;
-  bool exclusive;
-  RGWAsyncPutSystemObj *req{nullptr};
-
-public:
-  RGWSimpleRadosWriteCR(const DoutPrefixProvider *_dpp, 
-                       RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc,
-                       const rgw_raw_obj& _obj, const T& _data,
-                       RGWObjVersionTracker *objv_tracker = nullptr,
-                       bool exclusive = false)
-    : RGWSimpleCoroutine(_svc->ctx()), dpp(_dpp), async_rados(_async_rados),
-      svc(_svc), obj(_obj), objv_tracker(objv_tracker), exclusive(exclusive) {
-    encode(_data, bl);
-  }
-
-  ~RGWSimpleRadosWriteCR() override {
-    request_cleanup();
-  }
-
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new RGWAsyncPutSystemObj(dpp, this, stack->create_completion_notifier(),
-                                  svc, objv_tracker, obj, exclusive, std::move(bl));
-    async_rados->queue(req);
-    return 0;
-  }
-
-  int request_complete() override {
-    if (objv_tracker) { // copy the updated version
-      *objv_tracker = req->objv_tracker;
-    }
-    return req->get_ret_status();
-  }
-};
-
-class RGWSimpleRadosWriteAttrsCR : public RGWSimpleCoroutine {
-  const DoutPrefixProvider *dpp;
-  RGWAsyncRadosProcessor *async_rados;
-  RGWSI_SysObj *svc;
-  RGWObjVersionTracker *objv_tracker;
-
-  rgw_raw_obj obj;
-  std::map<std::string, bufferlist> attrs;
-  bool exclusive;
-  RGWAsyncPutSystemObjAttrs *req = nullptr;
-
-public:
-  RGWSimpleRadosWriteAttrsCR(const DoutPrefixProvider *_dpp,
-                             RGWAsyncRadosProcessor *_async_rados,
-                             RGWSI_SysObj *_svc, const rgw_raw_obj& _obj,
-                             std::map<std::string, bufferlist> _attrs,
-                             RGWObjVersionTracker *objv_tracker = nullptr,
-                             bool exclusive = false)
-                            : RGWSimpleCoroutine(_svc->ctx()), dpp(_dpp), async_rados(_async_rados),
-      svc(_svc), objv_tracker(objv_tracker), obj(_obj),
-      attrs(std::move(_attrs)), exclusive(exclusive) {
-  }
-  ~RGWSimpleRadosWriteAttrsCR() override {
-    request_cleanup();
-  }
-
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new RGWAsyncPutSystemObjAttrs(dpp, this, stack->create_completion_notifier(),
-                                  svc, objv_tracker, obj, std::move(attrs),
-                                   exclusive);
-    async_rados->queue(req);
-    return 0;
-  }
-
-  int request_complete() override {
-    if (objv_tracker) { // copy the updated version
-      *objv_tracker = req->objv_tracker;
-    }
-    return req->get_ret_status();
-  }
-};
-
-class RGWRadosSetOmapKeysCR : public RGWSimpleCoroutine {
-  rgw::sal::RadosStore* store;
-  std::map<std::string, bufferlist> entries;
-
-  rgw_rados_ref ref;
-
-  rgw_raw_obj obj;
-
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
-
-public:
-  RGWRadosSetOmapKeysCR(rgw::sal::RadosStore* _store,
-                     const rgw_raw_obj& _obj,
-                     std::map<std::string, bufferlist>& _entries);
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-};
-
-class RGWRadosGetOmapKeysCR : public RGWSimpleCoroutine {
- public:
-  struct Result {
-    rgw_rados_ref ref;
-    std::set<std::string> entries;
-    bool more = false;
-  };
-  using ResultPtr = std::shared_ptr<Result>;
-
-  RGWRadosGetOmapKeysCR(rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
-                        const std::string& _marker, int _max_entries,
-                        ResultPtr result);
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-
- private:
-  rgw::sal::RadosStore* store;
-  rgw_raw_obj obj;
-  std::string marker;
-  int max_entries;
-  ResultPtr result;
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
-};
-
-class RGWRadosGetOmapValsCR : public RGWSimpleCoroutine {
- public:
-  struct Result {
-    rgw_rados_ref ref;
-    std::map<std::string, bufferlist> entries;
-    bool more = false;
-  };
-  using ResultPtr = std::shared_ptr<Result>;
-
-  RGWRadosGetOmapValsCR(rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
-                        const std::string& _marker, int _max_entries,
-                        ResultPtr result);
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-
- private:
-  rgw::sal::RadosStore* store;
-  rgw_raw_obj obj;
-  std::string marker;
-  int max_entries;
-  ResultPtr result;
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
-};
-
-class RGWRadosRemoveOmapKeysCR : public RGWSimpleCoroutine {
-  rgw::sal::RadosStore* store;
-
-  rgw_rados_ref ref;
-
-  std::set<std::string> keys;
-
-  rgw_raw_obj obj;
-
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
-
-public:
-  RGWRadosRemoveOmapKeysCR(rgw::sal::RadosStore* _store,
-                     const rgw_raw_obj& _obj,
-                     const std::set<std::string>& _keys);
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-
-  int request_complete() override;
-};
-
-class RGWRadosRemoveCR : public RGWSimpleCoroutine {
-  rgw::sal::RadosStore* store;
-  librados::IoCtx ioctx;
-  const rgw_raw_obj obj;
-  RGWObjVersionTracker* objv_tracker;
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
-
-public:
-  RGWRadosRemoveCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
-                   RGWObjVersionTracker* objv_tracker = nullptr);
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-};
-
-class RGWRadosRemoveOidCR : public RGWSimpleCoroutine {
-  librados::IoCtx ioctx;
-  const std::string oid;
-  RGWObjVersionTracker* objv_tracker;
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
-
-public:
-  RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
-                     librados::IoCtx&& ioctx, std::string_view oid,
-                     RGWObjVersionTracker* objv_tracker = nullptr);
-
-  RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
-                     RGWSI_RADOS::Obj& obj,
-                     RGWObjVersionTracker* objv_tracker = nullptr);
-
-  RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
-                     RGWSI_RADOS::Obj&& obj,
-                     RGWObjVersionTracker* objv_tracker = nullptr);
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-};
-
-class RGWSimpleRadosLockCR : public RGWSimpleCoroutine {
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-  std::string lock_name;
-  std::string cookie;
-  uint32_t duration;
-
-  rgw_raw_obj obj;
-
-  RGWAsyncLockSystemObj *req;
-
-public:
-  RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
-                     const rgw_raw_obj& _obj,
-                      const std::string& _lock_name,
-                     const std::string& _cookie,
-                     uint32_t _duration);
-  ~RGWSimpleRadosLockCR() override {
-    request_cleanup();
-  }
-  void request_cleanup() override;
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-
-  static std::string gen_random_cookie(CephContext* cct) {
-#define COOKIE_LEN 16
-    char buf[COOKIE_LEN + 1];
-    gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
-    return buf;
-  }
-};
-
-class RGWSimpleRadosUnlockCR : public RGWSimpleCoroutine {
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-  std::string lock_name;
-  std::string cookie;
-
-  rgw_raw_obj obj;
-
-  RGWAsyncUnlockSystemObj *req;
-
-public:
-  RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
-                     const rgw_raw_obj& _obj, 
-                      const std::string& _lock_name,
-                     const std::string& _cookie);
-  ~RGWSimpleRadosUnlockCR() override {
-    request_cleanup();
-  }
-  void request_cleanup() override;
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-};
-
-#define OMAP_APPEND_MAX_ENTRIES_DEFAULT 100
-
-class RGWOmapAppend : public RGWConsumerCR<std::string> {
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-
-  rgw_raw_obj obj;
-
-  bool going_down;
-
-  int num_pending_entries;
-  std::list<std::string> pending_entries;
-
-  std::map<std::string, bufferlist> entries;
-
-  uint64_t window_size;
-  uint64_t total_entries;
-public:
-  RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
-                const rgw_raw_obj& _obj,
-                uint64_t _window_size = OMAP_APPEND_MAX_ENTRIES_DEFAULT);
-  int operate(const DoutPrefixProvider *dpp) override;
-  void flush_pending();
-  bool append(const std::string& s);
-  bool finish();
-
-  uint64_t get_total_entries() {
-    return total_entries;
-  }
-
-  const rgw_raw_obj& get_obj() {
-    return obj;
-  }
-};
-
-class RGWShardedOmapCRManager {
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-  RGWCoroutine *op;
-
-  int num_shards;
-
-  std::vector<RGWOmapAppend *> shards;
-public:
-  RGWShardedOmapCRManager(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, RGWCoroutine *_op, int _num_shards, const rgw_pool& pool, const std::string& oid_prefix)
-                      : async_rados(_async_rados),
-                       store(_store), op(_op), num_shards(_num_shards) {
-    shards.reserve(num_shards);
-    for (int i = 0; i < num_shards; ++i) {
-      char buf[oid_prefix.size() + 16];
-      snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), i);
-      RGWOmapAppend *shard = new RGWOmapAppend(async_rados, store, rgw_raw_obj(pool, buf));
-      shard->get();
-      shards.push_back(shard);
-      op->spawn(shard, false);
-    }
-  }
-
-  ~RGWShardedOmapCRManager() {
-    for (auto shard : shards) {
-      shard->put();
-    }
-  }
-
-  bool append(const std::string& entry, int shard_id) {
-    return shards[shard_id]->append(entry);
-  }
-  bool finish() {
-    bool success = true;
-    for (auto& append_op : shards) {
-      success &= (append_op->finish() && (!append_op->is_error()));
-    }
-    return success;
-  }
-
-  uint64_t get_total_entries(int shard_id) {
-    return shards[shard_id]->get_total_entries();
-  }
-};
-
-class RGWAsyncGetBucketInstanceInfo : public RGWAsyncRadosRequest {
-  rgw::sal::RadosStore* store;
-  rgw_bucket bucket;
-  const DoutPrefixProvider *dpp;
-
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncGetBucketInstanceInfo(RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
-                                rgw::sal::RadosStore* _store, const rgw_bucket& bucket,
-                                const DoutPrefixProvider *dpp)
-    : RGWAsyncRadosRequest(caller, cn), store(_store), bucket(bucket), dpp(dpp) {}
-
-  RGWBucketInfo bucket_info;
-  std::map<std::string, bufferlist> attrs;
-};
-
-class RGWAsyncPutBucketInstanceInfo : public RGWAsyncRadosRequest {
-  rgw::sal::RadosStore* store;
-  RGWBucketInfo& bucket_info;
-  bool exclusive;
-  real_time mtime;
-  std::map<std::string, ceph::bufferlist>* attrs;
-  const DoutPrefixProvider *dpp;
-
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncPutBucketInstanceInfo(RGWCoroutine* caller,
-                               RGWAioCompletionNotifier* cn,
-                                rgw::sal::RadosStore* store,
-                               RGWBucketInfo& bucket_info,
-                               bool exclusive,
-                               real_time mtime,
-                               std::map<std::string, ceph::bufferlist>* attrs,
-                                const DoutPrefixProvider* dpp)
-    : RGWAsyncRadosRequest(caller, cn), store(store), bucket_info(bucket_info),
-      exclusive(exclusive), mtime(mtime), attrs(attrs), dpp(dpp) {}
-};
-
-class RGWGetBucketInstanceInfoCR : public RGWSimpleCoroutine {
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-  rgw_bucket bucket;
-  RGWBucketInfo *bucket_info;
-  std::map<std::string, bufferlist> *pattrs;
-  const DoutPrefixProvider *dpp;
-
-  RGWAsyncGetBucketInstanceInfo *req{nullptr};
-
-public:
-  // rgw_bucket constructor
-  RGWGetBucketInstanceInfoCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
-                             const rgw_bucket& _bucket, RGWBucketInfo *_bucket_info,
-                             std::map<std::string, bufferlist> *_pattrs, const DoutPrefixProvider *dpp)
-    : RGWSimpleCoroutine(_store->ctx()), async_rados(_async_rados), store(_store),
-      bucket(_bucket), bucket_info(_bucket_info), pattrs(_pattrs), dpp(dpp) {}
-  ~RGWGetBucketInstanceInfoCR() override {
-    request_cleanup();
-  }
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new RGWAsyncGetBucketInstanceInfo(this, stack->create_completion_notifier(), store, bucket, dpp);
-    async_rados->queue(req);
-    return 0;
-  }
-  int request_complete() override {
-    if (bucket_info) {
-      *bucket_info = std::move(req->bucket_info);
-    }
-    if (pattrs) {
-      *pattrs = std::move(req->attrs);
-    }
-    return req->get_ret_status();
-  }
-};
-
-class RGWPutBucketInstanceInfoCR : public RGWSimpleCoroutine {
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-  RGWBucketInfo& bucket_info;
-  bool exclusive;
-  real_time mtime;
-  std::map<std::string, ceph::bufferlist>* attrs;
-  const DoutPrefixProvider *dpp;
-
-  RGWAsyncPutBucketInstanceInfo* req = nullptr;
-
-public:
-  // rgw_bucket constructor
-  RGWPutBucketInstanceInfoCR(RGWAsyncRadosProcessor *async_rados,
-                            rgw::sal::RadosStore* store,
-                            RGWBucketInfo& bucket_info,
-                            bool exclusive,
-                            real_time mtime,
-                            std::map<std::string, ceph::bufferlist>* attrs,
-                             const DoutPrefixProvider *dpp)
-    : RGWSimpleCoroutine(store->ctx()), async_rados(async_rados), store(store),
-      bucket_info(bucket_info), exclusive(exclusive),
-      mtime(mtime), attrs(attrs), dpp(dpp) {}
-  ~RGWPutBucketInstanceInfoCR() override {
-    request_cleanup();
-  }
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = nullptr;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new RGWAsyncPutBucketInstanceInfo(this,
-                                           stack->create_completion_notifier(),
-                                           store, bucket_info, exclusive,
-                                           mtime, attrs, dpp);
-    async_rados->queue(req);
-    return 0;
-  }
-  int request_complete() override {
-    return req->get_ret_status();
-  }
-};
-
-class RGWRadosBILogTrimCR : public RGWSimpleCoroutine {
-  const RGWBucketInfo& bucket_info;
-  int shard_id;
-  const rgw::bucket_index_layout_generation generation;
-  RGWRados::BucketShard bs;
-  std::string start_marker;
-  std::string end_marker;
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
- public:
-  RGWRadosBILogTrimCR(const DoutPrefixProvider *dpp,
-                      rgw::sal::RadosStore* store, const RGWBucketInfo& bucket_info,
-                      int shard_id,
-                     const rgw::bucket_index_layout_generation& generation,
-                     const std::string& start_marker,
-                      const std::string& end_marker);
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-};
-
-class RGWAsyncFetchRemoteObj : public RGWAsyncRadosRequest {
-  rgw::sal::RadosStore* store;
-  rgw_zone_id source_zone;
-
-  std::optional<rgw_user> user_id;
-
-  rgw_bucket src_bucket;
-  std::optional<rgw_placement_rule> dest_placement_rule;
-  RGWBucketInfo dest_bucket_info;
-
-  rgw_obj_key key;
-  std::optional<rgw_obj_key> dest_key;
-  std::optional<uint64_t> versioned_epoch;
-
-  real_time src_mtime;
-
-  bool copy_if_newer;
-  std::shared_ptr<RGWFetchObjFilter> filter;
-  rgw_zone_set zones_trace;
-  PerfCounters* counters;
-  const DoutPrefixProvider *dpp;
-
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncFetchRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
-                         const rgw_zone_id& _source_zone,
-                         std::optional<rgw_user>& _user_id,
-                         const rgw_bucket& _src_bucket,
-                        std::optional<rgw_placement_rule> _dest_placement_rule,
-                         const RGWBucketInfo& _dest_bucket_info,
-                         const rgw_obj_key& _key,
-                         const std::optional<rgw_obj_key>& _dest_key,
-                         std::optional<uint64_t> _versioned_epoch,
-                         bool _if_newer,
-                         std::shared_ptr<RGWFetchObjFilter> _filter,
-                         rgw_zone_set *_zones_trace,
-                         PerfCounters* counters, const DoutPrefixProvider *dpp)
-    : RGWAsyncRadosRequest(caller, cn), store(_store),
-      source_zone(_source_zone),
-      user_id(_user_id),
-      src_bucket(_src_bucket),
-      dest_placement_rule(_dest_placement_rule),
-      dest_bucket_info(_dest_bucket_info),
-      key(_key),
-      dest_key(_dest_key),
-      versioned_epoch(_versioned_epoch),
-      copy_if_newer(_if_newer),
-      filter(_filter),
-      counters(counters),
-      dpp(dpp)
-  {
-    if (_zones_trace) {
-      zones_trace = *_zones_trace;
-    }
-  }
-};
-
-class RGWFetchRemoteObjCR : public RGWSimpleCoroutine {
-  CephContext *cct;
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-  rgw_zone_id source_zone;
-
-  std::optional<rgw_user> user_id;
-
-  rgw_bucket src_bucket;
-  std::optional<rgw_placement_rule> dest_placement_rule;
-  RGWBucketInfo dest_bucket_info;
-
-  rgw_obj_key key;
-  std::optional<rgw_obj_key> dest_key;
-  std::optional<uint64_t> versioned_epoch;
-
-  real_time src_mtime;
-
-  bool copy_if_newer;
-
-  std::shared_ptr<RGWFetchObjFilter> filter;
-
-  RGWAsyncFetchRemoteObj *req;
-  rgw_zone_set *zones_trace;
-  PerfCounters* counters;
-  const DoutPrefixProvider *dpp;
-
-public:
-  RGWFetchRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
-                      const rgw_zone_id& _source_zone,
-                      std::optional<rgw_user> _user_id,
-                      const rgw_bucket& _src_bucket,
-                     std::optional<rgw_placement_rule> _dest_placement_rule,
-                      const RGWBucketInfo& _dest_bucket_info,
-                      const rgw_obj_key& _key,
-                      const std::optional<rgw_obj_key>& _dest_key,
-                      std::optional<uint64_t> _versioned_epoch,
-                      bool _if_newer,
-                      std::shared_ptr<RGWFetchObjFilter> _filter,
-                      rgw_zone_set *_zones_trace,
-                      PerfCounters* counters, const DoutPrefixProvider *dpp)
-    : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()),
-      async_rados(_async_rados), store(_store),
-      source_zone(_source_zone),
-      user_id(_user_id),
-      src_bucket(_src_bucket),
-      dest_placement_rule(_dest_placement_rule),
-      dest_bucket_info(_dest_bucket_info),
-      key(_key),
-      dest_key(_dest_key),
-      versioned_epoch(_versioned_epoch),
-      copy_if_newer(_if_newer),
-      filter(_filter),
-      req(NULL),
-      zones_trace(_zones_trace), counters(counters), dpp(dpp) {}
-
-
-  ~RGWFetchRemoteObjCR() override {
-    request_cleanup();
-  }
-
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new RGWAsyncFetchRemoteObj(this, stack->create_completion_notifier(), store,
-                                    source_zone, user_id, src_bucket, dest_placement_rule, dest_bucket_info,
-                                     key, dest_key, versioned_epoch, copy_if_newer, filter,
-                                     zones_trace, counters, dpp);
-    async_rados->queue(req);
-    return 0;
-  }
-
-  int request_complete() override {
-    return req->get_ret_status();
-  }
-};
-
-class RGWAsyncStatRemoteObj : public RGWAsyncRadosRequest {
-  rgw::sal::RadosStore* store;
-  rgw_zone_id source_zone;
-
-  rgw_bucket src_bucket;
-  rgw_obj_key key;
-
-  ceph::real_time *pmtime;
-  uint64_t *psize;
-  std::string *petag;
-  std::map<std::string, bufferlist> *pattrs;
-  std::map<std::string, std::string> *pheaders;
-
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncStatRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
-                         const rgw_zone_id& _source_zone,
-                         rgw_bucket& _src_bucket,
-                         const rgw_obj_key& _key,
-                         ceph::real_time *_pmtime,
-                         uint64_t *_psize,
-                         std::string *_petag,
-                         std::map<std::string, bufferlist> *_pattrs,
-                         std::map<std::string, std::string> *_pheaders) : RGWAsyncRadosRequest(caller, cn), store(_store),
-                                                      source_zone(_source_zone),
-                                                      src_bucket(_src_bucket),
-                                                      key(_key),
-                                                      pmtime(_pmtime),
-                                                      psize(_psize),
-                                                      petag(_petag),
-                                                      pattrs(_pattrs),
-                                                      pheaders(_pheaders) {}
-};
-
-class RGWStatRemoteObjCR : public RGWSimpleCoroutine {
-  CephContext *cct;
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-  rgw_zone_id source_zone;
-
-  rgw_bucket src_bucket;
-  rgw_obj_key key;
-
-  ceph::real_time *pmtime;
-  uint64_t *psize;
-  std::string *petag;
-  std::map<std::string, bufferlist> *pattrs;
-  std::map<std::string, std::string> *pheaders;
-
-  RGWAsyncStatRemoteObj *req;
-
-public:
-  RGWStatRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
-                      const rgw_zone_id& _source_zone,
-                      rgw_bucket& _src_bucket,
-                      const rgw_obj_key& _key,
-                      ceph::real_time *_pmtime,
-                      uint64_t *_psize,
-                      std::string *_petag,
-                      std::map<std::string, bufferlist> *_pattrs,
-                      std::map<std::string, std::string> *_pheaders) : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()),
-                                       async_rados(_async_rados), store(_store),
-                                       source_zone(_source_zone),
-                                       src_bucket(_src_bucket),
-                                       key(_key),
-                                       pmtime(_pmtime),
-                                       psize(_psize),
-                                       petag(_petag),
-                                       pattrs(_pattrs),
-                                       pheaders(_pheaders),
-                                       req(NULL) {}
-
-
-  ~RGWStatRemoteObjCR() override {
-    request_cleanup();
-  }
-
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new RGWAsyncStatRemoteObj(this, stack->create_completion_notifier(), store, source_zone,
-                                    src_bucket, key, pmtime, psize, petag, pattrs, pheaders);
-    async_rados->queue(req);
-    return 0;
-  }
-
-  int request_complete() override {
-    return req->get_ret_status();
-  }
-};
-
-class RGWAsyncRemoveObj : public RGWAsyncRadosRequest {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* store;
-  rgw_zone_id source_zone;
-
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  std::unique_ptr<rgw::sal::Object> obj;
-
-  std::string owner;
-  std::string owner_display_name;
-  bool versioned;
-  uint64_t versioned_epoch;
-  std::string marker_version_id;
-
-  bool del_if_older;
-  ceph::real_time timestamp;
-  rgw_zone_set zones_trace;
-
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncRemoveObj(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, 
-                         rgw::sal::RadosStore* _store,
-                         const rgw_zone_id& _source_zone,
-                         RGWBucketInfo& _bucket_info,
-                         const rgw_obj_key& _key,
-                         const std::string& _owner,
-                         const std::string& _owner_display_name,
-                         bool _versioned,
-                         uint64_t _versioned_epoch,
-                         bool _delete_marker,
-                         bool _if_older,
-                         real_time& _timestamp,
-                         rgw_zone_set* _zones_trace) : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), store(_store),
-                                                      source_zone(_source_zone),
-                                                      owner(_owner),
-                                                      owner_display_name(_owner_display_name),
-                                                      versioned(_versioned),
-                                                      versioned_epoch(_versioned_epoch),
-                                                      del_if_older(_if_older),
-                                                      timestamp(_timestamp) {
-    if (_delete_marker) {
-      marker_version_id = _key.instance;
-    }
-
-    if (_zones_trace) {
-      zones_trace = *_zones_trace;
-    }
-    store->get_bucket(nullptr, _bucket_info, &bucket);
-    obj = bucket->get_object(_key);
-  }
-};
-
-class RGWRemoveObjCR : public RGWSimpleCoroutine {
-  const DoutPrefixProvider *dpp;
-  CephContext *cct;
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-  rgw_zone_id source_zone;
-
-  RGWBucketInfo bucket_info;
-
-  rgw_obj_key key;
-  bool versioned;
-  uint64_t versioned_epoch;
-  bool delete_marker;
-  std::string owner;
-  std::string owner_display_name;
-
-  bool del_if_older;
-  real_time timestamp;
-
-  RGWAsyncRemoveObj *req;
-  
-  rgw_zone_set *zones_trace;
-
-public:
-  RGWRemoveObjCR(const DoutPrefixProvider *_dpp, RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
-                      const rgw_zone_id& _source_zone,
-                      RGWBucketInfo& _bucket_info,
-                      const rgw_obj_key& _key,
-                      bool _versioned,
-                      uint64_t _versioned_epoch,
-                      std::string *_owner,
-                      std::string *_owner_display_name,
-                      bool _delete_marker,
-                      real_time *_timestamp,
-                      rgw_zone_set *_zones_trace) : RGWSimpleCoroutine(_store->ctx()), dpp(_dpp), cct(_store->ctx()),
-                                       async_rados(_async_rados), store(_store),
-                                       source_zone(_source_zone),
-                                       bucket_info(_bucket_info),
-                                       key(_key),
-                                       versioned(_versioned),
-                                       versioned_epoch(_versioned_epoch),
-                                       delete_marker(_delete_marker), req(NULL), zones_trace(_zones_trace) {
-    del_if_older = (_timestamp != NULL);
-    if (_timestamp) {
-      timestamp = *_timestamp;
-    }
-
-    if (_owner) {
-      owner = *_owner;
-    }
-
-    if (_owner_display_name) {
-      owner_display_name = *_owner_display_name;
-    }
-  }
-  ~RGWRemoveObjCR() override {
-    request_cleanup();
-  }
-
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = NULL;
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new RGWAsyncRemoveObj(dpp, this, stack->create_completion_notifier(), store, source_zone, bucket_info,
-                                key, owner, owner_display_name, versioned, versioned_epoch,
-                                delete_marker, del_if_older, timestamp, zones_trace);
-    async_rados->queue(req);
-    return 0;
-  }
-
-  int request_complete() override {
-    return req->get_ret_status();
-  }
-};
-
-class RGWContinuousLeaseCR : public RGWCoroutine {
-  RGWAsyncRadosProcessor *async_rados;
-  rgw::sal::RadosStore* store;
-
-  const rgw_raw_obj obj;
-
-  const std::string lock_name;
-  const std::string cookie;
-
-  int interval;
-  bool going_down{ false };
-  bool locked{false};
-  
-  const ceph::timespan interval_tolerance;
-  const ceph::timespan ts_interval;
-
-  RGWCoroutine *caller;
-
-  bool aborted{false};
-  
-  ceph::coarse_mono_time last_renew_try_time;
-  ceph::coarse_mono_time current_time;
-
-public:
-  RGWContinuousLeaseCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
-                       const rgw_raw_obj& _obj,
-                       const std::string& _lock_name, int _interval, RGWCoroutine *_caller)
-    : RGWCoroutine(_store->ctx()), async_rados(_async_rados), store(_store),
-    obj(_obj), lock_name(_lock_name),
-    cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
-    interval(_interval), interval_tolerance(ceph::make_timespan(9*interval/10)), ts_interval(ceph::make_timespan(interval)),
-      caller(_caller)
-  {}
-
-  virtual ~RGWContinuousLeaseCR() override;
-
-  int operate(const DoutPrefixProvider *dpp) override;
-
-  bool is_locked() const {
-    if (ceph::coarse_mono_clock::now() - last_renew_try_time > ts_interval) {
-      return false;
-    }
-    return locked;
-  }
-
-  void set_locked(bool status) {
-    locked = status;
-  }
-
-  void go_down() {
-    going_down = true;
-    wakeup();
-  }
-
-  void abort() {
-    aborted = true;
-  }
-};
-
-class RGWRadosTimelogAddCR : public RGWSimpleCoroutine {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* store;
-  std::list<cls_log_entry> entries;
-
-  std::string oid;
-
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
-
-public:
-  RGWRadosTimelogAddCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* _store, const std::string& _oid,
-                       const cls_log_entry& entry);
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-};
-
-class RGWRadosTimelogTrimCR : public RGWSimpleCoroutine {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* store;
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
- protected:
-  std::string oid;
-  real_time start_time;
-  real_time end_time;
-  std::string from_marker;
-  std::string to_marker;
-
- public:
-  RGWRadosTimelogTrimCR(const DoutPrefixProvider *dpp, 
-                        rgw::sal::RadosStore* store, const std::string& oid,
-                        const real_time& start_time, const real_time& end_time,
-                        const std::string& from_marker,
-                        const std::string& to_marker);
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-};
-
-// wrapper to update last_trim_marker on success
-class RGWSyncLogTrimCR : public RGWRadosTimelogTrimCR {
-  CephContext *cct;
-  std::string *last_trim_marker;
- public:
-  static constexpr const char* max_marker = "99999999";
-
-  RGWSyncLogTrimCR(const DoutPrefixProvider *dpp,
-                   rgw::sal::RadosStore* store, const std::string& oid,
-                   const std::string& to_marker, std::string *last_trim_marker);
-  int request_complete() override;
-};
-
-class RGWAsyncStatObj : public RGWAsyncRadosRequest {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* store;
-  RGWBucketInfo bucket_info;
-  rgw_obj obj;
-  uint64_t *psize;
-  real_time *pmtime;
-  uint64_t *pepoch;
-  RGWObjVersionTracker *objv_tracker;
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override;
-public:
-  RGWAsyncStatObj(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* store,
-                  const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr,
-                  real_time *pmtime = nullptr, uint64_t *pepoch = nullptr,
-                  RGWObjVersionTracker *objv_tracker = nullptr)
-         : RGWAsyncRadosRequest(caller, cn), dpp(dpp), store(store), obj(obj), psize(psize),
-         pmtime(pmtime), pepoch(pepoch), objv_tracker(objv_tracker) {}
-};
-
-class RGWStatObjCR : public RGWSimpleCoroutine {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* store;
-  RGWAsyncRadosProcessor *async_rados;
-  RGWBucketInfo bucket_info;
-  rgw_obj obj;
-  uint64_t *psize;
-  real_time *pmtime;
-  uint64_t *pepoch;
-  RGWObjVersionTracker *objv_tracker;
-  RGWAsyncStatObj *req = nullptr;
- public:
-  RGWStatObjCR(const DoutPrefixProvider *dpp, RGWAsyncRadosProcessor *async_rados, rgw::sal::RadosStore* store,
-         const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr,
-         real_time* pmtime = nullptr, uint64_t *pepoch = nullptr,
-         RGWObjVersionTracker *objv_tracker = nullptr);
-  ~RGWStatObjCR() override {
-    request_cleanup();
-  }
-  void request_cleanup() override;
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-};
-
-/// coroutine wrapper for IoCtx::aio_notify()
-class RGWRadosNotifyCR : public RGWSimpleCoroutine {
-  rgw::sal::RadosStore* const store;
-  const rgw_raw_obj obj;
-  bufferlist request;
-  const uint64_t timeout_ms;
-  bufferlist *response;
-  rgw_rados_ref ref;
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
-
-public:
-  RGWRadosNotifyCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
-                   bufferlist& request, uint64_t timeout_ms,
-                   bufferlist *response);
-
-  int send_request(const DoutPrefixProvider *dpp) override;
-  int request_complete() override;
-};
-
-class RGWDataPostNotifyCR : public RGWCoroutine {
-  RGWRados *store;
-  RGWHTTPManager& http_manager;
-  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& shards;
-  const char *source_zone;
-  RGWRESTConn *conn;
-
-public:
-  RGWDataPostNotifyCR(RGWRados *_store, RGWHTTPManager& _http_manager, bc::flat_map<int,
-                    bc::flat_set<rgw_data_notify_entry> >& _shards, const char *_zone, RGWRESTConn *_conn)
-                    : RGWCoroutine(_store->ctx()), store(_store), http_manager(_http_manager),
-                      shards(_shards), source_zone(_zone), conn(_conn) {}
-
-  int operate(const DoutPrefixProvider* dpp) override;
-};
-
-#endif
diff --git a/src/rgw/store/rados/rgw_cr_tools.cc b/src/rgw/store/rados/rgw_cr_tools.cc
deleted file mode 100644 (file)
index 94665a3..0000000
+++ /dev/null
@@ -1,292 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "common/errno.h"
-
-#include "rgw_cr_tools.h"
-#include "rgw_bucket.h"
-#include "rgw_user.h"
-#include "rgw_op.h"
-#include "rgw_acl_s3.h"
-#include "rgw_zone.h"
-
-#include "services/svc_zone.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-template<>
-int RGWUserCreateCR::Request::_send_request(const DoutPrefixProvider *dpp)
-{
-  CephContext *cct = store->ctx();
-
-  const int32_t default_max_buckets =
-    cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
-
-  RGWUserAdminOpState op_state(store);
-
-  auto& user = params.user;
-
-  op_state.set_user_id(user);
-  op_state.set_display_name(params.display_name);
-  op_state.set_user_email(params.email);
-  op_state.set_caps(params.caps);
-  op_state.set_access_key(params.access_key);
-  op_state.set_secret_key(params.secret_key);
-
-  if (!params.key_type.empty()) {
-    int32_t key_type = KEY_TYPE_S3;
-    if (params.key_type == "swift") {
-      key_type = KEY_TYPE_SWIFT;
-    }
-
-    op_state.set_key_type(key_type);
-  }
-
-  op_state.set_max_buckets(params.max_buckets.value_or(default_max_buckets));
-  op_state.set_suspension(params.suspended);
-  op_state.set_system(params.system);
-  op_state.set_exclusive(params.exclusive);
-
-  if (params.generate_key) {
-    op_state.set_generate_key();
-  }
-
-
-  if (params.apply_quota) {
-    RGWQuota quota;
-
-    if (cct->_conf->rgw_bucket_default_quota_max_objects >= 0) {
-      quota.bucket_quota.max_objects = cct->_conf->rgw_bucket_default_quota_max_objects;
-      quota.bucket_quota.enabled = true;
-    }
-
-    if (cct->_conf->rgw_bucket_default_quota_max_size >= 0) {
-      quota.bucket_quota.max_size = cct->_conf->rgw_bucket_default_quota_max_size;
-      quota.bucket_quota.enabled = true;
-    }
-
-    if (cct->_conf->rgw_user_default_quota_max_objects >= 0) {
-      quota.user_quota.max_objects = cct->_conf->rgw_user_default_quota_max_objects;
-      quota.user_quota.enabled = true;
-    }
-
-    if (cct->_conf->rgw_user_default_quota_max_size >= 0) {
-      quota.user_quota.max_size = cct->_conf->rgw_user_default_quota_max_size;
-      quota.user_quota.enabled = true;
-    }
-
-    if (quota.bucket_quota.enabled) {
-      op_state.set_bucket_quota(quota.bucket_quota);
-    }
-
-    if (quota.user_quota.enabled) {
-      op_state.set_user_quota(quota.user_quota);
-    }
-  }
-
-  RGWNullFlusher flusher;
-  return RGWUserAdminOp_User::create(dpp, store, op_state, flusher, null_yield);
-}
-
-template<>
-int RGWGetUserInfoCR::Request::_send_request(const DoutPrefixProvider *dpp)
-{
-  return store->ctl()->user->get_info_by_uid(dpp, params.user, result.get(), null_yield);
-}
-
-template<>
-int RGWGetBucketInfoCR::Request::_send_request(const DoutPrefixProvider *dpp)
-{
-  return store->get_bucket(dpp, nullptr, params.tenant, params.bucket_name, &result->bucket, null_yield);
-}
-
-template<>
-int RGWBucketCreateLocalCR::Request::_send_request(const DoutPrefixProvider *dpp)
-{
-  CephContext *cct = store->ctx();
-  auto& zone_svc = store->svc()->zone;
-
-  const auto& user_info = params.user_info.get();
-  const auto& user = user_info->user_id;
-  const auto& bucket_name = params.bucket_name;
-  auto& placement_rule = params.placement_rule;
-
-  if (!placement_rule.empty() &&
-      !zone_svc->get_zone_params().valid_placement(placement_rule)) {
-    ldpp_dout(dpp, 0) << "placement target (" << placement_rule << ")"
-      << " doesn't exist in the placement targets of zonegroup"
-      << " (" << zone_svc->get_zonegroup().api_name << ")" << dendl;
-    return -ERR_INVALID_LOCATION_CONSTRAINT;
-  }
-
-  /* we need to make sure we read bucket info, it's not read before for this
-   * specific request */
-  RGWBucketInfo bucket_info;
-  map<string, bufferlist> bucket_attrs;
-
-  int ret = store->getRados()->get_bucket_info(store->svc(), user.tenant, bucket_name,
-                                 bucket_info, nullptr, null_yield, dpp, &bucket_attrs);
-  if (ret < 0 && ret != -ENOENT)
-    return ret;
-  bool bucket_exists = (ret != -ENOENT);
-
-  RGWAccessControlPolicy old_policy(cct);
-  ACLOwner bucket_owner;
-  bucket_owner.set_id(user);
-  bucket_owner.set_name(user_info->display_name);
-  if (bucket_exists) {
-    ret = rgw_op_get_bucket_policy_from_attr(dpp, cct, store, bucket_info,
-                                             bucket_attrs, &old_policy, null_yield);
-    if (ret >= 0)  {
-      if (old_policy.get_owner().get_id().compare(user) != 0) {
-        return -EEXIST;
-      }
-    }
-  }
-
-  RGWBucketInfo master_info;
-  rgw_bucket *pmaster_bucket = nullptr;
-  uint32_t *pmaster_num_shards = nullptr;
-  real_time creation_time;
-
-  string zonegroup_id = zone_svc->get_zonegroup().get_id();
-
-  if (bucket_exists) {
-    rgw_placement_rule selected_placement_rule;
-    rgw_bucket bucket;
-    bucket.tenant = user.tenant;
-    bucket.name = bucket_name;
-    ret = zone_svc->select_bucket_placement(dpp, *user_info, zonegroup_id,
-                                           placement_rule,
-                                           &selected_placement_rule, nullptr, null_yield);
-    if (selected_placement_rule != bucket_info.placement_rule) {
-      ldpp_dout(dpp, 0) << "bucket already exists on a different placement rule: "
-        << " selected_rule= " << selected_placement_rule
-        << " existing_rule= " << bucket_info.placement_rule << dendl;
-      return -EEXIST;
-    }
-  }
-
-  /* Encode special metadata first as we're using std::map::emplace under
-   * the hood. This method will add the new items only if the map doesn't
-   * contain such keys yet. */
-  RGWAccessControlPolicy_S3 policy(cct);
-  policy.create_canned(bucket_owner, bucket_owner, string()); /* default private policy */
-  bufferlist aclbl;
-  policy.encode(aclbl);
-  map<string, buffer::list> attrs;
-  attrs.emplace(std::move(RGW_ATTR_ACL), std::move(aclbl));
-
-  RGWQuotaInfo quota_info;
-  const RGWQuotaInfo * pquota_info = nullptr;
-
-  rgw_bucket bucket;
-  bucket.tenant = user.tenant;
-  bucket.name = bucket_name;
-
-  RGWBucketInfo info;
-  obj_version ep_objv;
-
-  ret = store->getRados()->create_bucket(*user_info, bucket, zonegroup_id,
-                                placement_rule, bucket_info.swift_ver_location,
-                                pquota_info, attrs,
-                                info, nullptr, &ep_objv, creation_time,
-                               pmaster_bucket, pmaster_num_shards, null_yield, dpp, true);
-
-
-  if (ret && ret != -EEXIST)
-    return ret;
-
-  bool existed = (ret == -EEXIST);
-
-  if (existed) {
-    if (info.owner != user) {
-      ldpp_dout(dpp, 20) << "NOTICE: bucket already exists under a different user (bucket=" << bucket << " user=" << user << " bucket_owner=" << info.owner << dendl;
-      return -EEXIST;
-    }
-    bucket = info.bucket;
-  }
-
-  ret = store->ctl()->bucket->link_bucket(user, bucket, info.creation_time, null_yield, dpp, false);
-  if (ret && !existed && ret != -EEXIST) {
-    /* if it exists (or previously existed), don't remove it! */
-    int r = store->ctl()->bucket->unlink_bucket(user, bucket, null_yield, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "WARNING: failed to unlink bucket: ret=" << r << dendl;
-    }
-  } else if (ret == -EEXIST || (ret == 0 && existed)) {
-    ret = -ERR_BUCKET_EXISTS;
-  }
-
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: bucket creation (bucket=" << bucket << ") return ret=" << ret << dendl;
-  }
-
-  return ret;
-}
-
-template<>
-int RGWObjectSimplePutCR::Request::_send_request(const DoutPrefixProvider *dpp)
-{
-  RGWDataAccess::ObjectRef obj;
-
-  CephContext *cct = store->ctx();
-
-  int ret = params.bucket->get_object(params.key, &obj);
-  if (ret < 0) {
-    lderr(cct) << "ERROR: failed to get object: " << cpp_strerror(-ret) << dendl;
-    return -ret;
-  }
-
-  if (params.user_data) {
-    obj->set_user_data(*params.user_data);
-  }
-
-  ret = obj->put(params.data, params.attrs, dpp, null_yield);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: put object returned error: " << cpp_strerror(-ret) << dendl;
-  }
-
-  return 0;
-}
-
-template<>
-int RGWBucketLifecycleConfigCR::Request::_send_request(const DoutPrefixProvider *dpp)
-{
-  CephContext *cct = store->ctx();
-
-  RGWLC *lc = store->getRados()->get_lc();
-  if (!lc) {
-    lderr(cct) << "ERROR: lifecycle object is not initialized!" << dendl;
-    return -EIO;
-  }
-
-  int ret = lc->set_bucket_config(params.bucket,
-                                  params.bucket_attrs,
-                                  &params.config);
-  if (ret < 0) {
-    lderr(cct) << "ERROR: failed to set lifecycle on bucke: " << cpp_strerror(-ret) << dendl;
-    return -ret;
-  }
-
-  return 0;
-}
-
-template<>
-int RGWBucketGetSyncPolicyHandlerCR::Request::_send_request(const DoutPrefixProvider *dpp)
-{
-  int r = store->ctl()->bucket->get_sync_policy_handler(params.zone,
-                                                        params.bucket,
-                                                        &result->policy_handler,
-                                                        null_yield,
-                                                        dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: " << __func__ << "(): get_sync_policy_handler() returned " << r << dendl;
-    return  r;
-  }
-
-  return 0;
-}
diff --git a/src/rgw/store/rados/rgw_cr_tools.h b/src/rgw/store/rados/rgw_cr_tools.h
deleted file mode 100644 (file)
index ebdbfeb..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_CR_TOOLS_H
-#define CEPH_RGW_CR_TOOLS_H
-
-#include "rgw_cr_rados.h"
-#include "rgw_tools.h"
-#include "rgw_lc.h"
-
-#include "services/svc_bucket_sync.h"
-
-struct rgw_user_create_params {
-  rgw_user user;
-  std::string display_name;
-  std::string email;
-  std::string access_key;
-  std::string secret_key;
-  std::string key_type; /* "swift" or "s3" */
-  std::string caps;
-
-  bool generate_key{true};
-  bool suspended{false};
-  std::optional<int32_t> max_buckets;
-  bool system{false};
-  bool exclusive{false};
-  bool apply_quota{true};
-};
-
-using RGWUserCreateCR = RGWSimpleWriteOnlyAsyncCR<rgw_user_create_params>;
-
-struct rgw_get_user_info_params {
-  rgw_user user;
-};
-
-using RGWGetUserInfoCR = RGWSimpleAsyncCR<rgw_get_user_info_params, RGWUserInfo>;
-
-struct rgw_get_bucket_info_params {
-  std::string tenant;
-  std::string bucket_name;
-};
-
-struct rgw_get_bucket_info_result {
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-};
-
-using RGWGetBucketInfoCR = RGWSimpleAsyncCR<rgw_get_bucket_info_params, rgw_get_bucket_info_result>;
-
-struct rgw_bucket_create_local_params {
-  std::shared_ptr<RGWUserInfo> user_info;
-  std::string bucket_name;
-  rgw_placement_rule placement_rule;
-};
-
-using RGWBucketCreateLocalCR = RGWSimpleWriteOnlyAsyncCR<rgw_bucket_create_local_params>;
-
-struct rgw_object_simple_put_params {
-  RGWDataAccess::BucketRef bucket;
-  rgw_obj_key key;
-  bufferlist data;
-  std::map<std::string, bufferlist> attrs;
-  std::optional<std::string> user_data;
-};
-
-using RGWObjectSimplePutCR = RGWSimpleWriteOnlyAsyncCR<rgw_object_simple_put_params>;
-
-
-struct rgw_bucket_lifecycle_config_params {
-  rgw::sal::Bucket* bucket;
-  rgw::sal::Attrs bucket_attrs;
-  RGWLifecycleConfiguration config;
-};
-
-using RGWBucketLifecycleConfigCR = RGWSimpleWriteOnlyAsyncCR<rgw_bucket_lifecycle_config_params>;
-
-struct rgw_bucket_get_sync_policy_params {
-  std::optional<rgw_zone_id> zone;
-  std::optional<rgw_bucket> bucket;
-};
-
-struct rgw_bucket_get_sync_policy_result {
-  RGWBucketSyncPolicyHandlerRef policy_handler;
-};
-
-using RGWBucketGetSyncPolicyHandlerCR = RGWSimpleAsyncCR<rgw_bucket_get_sync_policy_params, rgw_bucket_get_sync_policy_result>;
-
-#endif
diff --git a/src/rgw/store/rados/rgw_d3n_datacache.cc b/src/rgw/store/rados/rgw_d3n_datacache.cc
deleted file mode 100644 (file)
index ed375e2..0000000
+++ /dev/null
@@ -1,369 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_d3n_datacache.h"
-#include "rgw_rest_client.h"
-#include "rgw_auth_s3.h"
-#include "rgw_op.h"
-#include "rgw_common.h"
-#include "rgw_auth_s3.h"
-#include "rgw_op.h"
-#include "rgw_crypt_sanitize.h"
-#if defined(__linux__)
-#include <features.h>
-#endif
-
-#if __has_include(<filesystem>)
-#include <filesystem>
-namespace efs = std::filesystem;
-#else
-#include <experimental/filesystem>
-namespace efs = std::experimental::filesystem;
-#endif
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-int D3nCacheAioWriteRequest::d3n_prepare_libaio_write_op(bufferlist& bl, unsigned int len, string oid, string cache_location)
-{
-  std::string location = cache_location + oid;
-  int r = 0;
-
-  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): Write To Cache, location=" << location << dendl;
-  cb = new struct aiocb;
-  mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
-  memset(cb, 0, sizeof(struct aiocb));
-  r = fd = ::open(location.c_str(), O_WRONLY | O_CREAT | O_TRUNC, mode);
-  if (fd < 0) {
-    ldout(cct, 0) << "ERROR: D3nCacheAioWriteRequest::create_io: open file failed, errno=" << errno << ", location='" << location.c_str() << "'" << dendl;
-    goto done;
-  }
-  if (g_conf()->rgw_d3n_l1_fadvise != POSIX_FADV_NORMAL)
-    posix_fadvise(fd, 0, 0, g_conf()->rgw_d3n_l1_fadvise);
-  cb->aio_fildes = fd;
-
-  data = malloc(len);
-  if (!data) {
-    ldout(cct, 0) << "ERROR: D3nCacheAioWriteRequest::create_io: memory allocation failed" << dendl;
-    goto close_file;
-  }
-  cb->aio_buf = data;
-  memcpy((void*)data, bl.c_str(), len);
-  cb->aio_nbytes = len;
-  goto done;
-
-close_file:
-  ::close(fd);
-done:
-  return r;
-}
-
-D3nDataCache::D3nDataCache()
-  : cct(nullptr), io_type(_io_type::ASYNC_IO), free_data_cache_size(0), outstanding_write_size(0)
-{
-  lsubdout(g_ceph_context, rgw_datacache, 5) << "D3nDataCache: " << __func__ << "()" << dendl;
-}
-
-void D3nDataCache::init(CephContext *_cct) {
-  cct = _cct;
-  free_data_cache_size = cct->_conf->rgw_d3n_l1_datacache_size;
-  head = nullptr;
-  tail = nullptr;
-  cache_location = cct->_conf->rgw_d3n_l1_datacache_persistent_path;
-  if(cache_location.back() != '/') {
-      cache_location += "/";
-  }
-  try {
-    if (efs::exists(cache_location)) {
-      // d3n: evict the cache storage directory
-      if (g_conf()->rgw_d3n_l1_evict_cache_on_start) {
-        lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: evicting the persistent storage directory on start" << dendl;
-        for (auto& p : efs::directory_iterator(cache_location)) {
-          efs::remove_all(p.path());
-        }
-      }
-    } else {
-      // create the cache storage directory
-      lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: creating the persistent storage directory on start" << dendl;
-      efs::create_directories(cache_location);
-    }
-  } catch (const efs::filesystem_error& e) {
-    lderr(g_ceph_context) << "D3nDataCache: init: ERROR initializing the cache storage directory '" << cache_location <<
-                              "' : " << e.what() << dendl;
-  }
-
-  auto conf_eviction_policy = cct->_conf.get_val<std::string>("rgw_d3n_l1_eviction_policy");
-  ceph_assert(conf_eviction_policy == "lru" || conf_eviction_policy == "random");
-  if (conf_eviction_policy == "lru")
-    eviction_policy = _eviction_policy::LRU;
-  if (conf_eviction_policy == "random")
-    eviction_policy = _eviction_policy::RANDOM;
-
-#if defined(HAVE_LIBAIO) && defined(__GLIBC__)
-  // libaio setup
-  struct aioinit ainit{0};
-  ainit.aio_threads = cct->_conf.get_val<int64_t>("rgw_d3n_libaio_aio_threads");
-  ainit.aio_num = cct->_conf.get_val<int64_t>("rgw_d3n_libaio_aio_num");
-  ainit.aio_idle_time = 120;
-  aio_init(&ainit);
-#endif
-}
-
-int D3nDataCache::d3n_io_write(bufferlist& bl, unsigned int len, std::string oid)
-{
-  D3nChunkDataInfo* chunk_info = new D3nChunkDataInfo;
-  std::string location = cache_location + oid;
-
-  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl;
-  FILE *cache_file = nullptr;
-  int r = 0;
-  size_t nbytes = 0;
-
-  cache_file = fopen(location.c_str(), "w+");
-  if (cache_file == nullptr) {
-    ldout(cct, 0) << "ERROR: D3nDataCache::fopen file has return error, errno=" << errno << dendl;
-    return -errno;
-  }
-
-  nbytes = fwrite(bl.c_str(), 1, len, cache_file);
-  if (nbytes != len) {
-    ldout(cct, 0) << "ERROR: D3nDataCache::io_write: fwrite has returned error: nbytes!=len, nbytes=" << nbytes << ", len=" << len << dendl;
-    return -EIO;
-  }
-
-  r = fclose(cache_file);
-  if (r != 0) {
-    ldout(cct, 0) << "ERROR: D3nDataCache::fclsoe file has return error, errno=" << errno << dendl;
-    return -errno;
-  }
-
-  { // update cahce_map entries for new chunk in cache
-    const std::lock_guard l(d3n_cache_lock);
-    chunk_info->oid = oid;
-    chunk_info->set_ctx(cct);
-    chunk_info->size = len;
-    d3n_cache_map.insert(pair<string, D3nChunkDataInfo*>(oid, chunk_info));
-  }
-
-  return r;
-}
-
-void d3n_libaio_write_cb(sigval sigval)
-{
-  lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
-  D3nCacheAioWriteRequest* c = static_cast<D3nCacheAioWriteRequest*>(sigval.sival_ptr);
-  c->priv_data->d3n_libaio_write_completion_cb(c);
-}
-
-
-void D3nDataCache::d3n_libaio_write_completion_cb(D3nCacheAioWriteRequest* c)
-{
-  D3nChunkDataInfo* chunk_info{nullptr};
-
-  ldout(cct, 5) << "D3nDataCache: " << __func__ << "(): oid=" << c->oid << dendl;
-
-  { // update cache_map entries for new chunk in cache
-    const std::lock_guard l(d3n_cache_lock);
-    d3n_outstanding_write_list.erase(c->oid);
-    chunk_info = new D3nChunkDataInfo;
-    chunk_info->oid = c->oid;
-    chunk_info->set_ctx(cct);
-    chunk_info->size = c->cb->aio_nbytes;
-    d3n_cache_map.insert(pair<string, D3nChunkDataInfo*>(c->oid, chunk_info));
-  }
-
-  { // update free size
-    const std::lock_guard l(d3n_eviction_lock);
-    free_data_cache_size -= c->cb->aio_nbytes;
-    outstanding_write_size -= c->cb->aio_nbytes;
-    lru_insert_head(chunk_info);
-  }
-  delete c;
-  c = nullptr;
-}
-
-int D3nDataCache::d3n_libaio_create_write_request(bufferlist& bl, unsigned int len, std::string oid)
-{
-  lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "(): Write To Cache, oid=" << oid << ", len=" << len << dendl;
-  struct D3nCacheAioWriteRequest* wr = new struct D3nCacheAioWriteRequest(cct);
-  int r=0;
-  if ((r = wr->d3n_prepare_libaio_write_op(bl, len, oid, cache_location)) < 0) {
-    ldout(cct, 0) << "ERROR: D3nDataCache: " << __func__ << "() prepare libaio write op r=" << r << dendl;
-    goto done;
-  }
-  wr->cb->aio_sigevent.sigev_notify = SIGEV_THREAD;
-  wr->cb->aio_sigevent.sigev_notify_function = d3n_libaio_write_cb;
-  wr->cb->aio_sigevent.sigev_notify_attributes = nullptr;
-  wr->cb->aio_sigevent.sigev_value.sival_ptr = (void*)wr;
-  wr->oid = oid;
-  wr->priv_data = this;
-
-  if ((r = ::aio_write(wr->cb)) != 0) {
-    ldout(cct, 0) << "ERROR: D3nDataCache: " << __func__ << "() aio_write r=" << r << dendl;
-    goto error;
-  }
-  return 0;
-
-error:
-  delete wr;
-done:
-  return r;
-}
-
-void D3nDataCache::put(bufferlist& bl, unsigned int len, std::string& oid)
-{
-  size_t sr = 0;
-  uint64_t freed_size = 0, _free_data_cache_size = 0, _outstanding_write_size = 0;
-
-  ldout(cct, 10) << "D3nDataCache::" << __func__ << "(): oid=" << oid << ", len=" << len << dendl;
-  {
-    const std::lock_guard l(d3n_cache_lock);
-    std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.find(oid);
-    if (iter != d3n_cache_map.end()) {
-      ldout(cct, 10) << "D3nDataCache::" << __func__ << "(): data already cached, no rewrite" << dendl;
-      return;
-    }
-    auto it = d3n_outstanding_write_list.find(oid);
-    if (it != d3n_outstanding_write_list.end()) {
-      ldout(cct, 10) << "D3nDataCache: NOTE: data put in cache already issued, no rewrite" << dendl;
-      return;
-    }
-    d3n_outstanding_write_list.insert(oid);
-  }
-  {
-    const std::lock_guard l(d3n_eviction_lock);
-    _free_data_cache_size = free_data_cache_size;
-    _outstanding_write_size = outstanding_write_size;
-  }
-  ldout(cct, 20) << "D3nDataCache: Before eviction _free_data_cache_size:" << _free_data_cache_size << ", _outstanding_write_size:" << _outstanding_write_size << ", freed_size:" << freed_size << dendl;
-  while (len > (_free_data_cache_size - _outstanding_write_size + freed_size)) {
-    ldout(cct, 20) << "D3nDataCache: enter eviction" << dendl;
-    if (eviction_policy == _eviction_policy::LRU) {
-      sr = lru_eviction();
-    } else if (eviction_policy == _eviction_policy::RANDOM) {
-      sr = random_eviction();
-    } else {
-      ldout(cct, 0) << "D3nDataCache: Warning: unknown cache eviction policy, defaulting to lru eviction" << dendl;
-      sr = lru_eviction();
-    }
-    if (sr == 0) {
-      ldout(cct, 2) << "D3nDataCache: Warning: eviction was not able to free disk space, not writing to cache" << dendl;
-      d3n_outstanding_write_list.erase(oid);
-      return;
-    }
-    ldout(cct, 20) << "D3nDataCache: completed eviction of " << sr << " bytes" << dendl;
-    freed_size += sr;
-  }
-  int r = 0;
-  r = d3n_libaio_create_write_request(bl, len, oid);
-  if (r < 0) {
-    const std::lock_guard l(d3n_cache_lock);
-    d3n_outstanding_write_list.erase(oid);
-    ldout(cct, 1) << "D3nDataCache: create_aio_write_request fail, r=" << r << dendl;
-    return;
-  }
-
-  const std::lock_guard l(d3n_eviction_lock);
-  free_data_cache_size += freed_size;
-  outstanding_write_size += len;
-}
-
-bool D3nDataCache::get(const string& oid, const off_t len)
-{
-  const std::lock_guard l(d3n_cache_lock);
-  bool exist = false;
-  string location = cache_location + oid;
-
-  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl;
-  std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.find(oid);
-  if (!(iter == d3n_cache_map.end())) {
-    // check inside cache whether file exists or not!!!! then make exist true;
-    struct D3nChunkDataInfo* chdo = iter->second;
-    struct stat st;
-    int r = stat(location.c_str(), &st);
-    if ( r != -1 && st.st_size == len) { // file exists and containes required data range length
-      exist = true;
-      /*LRU*/
-      /*get D3nChunkDataInfo*/
-      const std::lock_guard l(d3n_eviction_lock);
-      lru_remove(chdo);
-      lru_insert_head(chdo);
-    } else {
-      d3n_cache_map.erase(oid);
-      const std::lock_guard l(d3n_eviction_lock);
-      lru_remove(chdo);
-      delete chdo;
-      exist = false;
-    }
-  }
-  return exist;
-}
-
-size_t D3nDataCache::random_eviction()
-{
-  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl;
-  int n_entries = 0;
-  int random_index = 0;
-  size_t freed_size = 0;
-  D3nChunkDataInfo* del_entry;
-  string del_oid, location;
-  {
-    const std::lock_guard l(d3n_cache_lock);
-    n_entries = d3n_cache_map.size();
-    if (n_entries <= 0) {
-      return -1;
-    }
-    srand (time(NULL));
-    random_index = ceph::util::generate_random_number<int>(0, n_entries-1);
-    std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.begin();
-    std::advance(iter, random_index);
-    del_oid = iter->first;
-    del_entry =  iter->second;
-    ldout(cct, 20) << "D3nDataCache: random_eviction: index:" << random_index << ", free size: " << del_entry->size << dendl;
-    freed_size = del_entry->size;
-    delete del_entry;
-    del_entry = nullptr;
-    d3n_cache_map.erase(del_oid); // oid
-  }
-
-  location = cache_location + del_oid;
-  ::remove(location.c_str());
-  return freed_size;
-}
-
-size_t D3nDataCache::lru_eviction()
-{
-  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl;
-  int n_entries = 0;
-  size_t freed_size = 0;
-  D3nChunkDataInfo* del_entry;
-  string del_oid, location;
-
-  {
-    const std::lock_guard l(d3n_eviction_lock);
-    del_entry = tail;
-    if (del_entry == nullptr) {
-      ldout(cct, 2) << "D3nDataCache: lru_eviction: del_entry=null_ptr" << dendl;
-      return 0;
-    }
-    lru_remove(del_entry);
-  }
-
-  {
-    const std::lock_guard l(d3n_cache_lock);
-    n_entries = d3n_cache_map.size();
-    if (n_entries <= 0) {
-      ldout(cct, 2) << "D3nDataCache: lru_eviction: cache_map.size<=0" << dendl;
-      return -1;
-    }
-    del_oid = del_entry->oid;
-    ldout(cct, 20) << "D3nDataCache: lru_eviction: oid to remove: " << del_oid << dendl;
-    d3n_cache_map.erase(del_oid); // oid
-  }
-  freed_size = del_entry->size;
-  delete del_entry;
-  location = cache_location + del_oid;
-  ::remove(location.c_str());
-  return freed_size;
-}
diff --git a/src/rgw/store/rados/rgw_d3n_datacache.h b/src/rgw/store/rados/rgw_d3n_datacache.h
deleted file mode 100644 (file)
index 5d3537f..0000000
+++ /dev/null
@@ -1,261 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGWD3NDATACACHE_H
-#define CEPH_RGWD3NDATACACHE_H
-
-#include "rgw_rados.h"
-#include <curl/curl.h>
-
-#include "rgw_common.h"
-
-#include <unistd.h>
-#include <signal.h>
-#include "include/Context.h"
-#include "include/lru.h"
-#include "rgw_d3n_cacherequest.h"
-
-
-/*D3nDataCache*/
-struct D3nDataCache;
-
-
-struct D3nChunkDataInfo : public LRUObject {
-       CephContext *cct;
-       uint64_t size;
-       time_t access_time;
-       std::string address;
-       std::string oid;
-       bool complete;
-       struct D3nChunkDataInfo* lru_prev;
-       struct D3nChunkDataInfo* lru_next;
-
-       D3nChunkDataInfo(): size(0) {}
-
-       void set_ctx(CephContext *_cct) {
-               cct = _cct;
-       }
-
-       void dump(Formatter *f) const;
-       static void generate_test_instances(std::list<D3nChunkDataInfo*>& o);
-};
-
-struct D3nCacheAioWriteRequest {
-       std::string oid;
-       void *data;
-       int fd;
-       struct aiocb *cb;
-       D3nDataCache *priv_data;
-       CephContext *cct;
-
-       D3nCacheAioWriteRequest(CephContext *_cct) : cct(_cct) {}
-       int d3n_prepare_libaio_write_op(bufferlist& bl, unsigned int len, std::string oid, std::string cache_location);
-
-  ~D3nCacheAioWriteRequest() {
-    ::close(fd);
-               cb->aio_buf = nullptr;
-               free(data);
-               data = nullptr;
-               delete(cb);
-  }
-};
-
-struct D3nDataCache {
-
-private:
-  std::unordered_map<std::string, D3nChunkDataInfo*> d3n_cache_map;
-  std::set<std::string> d3n_outstanding_write_list;
-  std::mutex d3n_cache_lock;
-  std::mutex d3n_eviction_lock;
-
-  CephContext *cct;
-  enum class _io_type {
-    SYNC_IO = 1,
-    ASYNC_IO = 2,
-    SEND_FILE = 3
-  } io_type;
-  enum class _eviction_policy {
-    LRU=0, RANDOM=1
-  } eviction_policy;
-
-  struct sigaction action;
-  uint64_t free_data_cache_size = 0;
-  uint64_t outstanding_write_size = 0;
-  struct D3nChunkDataInfo* head;
-  struct D3nChunkDataInfo* tail;
-
-private:
-  void add_io();
-
-public:
-  D3nDataCache();
-  ~D3nDataCache() {
-    while (lru_eviction() > 0);
-  }
-
-  std::string cache_location;
-
-  bool get(const std::string& oid, const off_t len);
-  void put(bufferlist& bl, unsigned int len, std::string& obj_key);
-  int d3n_io_write(bufferlist& bl, unsigned int len, std::string oid);
-  int d3n_libaio_create_write_request(bufferlist& bl, unsigned int len, std::string oid);
-  void d3n_libaio_write_completion_cb(D3nCacheAioWriteRequest* c);
-  size_t random_eviction();
-  size_t lru_eviction();
-
-  void init(CephContext *_cct);
-
-  void lru_insert_head(struct D3nChunkDataInfo* o) {
-    lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
-    o->lru_next = head;
-    o->lru_prev = nullptr;
-    if (head) {
-      head->lru_prev = o;
-    } else {
-      tail = o;
-    }
-    head = o;
-  }
-
-  void lru_insert_tail(struct D3nChunkDataInfo* o) {
-    lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
-    o->lru_next = nullptr;
-    o->lru_prev = tail;
-    if (tail) {
-      tail->lru_next = o;
-    } else {
-      head = o;
-    }
-    tail = o;
-  }
-
-  void lru_remove(struct D3nChunkDataInfo* o) {
-    lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
-    if (o->lru_next)
-      o->lru_next->lru_prev = o->lru_prev;
-    else
-      tail = o->lru_prev;
-    if (o->lru_prev)
-      o->lru_prev->lru_next = o->lru_next;
-    else
-      head = o->lru_next;
-    o->lru_next = o->lru_prev = nullptr;
-  }
-};
-
-
-template <class T>
-class D3nRGWDataCache : public T {
-
-public:
-  D3nRGWDataCache() {}
-
-  int init_rados() override {
-    int ret;
-    ret = T::init_rados();
-    if (ret < 0)
-      return ret;
-
-    return 0;
-  }
-
-  int get_obj_iterate_cb(const DoutPrefixProvider *dpp, const rgw_raw_obj& read_obj, off_t obj_ofs,
-                         off_t read_ofs, off_t len, bool is_head_obj,
-                         RGWObjState *astate, void *arg) override;
-};
-
-template<typename T>
-int D3nRGWDataCache<T>::get_obj_iterate_cb(const DoutPrefixProvider *dpp, const rgw_raw_obj& read_obj, off_t obj_ofs,
-                                 off_t read_ofs, off_t len, bool is_head_obj,
-                                 RGWObjState *astate, void *arg) {
-  lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache::" << __func__ << "(): is head object : " << is_head_obj << dendl;
-  librados::ObjectReadOperation op;
-  struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
-  std::string oid, key;
-
-  if (is_head_obj) {
-    // only when reading from the head object do we need to do the atomic test
-    int r = T::append_atomic_test(dpp, astate, op);
-    if (r < 0)
-      return r;
-
-    if (astate &&
-        obj_ofs < astate->data.length()) {
-      unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
-
-      r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
-      if (r < 0)
-        return r;
-
-      len -= chunk_len;
-      d->offset += chunk_len;
-      read_ofs += chunk_len;
-      obj_ofs += chunk_len;
-      if (!len)
-        return 0;
-    }
-
-    auto obj = d->rgwrados->svc.rados->obj(read_obj);
-    r = obj.open(dpp);
-    if (r < 0) {
-      lsubdout(g_ceph_context, rgw, 4) << "failed to open rados context for " << read_obj << dendl;
-      return r;
-    }
-
-    ldpp_dout(dpp, 20) << "D3nDataCache::" << __func__ << "(): oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
-    op.read(read_ofs, len, nullptr, nullptr);
-
-    const uint64_t cost = len;
-    const uint64_t id = obj_ofs; // use logical object offset for sorting replies
-
-    auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
-    return d->flush(std::move(completed));
-  } else {
-    ldpp_dout(dpp, 20) << "D3nDataCache::" << __func__ << "(): oid=" << read_obj.oid << ", is_head_obj=" << is_head_obj << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << ", len=" << len << dendl;
-    int r;
-
-    op.read(read_ofs, len, nullptr, nullptr);
-
-    const uint64_t cost = len;
-    const uint64_t id = obj_ofs; // use logical object offset for sorting replies
-    oid = read_obj.oid;
-
-    auto obj = d->rgwrados->svc.rados->obj(read_obj);
-    r = obj.open(dpp);
-    if (r < 0) {
-      lsubdout(g_ceph_context, rgw, 0) << "D3nDataCache: Error: failed to open rados context for " << read_obj << ", r=" << r << dendl;
-      return r;
-    }
-
-    const bool is_compressed = (astate->attrset.find(RGW_ATTR_COMPRESSION) != astate->attrset.end());
-    const bool is_encrypted = (astate->attrset.find(RGW_ATTR_CRYPT_MODE) != astate->attrset.end());
-    if (read_ofs != 0 || astate->size != astate->accounted_size || is_compressed || is_encrypted) {
-      d->d3n_bypass_cache_write = true;
-      lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: " << __func__ << "(): Note - bypassing datacache: oid=" << read_obj.oid << ", read_ofs!=0 = " << read_ofs << ", size=" << astate->size << " != accounted_size=" << astate->accounted_size << ", is_compressed=" << is_compressed << ", is_encrypted=" << is_encrypted  << dendl;
-      auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
-      r = d->flush(std::move(completed));
-      return r;
-    }
-
-    if (d->rgwrados->d3n_data_cache->get(oid, len)) {
-      // Read From Cache
-      ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): READ FROM CACHE: oid=" << read_obj.oid << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << ", len=" << len << dendl;
-      auto completed = d->aio->get(obj, rgw::Aio::d3n_cache_op(dpp, d->yield, read_ofs, len, d->rgwrados->d3n_data_cache->cache_location), cost, id);
-      r = d->flush(std::move(completed));
-      if (r < 0) {
-        lsubdout(g_ceph_context, rgw, 0) << "D3nDataCache: " << __func__ << "(): Error: failed to drain/flush, r= " << r << dendl;
-      }
-      return r;
-    } else {
-      // Write To Cache
-      ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): WRITE TO CACHE: oid=" << read_obj.oid << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << " len=" << len << dendl;
-      auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
-      return d->flush(std::move(completed));
-    }
-  }
-  lsubdout(g_ceph_context, rgw, 1) << "D3nDataCache: " << __func__ << "(): Warning: Check head object cache handling flow, oid=" << read_obj.oid << dendl;
-
-  return 0;
-}
-
-#endif
diff --git a/src/rgw/store/rados/rgw_data_sync.cc b/src/rgw/store/rados/rgw_data_sync.cc
deleted file mode 100644 (file)
index 47573b7..0000000
+++ /dev/null
@@ -1,6460 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "common/ceph_json.h"
-#include "common/RefCountedObj.h"
-#include "common/WorkQueue.h"
-#include "common/Throttle.h"
-#include "common/errno.h"
-
-#include "rgw_common.h"
-#include "rgw_zone.h"
-#include "rgw_sync.h"
-#include "rgw_data_sync.h"
-#include "rgw_rest_conn.h"
-#include "rgw_cr_rados.h"
-#include "rgw_cr_rest.h"
-#include "rgw_cr_tools.h"
-#include "rgw_http_client.h"
-#include "rgw_bucket.h"
-#include "rgw_bucket_sync.h"
-#include "rgw_bucket_sync_cache.h"
-#include "rgw_datalog.h"
-#include "rgw_metadata.h"
-#include "rgw_sync_counters.h"
-#include "rgw_sync_error_repo.h"
-#include "rgw_sync_module.h"
-#include "rgw_sal.h"
-
-#include "cls/lock/cls_lock_client.h"
-#include "cls/rgw/cls_rgw_client.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_sync_modules.h"
-#include "rgw_bucket.h"
-
-#include "include/common_fwd.h"
-#include "include/random.h"
-
-#include <boost/asio/yield.hpp>
-#include <string_view>
-
-#define dout_subsys ceph_subsys_rgw
-
-#undef dout_prefix
-#define dout_prefix (*_dout << "data sync: ")
-
-using namespace std;
-
-static const string datalog_sync_status_oid_prefix = "datalog.sync-status";
-static const string datalog_sync_status_shard_prefix = "datalog.sync-status.shard";
-static const string datalog_sync_full_sync_index_prefix = "data.full-sync.index";
-static const string bucket_full_status_oid_prefix = "bucket.full-sync-status";
-static const string bucket_status_oid_prefix = "bucket.sync-status";
-static const string object_status_oid_prefix = "bucket.sync-status";
-
-void rgw_datalog_info::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("num_objects", num_shards, obj);
-}
-
-void rgw_datalog_entry::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("key", key, obj);
-  utime_t ut;
-  JSONDecoder::decode_json("timestamp", ut, obj);
-  timestamp = ut.to_real_time();
-}
-
-void rgw_datalog_shard_data::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("marker", marker, obj);
-  JSONDecoder::decode_json("truncated", truncated, obj);
-  JSONDecoder::decode_json("entries", entries, obj);
-};
-
-// print a bucket shard with [gen]
-std::string to_string(const rgw_bucket_shard& bs, std::optional<uint64_t> gen)
-{
-  constexpr auto digits10 = std::numeric_limits<uint64_t>::digits10;
-  constexpr auto reserve = 2 + digits10; // [value]
-  auto str = bs.get_key('/', ':', ':', reserve);
-  str.append(1, '[');
-  str.append(std::to_string(gen.value_or(0)));
-  str.append(1, ']');
-  return str;
-}
-
-class RGWReadDataSyncStatusMarkersCR : public RGWShardCollectCR {
-  static constexpr int MAX_CONCURRENT_SHARDS = 16;
-
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *env;
-  const int num_shards;
-  int shard_id{0};;
-
-  map<uint32_t, rgw_data_sync_marker>& markers;
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to read data sync status: "
-          << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  RGWReadDataSyncStatusMarkersCR(RGWDataSyncCtx *sc, int num_shards,
-                                 map<uint32_t, rgw_data_sync_marker>& markers)
-    : RGWShardCollectCR(sc->cct, MAX_CONCURRENT_SHARDS),
-      sc(sc), env(sc->env), num_shards(num_shards), markers(markers)
-  {}
-  bool spawn_next() override;
-};
-
-bool RGWReadDataSyncStatusMarkersCR::spawn_next()
-{
-  if (shard_id >= num_shards) {
-    return false;
-  }
-  using CR = RGWSimpleRadosReadCR<rgw_data_sync_marker>;
-  spawn(new CR(env->dpp, env->async_rados, env->svc->sysobj,
-               rgw_raw_obj(env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id)),
-               &markers[shard_id]),
-        false);
-  shard_id++;
-  return true;
-}
-
-class RGWReadDataSyncRecoveringShardsCR : public RGWShardCollectCR {
-  static constexpr int MAX_CONCURRENT_SHARDS = 16;
-
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *env;
-
-  uint64_t max_entries;
-  int num_shards;
-  int shard_id{0};
-
-  string marker;
-  std::vector<RGWRadosGetOmapKeysCR::ResultPtr>& omapkeys;
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to list recovering data sync: "
-          << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  RGWReadDataSyncRecoveringShardsCR(RGWDataSyncCtx *sc, uint64_t _max_entries, int _num_shards,
-                                    std::vector<RGWRadosGetOmapKeysCR::ResultPtr>& omapkeys)
-    : RGWShardCollectCR(sc->cct, MAX_CONCURRENT_SHARDS), sc(sc), env(sc->env),
-      max_entries(_max_entries), num_shards(_num_shards), omapkeys(omapkeys)
-  {}
-  bool spawn_next() override;
-};
-
-bool RGWReadDataSyncRecoveringShardsCR::spawn_next()
-{
-  if (shard_id >= num_shards)
-    return false;
-  string error_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id) + ".retry";
-  auto& shard_keys = omapkeys[shard_id];
-  shard_keys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
-  spawn(new RGWRadosGetOmapKeysCR(env->driver, rgw_raw_obj(env->svc->zone->get_zone_params().log_pool, error_oid),
-                                  marker, max_entries, shard_keys), false);
-
-  ++shard_id;
-  return true;
-}
-
-class RGWReadDataSyncStatusCoroutine : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  rgw_data_sync_status *sync_status;
-
-public:
-  RGWReadDataSyncStatusCoroutine(RGWDataSyncCtx *_sc,
-                                 rgw_data_sync_status *_status)
-    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(sc->env), sync_status(_status)
-  {}
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int RGWReadDataSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    // read sync info
-    using ReadInfoCR = RGWSimpleRadosReadCR<rgw_data_sync_info>;
-    yield {
-      bool empty_on_enoent = false; // fail on ENOENT
-      call(new ReadInfoCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                          rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone)),
-                          &sync_status->sync_info, empty_on_enoent));
-    }
-    if (retcode < 0) {
-      ldpp_dout(dpp, 4) << "failed to read sync status info with "
-          << cpp_strerror(retcode) << dendl;
-      return set_cr_error(retcode);
-    }
-    // read shard markers
-    using ReadMarkersCR = RGWReadDataSyncStatusMarkersCR;
-    yield call(new ReadMarkersCR(sc, sync_status->sync_info.num_shards,
-                                 sync_status->sync_markers));
-    if (retcode < 0) {
-      ldpp_dout(dpp, 4) << "failed to read sync status markers with "
-          << cpp_strerror(retcode) << dendl;
-      return set_cr_error(retcode);
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
-
-class RGWReadRemoteDataLogShardInfoCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-
-  RGWRESTReadResource *http_op;
-
-  int shard_id;
-  RGWDataChangesLogInfo *shard_info;
-
-public:
-  RGWReadRemoteDataLogShardInfoCR(RGWDataSyncCtx *_sc,
-                                  int _shard_id, RGWDataChangesLogInfo *_shard_info) : RGWCoroutine(_sc->cct),
-                                                      sc(_sc),
-                                                      sync_env(_sc->env),
-                                                      http_op(NULL),
-                                                      shard_id(_shard_id),
-                                                      shard_info(_shard_info) {
-  }
-
-  ~RGWReadRemoteDataLogShardInfoCR() override {
-    if (http_op) {
-      http_op->put();
-    }
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      yield {
-       char buf[16];
-       snprintf(buf, sizeof(buf), "%d", shard_id);
-        rgw_http_param_pair pairs[] = { { "type" , "data" },
-                                       { "id", buf },
-                                       { "info" , NULL },
-                                       { NULL, NULL } };
-
-        string p = "/admin/log/";
-
-        http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
-
-        init_new_io(http_op);
-
-        int ret = http_op->aio_read(dpp);
-        if (ret < 0) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
-          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
-          return set_cr_error(ret);
-        }
-
-        return io_block(0);
-      }
-      yield {
-        int ret = http_op->wait(shard_info, null_yield);
-        if (ret < 0) {
-          return set_cr_error(ret);
-        }
-        return set_cr_done();
-      }
-    }
-    return 0;
-  }
-};
-
-struct read_remote_data_log_response {
-  string marker;
-  bool truncated;
-  vector<rgw_data_change_log_entry> entries;
-
-  read_remote_data_log_response() : truncated(false) {}
-
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("marker", marker, obj);
-    JSONDecoder::decode_json("truncated", truncated, obj);
-    JSONDecoder::decode_json("entries", entries, obj);
-  };
-};
-
-class RGWReadRemoteDataLogShardCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-
-  RGWRESTReadResource *http_op = nullptr;
-
-  int shard_id;
-  const std::string& marker;
-  string *pnext_marker;
-  vector<rgw_data_change_log_entry> *entries;
-  bool *truncated;
-
-  read_remote_data_log_response response;
-  std::optional<TOPNSPC::common::PerfGuard> timer;
-
-public:
-  RGWReadRemoteDataLogShardCR(RGWDataSyncCtx *_sc, int _shard_id,
-                              const std::string& marker, string *pnext_marker,
-                              vector<rgw_data_change_log_entry> *_entries,
-                              bool *_truncated)
-    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
-      shard_id(_shard_id), marker(marker), pnext_marker(pnext_marker),
-      entries(_entries), truncated(_truncated) {
-  }
-  ~RGWReadRemoteDataLogShardCR() override {
-    if (http_op) {
-      http_op->put();
-    }
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      yield {
-       char buf[16];
-       snprintf(buf, sizeof(buf), "%d", shard_id);
-        rgw_http_param_pair pairs[] = { { "type" , "data" },
-                                       { "id", buf },
-                                       { "marker", marker.c_str() },
-                                       { "extra-info", "true" },
-                                       { NULL, NULL } };
-
-        string p = "/admin/log/";
-
-        http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
-
-        init_new_io(http_op);
-
-        if (sync_env->counters) {
-          timer.emplace(sync_env->counters, sync_counters::l_poll);
-        }
-        int ret = http_op->aio_read(dpp);
-        if (ret < 0) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
-          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
-          if (sync_env->counters) {
-            sync_env->counters->inc(sync_counters::l_poll_err);
-          }
-          return set_cr_error(ret);
-        }
-
-        return io_block(0);
-      }
-      yield {
-        timer.reset();
-        int ret = http_op->wait(&response, null_yield);
-        if (ret < 0) {
-          if (sync_env->counters && ret != -ENOENT) {
-            sync_env->counters->inc(sync_counters::l_poll_err);
-          }
-          return set_cr_error(ret);
-        }
-        entries->clear();
-        entries->swap(response.entries);
-        *pnext_marker = response.marker;
-        *truncated = response.truncated;
-        return set_cr_done();
-      }
-    }
-    return 0;
-  }
-};
-
-class RGWReadRemoteDataLogInfoCR : public RGWShardCollectCR {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-
-  int num_shards;
-  map<int, RGWDataChangesLogInfo> *datalog_info;
-
-  int shard_id;
-#define READ_DATALOG_MAX_CONCURRENT 10
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to fetch remote datalog info: "
-          << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
-public:
-  RGWReadRemoteDataLogInfoCR(RGWDataSyncCtx *_sc,
-                     int _num_shards,
-                     map<int, RGWDataChangesLogInfo> *_datalog_info) : RGWShardCollectCR(_sc->cct, READ_DATALOG_MAX_CONCURRENT),
-                                                                 sc(_sc), sync_env(_sc->env), num_shards(_num_shards),
-                                                                 datalog_info(_datalog_info), shard_id(0) {}
-  bool spawn_next() override;
-};
-
-bool RGWReadRemoteDataLogInfoCR::spawn_next() {
-  if (shard_id >= num_shards) {
-    return false;
-  }
-  spawn(new RGWReadRemoteDataLogShardInfoCR(sc, shard_id, &(*datalog_info)[shard_id]), false);
-  shard_id++;
-  return true;
-}
-
-class RGWListRemoteDataLogShardCR : public RGWSimpleCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  RGWRESTReadResource *http_op;
-
-  int shard_id;
-  string marker;
-  uint32_t max_entries;
-  rgw_datalog_shard_data *result;
-
-public:
-  RGWListRemoteDataLogShardCR(RGWDataSyncCtx *sc, int _shard_id,
-                              const string& _marker, uint32_t _max_entries,
-                              rgw_datalog_shard_data *_result)
-    : RGWSimpleCoroutine(sc->cct), sc(sc), sync_env(sc->env), http_op(NULL),
-      shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    RGWRESTConn *conn = sc->conn;
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%d", shard_id);
-
-    char max_entries_buf[32];
-    snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
-
-    const char *marker_key = (marker.empty() ? "" : "marker");
-
-    rgw_http_param_pair pairs[] = { { "type", "data" },
-      { "id", buf },
-      { "max-entries", max_entries_buf },
-      { marker_key, marker.c_str() },
-      { NULL, NULL } };
-
-    string p = "/admin/log/";
-
-    http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
-    init_new_io(http_op);
-
-    int ret = http_op->aio_read(dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
-      log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
-      http_op->put();
-      return ret;
-    }
-
-    return 0;
-  }
-
-  int request_complete() override {
-    int ret = http_op->wait(result, null_yield);
-    http_op->put();
-    if (ret < 0 && ret != -ENOENT) {
-      ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl;
-      return ret;
-    }
-    return 0;
-  }
-};
-
-class RGWListRemoteDataLogCR : public RGWShardCollectCR {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-
-  map<int, string> shards;
-  int max_entries_per_shard;
-  map<int, rgw_datalog_shard_data> *result;
-
-  map<int, string>::iterator iter;
-#define READ_DATALOG_MAX_CONCURRENT 10
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to list remote datalog: "
-          << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
-public:
-  RGWListRemoteDataLogCR(RGWDataSyncCtx *_sc,
-                     map<int, string>& _shards,
-                     int _max_entries_per_shard,
-                     map<int, rgw_datalog_shard_data> *_result) : RGWShardCollectCR(_sc->cct, READ_DATALOG_MAX_CONCURRENT),
-                                                                 sc(_sc), sync_env(_sc->env), max_entries_per_shard(_max_entries_per_shard),
-                                                                 result(_result) {
-    shards.swap(_shards);
-    iter = shards.begin();
-  }
-  bool spawn_next() override;
-};
-
-bool RGWListRemoteDataLogCR::spawn_next() {
-  if (iter == shards.end()) {
-    return false;
-  }
-
-  spawn(new RGWListRemoteDataLogShardCR(sc, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false);
-  ++iter;
-  return true;
-}
-
-class RGWInitDataSyncStatusCoroutine : public RGWCoroutine {
-  static constexpr uint32_t lock_duration = 30;
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  rgw::sal::RadosStore* driver; // RGWDataSyncEnv also has a pointer to driver
-  const rgw_pool& pool;
-  const uint32_t num_shards;
-
-  string sync_status_oid;
-
-  string lock_name;
-  string cookie;
-  rgw_data_sync_status *status;
-  map<int, RGWDataChangesLogInfo> shards_info;
-
-  RGWSyncTraceNodeRef tn;
-public:
-  RGWInitDataSyncStatusCoroutine(RGWDataSyncCtx *_sc, uint32_t num_shards,
-                                 uint64_t instance_id,
-                                 RGWSyncTraceNodeRef& _tn_parent,
-                                 rgw_data_sync_status *status)
-    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), driver(sync_env->driver),
-      pool(sync_env->svc->zone->get_zone_params().log_pool),
-      num_shards(num_shards), status(status),
-      tn(sync_env->sync_tracer->add_node(_tn_parent, "init_data_sync_status")) {
-    lock_name = "sync_lock";
-
-    status->sync_info.instance_id = instance_id;
-
-#define COOKIE_LEN 16
-    char buf[COOKIE_LEN + 1];
-
-    gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
-    cookie = buf;
-
-    sync_status_oid = RGWDataSyncStatusManager::sync_status_oid(sc->source_zone);
-
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    int ret;
-    reenter(this) {
-      using LockCR = RGWSimpleRadosLockCR;
-      yield call(new LockCR(sync_env->async_rados, driver,
-                            rgw_raw_obj{pool, sync_status_oid},
-                            lock_name, cookie, lock_duration));
-      if (retcode < 0) {
-        tn->log(0, SSTR("ERROR: failed to take a lock on " << sync_status_oid));
-        return set_cr_error(retcode);
-      }
-      using WriteInfoCR = RGWSimpleRadosWriteCR<rgw_data_sync_info>;
-      yield call(new WriteInfoCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                                 rgw_raw_obj{pool, sync_status_oid},
-                                 status->sync_info));
-      if (retcode < 0) {
-        tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode));
-        return set_cr_error(retcode);
-      }
-
-      /* take lock again, we just recreated the object */
-      yield call(new LockCR(sync_env->async_rados, driver,
-                            rgw_raw_obj{pool, sync_status_oid},
-                            lock_name, cookie, lock_duration));
-      if (retcode < 0) {
-        tn->log(0, SSTR("ERROR: failed to take a lock on " << sync_status_oid));
-        return set_cr_error(retcode);
-      }
-
-      tn->log(10, "took lease");
-
-      /* fetch current position in logs */
-      yield {
-        RGWRESTConn *conn = sync_env->svc->zone->get_zone_conn(sc->source_zone);
-        if (!conn) {
-          tn->log(0, SSTR("ERROR: connection to zone " << sc->source_zone << " does not exist!"));
-          return set_cr_error(-EIO);
-        }
-        for (uint32_t i = 0; i < num_shards; i++) {
-          spawn(new RGWReadRemoteDataLogShardInfoCR(sc, i, &shards_info[i]), true);
-        }
-      }
-      while (collect(&ret, NULL)) {
-        if (ret < 0) {
-          tn->log(0, SSTR("ERROR: failed to read remote data log shards"));
-          return set_state(RGWCoroutine_Error);
-        }
-        yield;
-      }
-      yield {
-        for (uint32_t i = 0; i < num_shards; i++) {
-          RGWDataChangesLogInfo& info = shards_info[i];
-          auto& marker = status->sync_markers[i];
-          marker.next_step_marker = info.marker;
-          marker.timestamp = info.last_update;
-          const auto& oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, i);
-          using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_data_sync_marker>;
-          spawn(new WriteMarkerCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                                  rgw_raw_obj{pool, oid}, marker), true);
-        }
-      }
-      while (collect(&ret, NULL)) {
-        if (ret < 0) {
-          tn->log(0, SSTR("ERROR: failed to write data sync status markers"));
-          return set_state(RGWCoroutine_Error);
-        }
-        yield;
-      }
-
-      status->sync_info.state = rgw_data_sync_info::StateBuildingFullSyncMaps;
-      yield call(new WriteInfoCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                                 rgw_raw_obj{pool, sync_status_oid},
-                                 status->sync_info));
-      if (retcode < 0) {
-        tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode));
-        return set_cr_error(retcode);
-      }
-      yield call(new RGWSimpleRadosUnlockCR(sync_env->async_rados, driver,
-                                            rgw_raw_obj{pool, sync_status_oid},
-                                            lock_name, cookie));
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-
-RGWRemoteDataLog::RGWRemoteDataLog(const DoutPrefixProvider *dpp,
-                                   rgw::sal::RadosStore* driver,
-                                   RGWAsyncRadosProcessor *async_rados)
-  : RGWCoroutinesManager(driver->ctx(), driver->getRados()->get_cr_registry()),
-      dpp(dpp), driver(driver),
-      cct(driver->ctx()), cr_registry(driver->getRados()->get_cr_registry()),
-      async_rados(async_rados),
-      http_manager(driver->ctx(), completion_mgr),
-      data_sync_cr(NULL),
-      initialized(false)
-{
-}
-
-int RGWRemoteDataLog::read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info)
-{
-  rgw_http_param_pair pairs[] = { { "type", "data" },
-                                  { NULL, NULL } };
-
-  int ret = sc.conn->get_json_resource(dpp, "/admin/log", pairs, null_yield, *log_info);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to fetch datalog info" << dendl;
-    return ret;
-  }
-
-  ldpp_dout(dpp, 20) << "remote datalog, num_shards=" << log_info->num_shards << dendl;
-
-  return 0;
-}
-
-int RGWRemoteDataLog::read_source_log_shards_info(const DoutPrefixProvider *dpp, map<int, RGWDataChangesLogInfo> *shards_info)
-{
-  rgw_datalog_info log_info;
-  int ret = read_log_info(dpp, &log_info);
-  if (ret < 0) {
-    return ret;
-  }
-
-  return run(dpp, new RGWReadRemoteDataLogInfoCR(&sc, log_info.num_shards, shards_info));
-}
-
-int RGWRemoteDataLog::read_source_log_shards_next(const DoutPrefixProvider *dpp, map<int, string> shard_markers, map<int, rgw_datalog_shard_data> *result)
-{
-  return run(dpp, new RGWListRemoteDataLogCR(&sc, shard_markers, 1, result));
-}
-
-int RGWRemoteDataLog::init(const rgw_zone_id& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger,
-                           RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& _sync_module,
-                           PerfCounters* counters)
-{
-  sync_env.init(dpp, cct, driver, driver->svc(), async_rados, &http_manager, _error_logger,
-                _sync_tracer, _sync_module, counters);
-  sc.init(&sync_env, _conn, _source_zone);
-
-  if (initialized) {
-    return 0;
-  }
-
-  int ret = http_manager.start();
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
-    return ret;
-  }
-
-  tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "data");
-
-  initialized = true;
-
-  return 0;
-}
-
-void RGWRemoteDataLog::finish()
-{
-  stop();
-}
-
-int RGWRemoteDataLog::read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status)
-{
-  // cannot run concurrently with run_sync(), so run in a separate manager
-  RGWCoroutinesManager crs(cct, cr_registry);
-  RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
-  int ret = http_manager.start();
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
-    return ret;
-  }
-  RGWDataSyncEnv sync_env_local = sync_env;
-  sync_env_local.http_manager = &http_manager;
-
-  RGWDataSyncCtx sc_local = sc;
-  sc_local.env = &sync_env_local;
-
-  ret = crs.run(dpp, new RGWReadDataSyncStatusCoroutine(&sc_local, sync_status));
-  http_manager.stop();
-  return ret;
-}
-
-int RGWRemoteDataLog::read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, set<int>& recovering_shards)
-{
-  // cannot run concurrently with run_sync(), so run in a separate manager
-  RGWCoroutinesManager crs(cct, cr_registry);
-  RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
-  int ret = http_manager.start();
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
-    return ret;
-  }
-  RGWDataSyncEnv sync_env_local = sync_env;
-  sync_env_local.http_manager = &http_manager;
-
-  RGWDataSyncCtx sc_local = sc;
-  sc_local.env = &sync_env_local;
-
-  std::vector<RGWRadosGetOmapKeysCR::ResultPtr> omapkeys;
-  omapkeys.resize(num_shards);
-  uint64_t max_entries{1};
-
-  ret = crs.run(dpp, new RGWReadDataSyncRecoveringShardsCR(&sc_local, max_entries, num_shards, omapkeys));
-  http_manager.stop();
-
-  if (ret == 0) {
-    for (int i = 0; i < num_shards; i++) {
-      if (omapkeys[i]->entries.size() != 0) {
-        recovering_shards.insert(i);
-      }
-    }
-  }
-
-  return ret;
-}
-
-int RGWRemoteDataLog::init_sync_status(const DoutPrefixProvider *dpp, int num_shards)
-{
-  rgw_data_sync_status sync_status;
-  sync_status.sync_info.num_shards = num_shards;
-
-  RGWCoroutinesManager crs(cct, cr_registry);
-  RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
-  int ret = http_manager.start();
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
-    return ret;
-  }
-  RGWDataSyncEnv sync_env_local = sync_env;
-  sync_env_local.http_manager = &http_manager;
-  auto instance_id = ceph::util::generate_random_number<uint64_t>();
-  RGWDataSyncCtx sc_local = sc;
-  sc_local.env = &sync_env_local;
-  ret = crs.run(dpp, new RGWInitDataSyncStatusCoroutine(&sc_local, num_shards, instance_id, tn, &sync_status));
-  http_manager.stop();
-  return ret;
-}
-
-static string full_data_sync_index_shard_oid(const rgw_zone_id& source_zone, int shard_id)
-{
-  char buf[datalog_sync_full_sync_index_prefix.size() + 1 + source_zone.id.size() + 1 + 16];
-  snprintf(buf, sizeof(buf), "%s.%s.%d", datalog_sync_full_sync_index_prefix.c_str(), source_zone.id.c_str(), shard_id);
-  return string(buf);
-}
-
-struct read_metadata_list {
-  string marker;
-  bool truncated;
-  list<string> keys;
-  int count;
-
-  read_metadata_list() : truncated(false), count(0) {}
-
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("marker", marker, obj);
-    JSONDecoder::decode_json("truncated", truncated, obj);
-    JSONDecoder::decode_json("keys", keys, obj);
-    JSONDecoder::decode_json("count", count, obj);
-  }
-};
-
-struct bucket_instance_meta_info {
-  string key;
-  obj_version ver;
-  utime_t mtime;
-  RGWBucketInstanceMetadataObject data;
-
-  bucket_instance_meta_info() {}
-
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("key", key, obj);
-    JSONDecoder::decode_json("ver", ver, obj);
-    JSONDecoder::decode_json("mtime", mtime, obj);
-    JSONDecoder::decode_json("data", data, obj);
-  }
-};
-
-class RGWReadRemoteBucketIndexLogInfoCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  const string instance_key;
-
-  rgw_bucket_index_marker_info *info;
-
-public:
-  RGWReadRemoteBucketIndexLogInfoCR(RGWDataSyncCtx *_sc,
-                                   const rgw_bucket& bucket,
-                                   rgw_bucket_index_marker_info *_info)
-    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
-      instance_key(bucket.get_key()), info(_info) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      yield {
-        rgw_http_param_pair pairs[] = { { "type" , "bucket-index" },
-                                       { "bucket-instance", instance_key.c_str() },
-                                       { "info" , NULL },
-                                       { NULL, NULL } };
-
-        string p = "/admin/log/";
-        call(new RGWReadRESTResourceCR<rgw_bucket_index_marker_info>(sync_env->cct, sc->conn, sync_env->http_manager, p, pairs, info));
-      }
-      if (retcode < 0) {
-        return set_cr_error(retcode);
-      }
-
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-
-
-class RGWListBucketIndexesCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env = sc->env;
-
-  rgw::sal::RadosStore* driver = sync_env->driver;
-
-  rgw_data_sync_status *sync_status;
-
-  int req_ret = 0;
-  int ret = 0;
-
-  list<string>::iterator iter;
-
-  unique_ptr<RGWShardedOmapCRManager> entries_index;
-  string oid_prefix =
-    datalog_sync_full_sync_index_prefix + "." + sc->source_zone.id;
-
-  string path = "/admin/metadata/bucket.instance";
-  bucket_instance_meta_info meta_info;
-  string key;
-
-  bool failed = false;
-  bool truncated = false;
-  read_metadata_list result;
-
-public:
-  RGWListBucketIndexesCR(RGWDataSyncCtx* sc,
-                         rgw_data_sync_status* sync_status)
-    : RGWCoroutine(sc->cct), sc(sc), sync_status(sync_status) {}
-  ~RGWListBucketIndexesCR() override { }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      entries_index = std::make_unique<RGWShardedOmapCRManager>(
-       sync_env->async_rados, driver, this,
-       cct->_conf->rgw_data_log_num_shards,
-       sync_env->svc->zone->get_zone_params().log_pool,
-       oid_prefix);
-      yield; // yield so OmapAppendCRs can start
-
-      do {
-        yield {
-          string entrypoint = "/admin/metadata/bucket.instance"s;
-
-          rgw_http_param_pair pairs[] = {{"max-entries", "1000"},
-                                         {"marker", result.marker.c_str()},
-                                         {NULL, NULL}};
-
-          call(new RGWReadRESTResourceCR<read_metadata_list>(
-                sync_env->cct, sc->conn, sync_env->http_manager,
-                entrypoint, pairs, &result));
-       }
-       if (retcode < 0) {
-         ldpp_dout(dpp, 0)
-           << "ERROR: failed to fetch metadata for section bucket.instance"
-           << dendl;
-          return set_cr_error(retcode);
-        }
-
-        for (iter = result.keys.begin(); iter != result.keys.end(); ++iter) {
-          ldpp_dout(dpp, 20) << "list metadata: section=bucket.instance key="
-                            << *iter << dendl;
-          key = *iter;
-
-          yield {
-            rgw_http_param_pair pairs[] = {{"key", key.c_str()},
-                                           {NULL, NULL}};
-
-            call(new RGWReadRESTResourceCR<bucket_instance_meta_info>(
-                  sync_env->cct, sc->conn, sync_env->http_manager, path, pairs,
-                  &meta_info));
-          }
-         if (retcode < 0) {
-           ldpp_dout(dpp, 0) << "ERROR: failed to fetch metadata for key: "
-                             << key << dendl;
-           return set_cr_error(retcode);
-         }
-         // Now that bucket full sync is bucket-wide instead of
-         // per-shard, we only need to register a single shard of
-         // each bucket to guarantee that sync will see everything
-         // that happened before data full sync starts. This also
-         // means we don't have to care about the bucket's current
-         // shard count.
-         yield entries_index->append(
-           fmt::format("{}:{}", key, 0),
-           sync_env->svc->datalog_rados->get_log_shard_id(
-             meta_info.data.get_bucket_info().bucket, 0));
-       }
-       truncated = result.truncated;
-      } while (truncated);
-
-      yield {
-        if (!entries_index->finish()) {
-          failed = true;
-        }
-      }
-      if (!failed) {
-        for (auto iter = sync_status->sync_markers.begin();
-            iter != sync_status->sync_markers.end();
-            ++iter) {
-          int shard_id = (int)iter->first;
-          rgw_data_sync_marker& marker = iter->second;
-          marker.total_entries = entries_index->get_total_entries(shard_id);
-          spawn(new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(
-                 dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                 rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool,
-                             RGWDataSyncStatusManager::shard_obj_name(
-                               sc->source_zone, shard_id)),
-                 marker),
-               true);
-       }
-      } else {
-        yield call(sync_env->error_logger->log_error_cr(
-                    dpp, sc->conn->get_remote_id(), "data.init", "",
-                    EIO, string("failed to build bucket instances map")));
-      }
-      while (collect(&ret, NULL)) {
-       if (ret < 0) {
-          yield call(sync_env->error_logger->log_error_cr(
-                      dpp, sc->conn->get_remote_id(), "data.init", "",
-                      -ret, string("failed to driver sync status: ") +
-                      cpp_strerror(-ret)));
-         req_ret = ret;
-       }
-       yield;
-      }
-      drain_all();
-      if (req_ret < 0) {
-        yield return set_cr_error(req_ret);
-      }
-       yield return set_cr_done();
-    }
-    return 0;
-  }
-};
-
-#define DATA_SYNC_UPDATE_MARKER_WINDOW 1
-
-class RGWDataSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  string marker_oid;
-  rgw_data_sync_marker sync_marker;
-  RGWSyncTraceNodeRef tn;
-
-public:
-  RGWDataSyncShardMarkerTrack(RGWDataSyncCtx *_sc,
-                         const string& _marker_oid,
-                         const rgw_data_sync_marker& _marker,
-                         RGWSyncTraceNodeRef& _tn) : RGWSyncShardMarkerTrack(DATA_SYNC_UPDATE_MARKER_WINDOW),
-                                                                sc(_sc), sync_env(_sc->env),
-                                                                marker_oid(_marker_oid),
-                                                                sync_marker(_marker),
-                                                                tn(_tn) {}
-
-  RGWCoroutine* store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
-    sync_marker.marker = new_marker;
-    sync_marker.pos = index_pos;
-    sync_marker.timestamp = timestamp;
-
-    tn->log(20, SSTR("updating marker marker_oid=" << marker_oid << " marker=" << new_marker));
-
-    return new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(sync_env->dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                                                           rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, marker_oid),
-                                                           sync_marker);
-  }
-
-  RGWOrderCallCR *allocate_order_control_cr() override {
-    return new RGWLastCallerWinsCR(sync_env->cct);
-  }
-};
-
-// ostream wrappers to print buckets without copying strings
-struct bucket_str {
-  const rgw_bucket& b;
-  explicit bucket_str(const rgw_bucket& b) : b(b) {}
-};
-std::ostream& operator<<(std::ostream& out, const bucket_str& rhs) {
-  auto& b = rhs.b;
-  if (!b.tenant.empty()) {
-    out << b.tenant << '/';
-  }
-  out << b.name;
-  if (!b.bucket_id.empty()) {
-    out << ':' << b.bucket_id;
-  }
-  return out;
-}
-
-struct bucket_str_noinstance {
-  const rgw_bucket& b;
-  explicit bucket_str_noinstance(const rgw_bucket& b) : b(b) {}
-};
-std::ostream& operator<<(std::ostream& out, const bucket_str_noinstance& rhs) {
-  auto& b = rhs.b;
-  if (!b.tenant.empty()) {
-    out << b.tenant << '/';
-  }
-  out << b.name;
-  return out;
-}
-
-struct bucket_shard_str {
-  const rgw_bucket_shard& bs;
-  explicit bucket_shard_str(const rgw_bucket_shard& bs) : bs(bs) {}
-};
-std::ostream& operator<<(std::ostream& out, const bucket_shard_str& rhs) {
-  auto& bs = rhs.bs;
-  out << bucket_str{bs.bucket};
-  if (bs.shard_id >= 0) {
-    out << ':' << bs.shard_id;
-  }
-  return out;
-}
-
-struct all_bucket_info {
-  RGWBucketInfo bucket_info;
-  map<string, bufferlist> attrs;
-};
-
-struct rgw_sync_pipe_info_entity
-{
-private:
-  RGWBucketInfo bucket_info;
-  map<string, bufferlist> bucket_attrs;
-  bool _has_bucket_info{false};
-
-public:
-  rgw_zone_id zone;
-
-  rgw_sync_pipe_info_entity() {}
-  rgw_sync_pipe_info_entity(const rgw_sync_bucket_entity& e,
-                            std::optional<all_bucket_info>& binfo) {
-    if (e.zone) {
-      zone = *e.zone;
-    }
-    if (!e.bucket) {
-      return;
-    }
-    if (!binfo ||
-        binfo->bucket_info.bucket != *e.bucket) {
-      bucket_info.bucket = *e.bucket;
-    } else {
-      set_bucket_info(*binfo);
-    }
-  }
-
-  void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
-    if (_has_bucket_info) {
-      return;
-    }
-    if (bucket_info.bucket.name.empty()) {
-      return;
-    }
-
-    auto iter = buckets_info.find(bucket_info.bucket);
-    if (iter == buckets_info.end()) {
-      return;
-    }
-
-    set_bucket_info(iter->second);
-  }
-
-  bool has_bucket_info() const {
-    return _has_bucket_info;
-  }
-
-  void set_bucket_info(const all_bucket_info& all_info) {
-    bucket_info = all_info.bucket_info;
-    bucket_attrs = all_info.attrs;
-    _has_bucket_info = true;
-  }
-
-  const RGWBucketInfo& get_bucket_info() const {
-    return bucket_info;
-  }
-
-  const rgw_bucket& get_bucket() const {
-    return bucket_info.bucket;
-  }
-
-  bool operator<(const rgw_sync_pipe_info_entity& e) const {
-    if (zone < e.zone) {
-      return false;
-    }
-    if (zone > e.zone) {
-      return true;
-    }
-    return (bucket_info.bucket < e.bucket_info.bucket);
-  }
-};
-
-std::ostream& operator<<(std::ostream& out, const rgw_sync_pipe_info_entity& e) {
-  auto& bucket = e.get_bucket_info().bucket;
-
-  out << e.zone << ":" << bucket.get_key();
-  return out;
-}
-
-struct rgw_sync_pipe_handler_info {
-  RGWBucketSyncFlowManager::pipe_handler handler;
-  rgw_sync_pipe_info_entity source;
-  rgw_sync_pipe_info_entity target;
-
-  rgw_sync_pipe_handler_info() {}
-  rgw_sync_pipe_handler_info(const RGWBucketSyncFlowManager::pipe_handler& _handler,
-                     std::optional<all_bucket_info> source_bucket_info,
-                     std::optional<all_bucket_info> target_bucket_info) : handler(_handler),
-                                                                          source(handler.source, source_bucket_info),
-                                                                          target(handler.dest, target_bucket_info) {
-  }
-
-  bool operator<(const rgw_sync_pipe_handler_info& p) const {
-    if (source < p.source) {
-      return true;
-    }
-    if (p.source < source) {
-      return false;
-    }
-    return (target < p.target);
-  }
-
-  void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
-    source.update_empty_bucket_info(buckets_info);
-    target.update_empty_bucket_info(buckets_info);
-  }
-};
-
-std::ostream& operator<<(std::ostream& out, const rgw_sync_pipe_handler_info& p) {
-  out << p.source << ">" << p.target;
-  return out;
-}
-
-struct rgw_sync_pipe_info_set {
-  std::set<rgw_sync_pipe_handler_info> handlers;
-
-  using iterator = std::set<rgw_sync_pipe_handler_info>::iterator;
-
-  void clear() {
-    handlers.clear();
-  }
-
-  void insert(const RGWBucketSyncFlowManager::pipe_handler& handler,
-              std::optional<all_bucket_info>& source_bucket_info,
-              std::optional<all_bucket_info>& target_bucket_info) {
-    rgw_sync_pipe_handler_info p(handler, source_bucket_info, target_bucket_info);
-    handlers.insert(p);
-  }
-
-  iterator begin() {
-    return handlers.begin();
-  }
-
-  iterator end() {
-    return handlers.end();
-  }
-
-  size_t size() const {
-    return handlers.size();
-  }
-
-  bool empty() const {
-    return handlers.empty();
-  }
-
-  void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
-    if (buckets_info.empty()) {
-      return;
-    }
-
-    std::set<rgw_sync_pipe_handler_info> p;
-
-    for (auto pipe : handlers) {
-      pipe.update_empty_bucket_info(buckets_info);
-      p.insert(pipe);
-    }
-
-    handlers = std::move(p);
-  }
-};
-
/* Coroutine that runs bucket sync for every source pipe feeding the given
 * bucket shard.  Declaration only; operate() is defined out of line. */
class RGWRunBucketSourcesSyncCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr; // shard lease held while syncing

  rgw_sync_pipe_info_set pipes;           // resolved source->target pipes
  rgw_sync_pipe_info_set::iterator siter; // iteration cursor over pipes

  rgw_bucket_sync_pair_info sync_pair;

  RGWSyncTraceNodeRef tn;
  ceph::real_time* progress; // out-param: overall progress timestamp
  std::vector<ceph::real_time> shard_progress;
  std::vector<ceph::real_time>::iterator cur_shard_progress;

  RGWRESTConn *conn{nullptr};
  rgw_zone_id last_zone; // avoids re-resolving the connection per pipe

  std::optional<uint64_t> gen; // bilog generation to sync (nullopt = discover)
  rgw_bucket_index_marker_info marker_info;
  BucketIndexShardsManager marker_mgr;

public:
  RGWRunBucketSourcesSyncCR(RGWDataSyncCtx *_sc,
                            boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
                            const rgw_bucket_shard& source_bs,
                            const RGWSyncTraceNodeRef& _tn_parent,
                           std::optional<uint64_t> gen,
                            ceph::real_time* progress);

  int operate(const DoutPrefixProvider *dpp) override;
};
-
/* Coroutine that satisfies a single data-sync obligation (one bucket shard
 * at one generation).  A shared per-bucket-shard Handle serializes work:
 * when another instance is already syncing this shard, the newer/older
 * obligation is cancelled and only marked complete; otherwise this
 * instance loops until the latest obligation recorded on the handle has
 * been satisfied.  On completion it updates the error repo and the shard
 * marker tracker. */
class RGWDataSyncSingleEntryCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw::bucket_sync::Handle state; // cached bucket-shard state
  rgw_data_sync_obligation obligation; // input obligation
  std::optional<rgw_data_sync_obligation> complete; // obligation to complete
  uint32_t obligation_counter = 0; // snapshot of state->counter to detect newer obligations
  RGWDataSyncShardMarkerTrack *marker_tracker; // may be null (no marker to update)
  rgw_raw_obj error_repo;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
  RGWSyncTraceNodeRef tn;

  ceph::real_time progress;
  int sync_status = 0;
public:
  RGWDataSyncSingleEntryCR(RGWDataSyncCtx *_sc, rgw::bucket_sync::Handle state,
                           rgw_data_sync_obligation _obligation,
                           RGWDataSyncShardMarkerTrack *_marker_tracker,
                           const rgw_raw_obj& error_repo,
                           boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
                           const RGWSyncTraceNodeRef& _tn_parent)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      state(std::move(state)), obligation(std::move(_obligation)),
      marker_tracker(_marker_tracker), error_repo(error_repo),
      lease_cr(std::move(lease_cr)) {
    set_description() << "data sync single entry (source_zone=" << sc->source_zone << ") " << obligation;
    tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", to_string(obligation.bs, obligation.gen));
  }

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      if (state->obligation) {
        // this is already syncing in another DataSyncSingleEntryCR
        if (state->obligation->timestamp < obligation.timestamp) {
          // cancel existing obligation and overwrite it
          tn->log(10, SSTR("canceling existing obligation " << *state->obligation));
          complete = std::move(*state->obligation);
          *state->obligation = std::move(obligation);
          state->counter++;
        } else {
          // cancel new obligation
          tn->log(10, SSTR("canceling new obligation " << obligation));
          complete = std::move(obligation);
        }
      } else {
        // start syncing a new obligation
        state->obligation = obligation;
        obligation_counter = state->counter;
        state->counter++;

        // loop until the latest obligation is satisfied, because other callers
        // may update the obligation while we're syncing
        while ((state->obligation->timestamp == ceph::real_time() ||
                state->progress_timestamp < state->obligation->timestamp) &&
               obligation_counter != state->counter) {
          obligation_counter = state->counter;
          progress = ceph::real_time{};

          ldout(cct, 4) << "starting sync on " << bucket_shard_str{state->key.first}
              << ' ' << *state->obligation << " progress timestamp " << state->progress_timestamp
              << " progress " << progress << dendl;
          yield call(new RGWRunBucketSourcesSyncCR(sc, lease_cr,
                                                   state->key.first, tn,
                                                   state->obligation->gen,
                                                  &progress));
          if (retcode < 0) {
            break;
          }
          state->progress_timestamp = std::max(progress, state->progress_timestamp);
        }
        // any new obligations will process themselves
        complete = std::move(*state->obligation);
        state->obligation.reset();

        tn->log(10, SSTR("sync finished on " << bucket_shard_str{state->key.first}
                         << " progress=" << progress << ' ' << complete << " r=" << retcode));
      }
      sync_status = retcode;

      if (sync_status == -ENOENT) {
        // this was added when 'tenant/' was added to datalog entries, because
        // preexisting tenant buckets could never sync and would stay in the
        // error_repo forever
        tn->log(0, SSTR("WARNING: skipping data log entry for missing bucket " << complete->bs));
        sync_status = 0;
      }

      if (sync_status < 0) {
        // write actual sync failures for 'radosgw-admin sync error list'
        if (sync_status != -EBUSY && sync_status != -EAGAIN) {
          yield call(sync_env->error_logger->log_error_cr(dpp, sc->conn->get_remote_id(), "data",
                                                          to_string(complete->bs, complete->gen),
                                                          -sync_status, string("failed to sync bucket instance: ") + cpp_strerror(-sync_status)));
          if (retcode < 0) {
            tn->log(0, SSTR("ERROR: failed to log sync failure: retcode=" << retcode));
          }
        }
        // record the failed entry in the error repo so it is retried later
        if (complete->timestamp != ceph::real_time{}) {
          tn->log(10, SSTR("writing " << *complete << " to error repo for retry"));
          yield call(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
                                              rgw::error_repo::encode_key(complete->bs, complete->gen),
                                              complete->timestamp));
          if (retcode < 0) {
            tn->log(0, SSTR("ERROR: failed to log sync failure in error repo: retcode=" << retcode));
          }
        }
      } else if (complete->retry) {
        // this obligation came from the error repo and succeeded: remove it
        yield call(rgw::error_repo::remove_cr(sync_env->driver->svc()->rados, error_repo,
                                              rgw::error_repo::encode_key(complete->bs, complete->gen),
                                              complete->timestamp));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to remove omap key from error repo ("
             << error_repo << " retcode=" << retcode));
        }
      }
      /* FIXME: what to do in case of error */
      if (marker_tracker && !complete->marker.empty()) {
        /* update marker */
        yield call(marker_tracker->finish(complete->marker));
      }
      if (sync_status == 0) {
        sync_status = retcode;
      }
      if (sync_status < 0) {
        return set_cr_error(sync_status);
      }
      return set_cr_done();
    }
    return 0;
  }
};
-
-rgw_raw_obj datalog_oid_for_error_repo(RGWDataSyncCtx *sc, rgw::sal::RadosStore* driver,
-                                      rgw_pool& pool, rgw_bucket_shard& bs) {
-  int datalog_shard = driver->svc()->datalog_rados->choose_oid(bs);
-  string oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, datalog_shard);
-  return rgw_raw_obj(pool, oid + ".retry");
-  }
-
/* Coroutine that expands a generation-less ("full sync") error-repo entry
 * for a bucket into per-shard, per-generation retry entries: it reads the
 * remote bucket index log layout and writes one error-repo key for every
 * shard of every generation, then removes the original coarse entry. */
class RGWDataIncrementalSyncFullObligationCR: public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw_bucket_shard source_bs;
  rgw_raw_obj error_repo;
  std::string error_marker; // key of the original error-repo entry to remove
  ceph::real_time timestamp;
  RGWSyncTraceNodeRef tn;
  rgw_bucket_index_marker_info remote_info; // index-log layout fetched from the peer
  rgw_pool pool;
  uint32_t sid; // shard id loop variable (member: must survive yields)
  rgw_bucket_shard bs;
  std::vector<store_gen_shards>::const_iterator each;

public:
  RGWDataIncrementalSyncFullObligationCR(RGWDataSyncCtx *_sc, rgw_bucket_shard& _source_bs,
                                         const rgw_raw_obj& error_repo, const std::string& _error_marker,
                                         ceph::real_time& _timestamp, RGWSyncTraceNodeRef& _tn)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), source_bs(_source_bs),
      error_repo(error_repo), error_marker(_error_marker), timestamp(_timestamp),
      tn(sync_env->sync_tracer->add_node(_tn, "error_repo", SSTR(bucket_shard_str(source_bs))))
  {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, source_bs.bucket, &remote_info));
      if (retcode < 0) {
        return set_cr_error(retcode);
      }

      each = remote_info.generations.cbegin();
      for (; each != remote_info.generations.cend(); each++) {
        for (sid = 0; sid < each->num_shards; sid++) {
          bs.bucket = source_bs.bucket;
          bs.shard_id = sid;
          // NOTE(review): the error repo is chosen from source_bs, not the
          // per-shard `bs` built just above — confirm this is intentional
          error_repo = datalog_oid_for_error_repo(sc, sync_env->driver, pool, source_bs);
          tn->log(10, SSTR("writing shard_id " << sid << " of gen " << each->gen << " to error repo for retry"));
          // bounded-concurrency spawn; remember only the last failure
          yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
                            rgw::error_repo::encode_key(bs, each->gen),
                            timestamp), cct->_conf->rgw_data_sync_spawn_window,
                            [&](uint64_t stack_id, int ret) {
                              if (ret < 0) {
                                retcode = ret;
                              }
                              return 0;
                            });
        }
      }
      drain_all_cb([&](uint64_t stack_id, int ret) {
                   if (ret < 0) {
                     tn->log(10, SSTR("writing to error repo returned error: " << ret));
                   }
                   return ret;
                 });

      // once everything succeeds, remove the full sync obligation from the error repo
      yield call(rgw::error_repo::remove_cr(sync_env->driver->svc()->rados, error_repo,
                                            error_marker, timestamp));
      return set_cr_done();
    }
    return 0;
  }
};
-
-RGWCoroutine* data_sync_single_entry(RGWDataSyncCtx *sc, const rgw_bucket_shard& src,
-                                std::optional<uint64_t> gen,
-                                const std::string marker,
-                                ceph::real_time timestamp,
-                                boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
-                                boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache,
-                                RGWDataSyncShardMarkerTrack* marker_tracker,
-                                rgw_raw_obj error_repo,
-                                RGWSyncTraceNodeRef& tn,
-                                bool retry) {
-  auto state = bucket_shard_cache->get(src, gen);
-  auto obligation = rgw_data_sync_obligation{src, gen, marker, timestamp, retry};
-  return new RGWDataSyncSingleEntryCR(sc, std::move(state), std::move(obligation),
-                                      &*marker_tracker, error_repo,
-                                      lease_cr.get(), tn);
-}
-
-static ceph::real_time timestamp_for_bucket_shard(rgw::sal::RadosStore* driver,
-                                                const rgw_data_sync_status& sync_status,
-                                                const rgw_bucket_shard& bs) {
-  int datalog_shard = driver->svc()->datalog_rados->choose_oid(bs);
-  auto status = sync_status.sync_markers.find(datalog_shard);
-  if (status == sync_status.sync_markers.end()) {
-    return ceph::real_clock::zero();
-  }
-  return status->second.timestamp;
-}
-
/* Coroutine that full-syncs one bucket (all shards of all bilog
 * generations) for a single full-sync index entry.  The first shard of the
 * oldest generation is synced synchronously; the remaining shards are
 * spawned with bounded concurrency.  Failures are written to the error
 * repo for later retry, and the full-sync marker is finished either way. */
class RGWDataFullSyncSingleEntryCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw_pool pool;
  rgw_bucket_shard source_bs;
  const std::string key; // full-sync index key being processed
  rgw_data_sync_status sync_status;
  rgw_raw_obj error_repo;
  ceph::real_time timestamp;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
  boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache;
  RGWDataSyncShardMarkerTrack* marker_tracker;
  RGWSyncTraceNodeRef tn;
  rgw_bucket_index_marker_info remote_info; // remote bilog layout
  uint32_t sid; // shard loop variable (member: must survive yields)
  std::vector<store_gen_shards>::iterator each;
  uint64_t i{0}; // NOTE(review): appears unused in this view — confirm before removing
  RGWCoroutine* shard_cr = nullptr;
  bool first_shard = true; // first shard is awaited, the rest are spawned
  bool error_inject; // test hook: rgw_sync_data_full_inject_err_probability

public:
  RGWDataFullSyncSingleEntryCR(RGWDataSyncCtx *_sc, const rgw_pool& _pool, const rgw_bucket_shard& _source_bs,
                      const std::string& _key, const rgw_data_sync_status& sync_status, const rgw_raw_obj& _error_repo,
                      ceph::real_time _timestamp, boost::intrusive_ptr<const RGWContinuousLeaseCR> _lease_cr,
                      boost::intrusive_ptr<rgw::bucket_sync::Cache> _bucket_shard_cache,
                      RGWDataSyncShardMarkerTrack* _marker_tracker,
                      RGWSyncTraceNodeRef& _tn)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), pool(_pool), source_bs(_source_bs), key(_key),
      error_repo(_error_repo), timestamp(_timestamp), lease_cr(std::move(_lease_cr)),
      bucket_shard_cache(_bucket_shard_cache), marker_tracker(_marker_tracker), tn(_tn) {
        error_inject = (sync_env->cct->_conf->rgw_sync_data_full_inject_err_probability > 0);
      }


  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      if (error_inject &&
          rand() % 10000 < cct->_conf->rgw_sync_data_full_inject_err_probability * 10000.0) {
        tn->log(0, SSTR("injecting read bilog info error on key=" << key));
        retcode = -ENOENT;
      } else {
        tn->log(0, SSTR("read bilog info key=" << key));
        yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, source_bs.bucket, &remote_info));
      }

      if (retcode < 0) {
        tn->log(10, SSTR("full sync: failed to read remote bucket info. Writing "
                        << source_bs.shard_id << " to error repo for retry"));
        yield call(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
                                            rgw::error_repo::encode_key(source_bs, std::nullopt),
                                            timestamp));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to log " << source_bs.shard_id << " in error repo: retcode=" << retcode));
        }
        yield call(marker_tracker->finish(key));
        return set_cr_error(retcode);
      }

      //wait to sync the first shard of the oldest generation and then sync all other shards.
      //if any of the operations fail at any time, write them into error repo for later retry.

      each = remote_info.generations.begin();
      for (; each != remote_info.generations.end(); each++) {
        for (sid = 0; sid < each->num_shards; sid++) {
          source_bs.shard_id = sid;
          // use the error repo and sync status timestamp from the datalog shard corresponding to source_bs
          error_repo = datalog_oid_for_error_repo(sc, sync_env->driver, pool, source_bs);
          timestamp = timestamp_for_bucket_shard(sync_env->driver, sync_status, source_bs);
          if (retcode < 0) {
            // a previous shard failed: record this one for retry instead of syncing
            tn->log(10, SSTR("Write " << source_bs.shard_id << " to error repo for retry"));
            yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
                rgw::error_repo::encode_key(source_bs, each->gen),
                timestamp), cct->_conf->rgw_data_sync_spawn_window, std::nullopt);
          } else {
          shard_cr = data_sync_single_entry(sc, source_bs, each->gen, key, timestamp,
                      lease_cr, bucket_shard_cache, nullptr, error_repo, tn, false);
          tn->log(10, SSTR("full sync: syncing shard_id " << sid << " of gen " << each->gen));
          if (first_shard) {
            yield call(shard_cr);
            first_shard = false;
          } else {
            yield_spawn_window(shard_cr, cct->_conf->rgw_data_sync_spawn_window,
                              [&](uint64_t stack_id, int ret) {
                                if (ret < 0) {
                                  retcode = ret;
                                }
                                return retcode;
                                });
            }
          }
        }
        drain_all_cb([&](uint64_t stack_id, int ret) {
                if (ret < 0) {
                  retcode = ret;
                }
                return retcode;
              });
      }

      // acknowledge the full-sync index entry regardless of shard outcomes
      yield call(marker_tracker->finish(key));

      return set_cr_done();
    }
    return 0;
  }
};
-
/* Common state and helpers shared by the full-sync and incremental-sync
 * per-shard coroutines (RGWDataFullSyncShardCR / RGWDataIncSyncShardCR). */
class RGWDataBaseSyncShardCR : public RGWCoroutine {
protected:
  RGWDataSyncCtx *const sc;
  const rgw_pool& pool;
  const uint32_t shard_id;
  rgw_data_sync_marker& sync_marker; // shared marker, mutated as we progress
  RGWSyncTraceNodeRef tn;
  const string& status_oid;          // per-shard sync status object name
  const rgw_raw_obj& error_repo;     // per-shard ".retry" omap object
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
  const rgw_data_sync_status& sync_status;
  boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache;

  std::optional<RGWDataSyncShardMarkerTrack> marker_tracker;
  RGWRadosGetOmapValsCR::ResultPtr omapvals;
  rgw_bucket_shard source_bs;

  // Parse a datalog entry key ("bucket[:shard]") into a bucket shard.
  // Returns a negative errno on malformed keys.
  int parse_bucket_key(const std::string& key, rgw_bucket_shard& bs) const {
    return rgw_bucket_parse_bucket_key(sc->env->cct, key,
                                       &bs.bucket, &bs.shard_id);
  }

  RGWDataBaseSyncShardCR(
    RGWDataSyncCtx *const _sc, const rgw_pool& pool, const uint32_t shard_id,
    rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
    const string& status_oid, const rgw_raw_obj& error_repo,
    boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
    const rgw_data_sync_status& sync_status,
    const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache)
    : RGWCoroutine(_sc->cct), sc(_sc), pool(pool), shard_id(shard_id),
      sync_marker(sync_marker), tn(tn), status_oid(status_oid),
      error_repo(error_repo), lease_cr(std::move(lease_cr)),
      sync_status(sync_status), bucket_shard_cache(bucket_shard_cache) {}
};
-
/* Full sync of one datalog shard: pages through the full-sync omap index,
 * spawning a RGWDataFullSyncSingleEntryCR per bucket entry with bounded
 * concurrency, then flips the shard marker to IncrementalSync and removes
 * the index object. */
class RGWDataFullSyncShardCR : public RGWDataBaseSyncShardCR {
  static constexpr auto OMAP_GET_MAX_ENTRIES = 100; // omap page size

  string oid; // full-sync index object for this shard
  uint64_t total_entries = 0;
  ceph::real_time entry_timestamp;
  std::map<std::string, bufferlist> entries;
  std::map<std::string, bufferlist>::iterator iter;
  string error_marker;

public:

  RGWDataFullSyncShardCR(
    RGWDataSyncCtx *const sc, const rgw_pool& pool, const uint32_t shard_id,
    rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
    const string& status_oid, const rgw_raw_obj& error_repo,
    boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
    const rgw_data_sync_status& sync_status,
    const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache)
    : RGWDataBaseSyncShardCR(sc, pool, shard_id, sync_marker, tn,
                            status_oid, error_repo, std::move(lease_cr),
                            sync_status, bucket_shard_cache) {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      tn->log(10, "start full sync");
      oid = full_data_sync_index_shard_oid(sc->source_zone, shard_id);
      marker_tracker.emplace(sc, status_oid, sync_marker, tn);
      total_entries = sync_marker.pos;
      entry_timestamp = sync_marker.timestamp; // time when full sync started
      do {
        // the lease must be held for the whole pass; bail out if lost
        if (!lease_cr->is_locked()) {
          drain_all();
          tn->log(1, "lease is lost, abort");
          return set_cr_error(-ECANCELED);
        }
        omapvals = std::make_shared<RGWRadosGetOmapValsCR::Result>();
        yield call(new RGWRadosGetOmapValsCR(sc->env->driver,
                                            rgw_raw_obj(pool, oid),
                                             sync_marker.marker,
                                            OMAP_GET_MAX_ENTRIES, omapvals));
        if (retcode < 0) {
          drain_all();
          return set_cr_error(retcode);
        }
        entries = std::move(omapvals->entries);
        if (entries.size() > 0) {
          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
        }
        tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync"));
        iter = entries.begin();
        for (; iter != entries.end(); ++iter) {
          retcode = parse_bucket_key(iter->first, source_bs);
          if (retcode < 0) {
            // unparseable key: skip it but still advance the high marker
            tn->log(1, SSTR("failed to parse bucket shard: " << iter->first));
            marker_tracker->try_update_high_marker(iter->first, 0,
                                                  entry_timestamp);
            continue;
          }
          tn->log(20, SSTR("full sync: " << iter->first));
          total_entries++;
          if (!marker_tracker->start(iter->first, total_entries,
                                    entry_timestamp)) {
            tn->log(0, SSTR("ERROR: cannot start syncing " << iter->first
                           << ". Duplicate entry?"));
          } else {
            tn->log(10, SSTR("timestamp for " << iter->first << " is :" << entry_timestamp));
            yield_spawn_window(new RGWDataFullSyncSingleEntryCR(
                                sc, pool, source_bs, iter->first, sync_status,
                                error_repo, entry_timestamp, lease_cr,
                                bucket_shard_cache, &*marker_tracker, tn),
                              cct->_conf->rgw_data_sync_spawn_window,
                              std::nullopt);
          }
         sync_marker.marker = iter->first;
        }
      } while (omapvals->more);
      omapvals.reset();

      drain_all();

      tn->unset_flag(RGW_SNS_FLAG_ACTIVE);

      /* update marker to reflect we're done with full sync */
      sync_marker.state = rgw_data_sync_marker::IncrementalSync;
      sync_marker.marker = sync_marker.next_step_marker;
      sync_marker.next_step_marker.clear();
      yield call(new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(
             sc->env->dpp,sc->env->async_rados, sc->env->svc->sysobj,
             rgw_raw_obj(pool, status_oid), sync_marker));
      if (retcode < 0) {
        tn->log(0, SSTR("ERROR: failed to set sync marker: retcode=" << retcode));
        return set_cr_error(retcode);
      }

      // clean up full sync index, ignoring errors
      yield call(new RGWRadosRemoveCR(sc->env->driver, {pool, oid}));

      // transition to incremental sync
      return set_cr_done();
    }
    return 0;
  }
};
-
/* Incremental sync of one datalog shard.  Each pass of the endless loop:
 * (1) drains async-modified shard notifications, (2) periodically retries
 * entries from the error repo (with a backoff window), (3) reads the next
 * page of the remote datalog shard and spawns a sync per entry, then
 * (4) advances the shard marker and idles when the log is exhausted. */
class RGWDataIncSyncShardCR : public RGWDataBaseSyncShardCR {
  static constexpr int max_error_entries = 10;       // error-repo page size
  static constexpr uint32_t retry_backoff_secs = 60; // error retry interval

  ceph::mutex& inc_lock; // guards modified_shards (written by notify thread)
  bc::flat_set<rgw_data_notify_entry>& modified_shards;

  bc::flat_set<rgw_data_notify_entry> current_modified; // snapshot taken under inc_lock
  decltype(current_modified)::iterator modified_iter;

  ceph::coarse_real_time error_retry_time; // next time to scan the error repo
  string error_marker;
  std::map<std::string, bufferlist> error_entries;
  decltype(error_entries)::iterator iter;
  ceph::real_time entry_timestamp;
  std::optional<uint64_t> gen;

  string next_marker;
  vector<rgw_data_change_log_entry> log_entries;
  decltype(log_entries)::iterator log_iter;
  bool truncated = false;

  // Time to sleep when the log is exhausted: the poll interval, capped so
  // we wake up in time for the next error-repo retry.
  utime_t get_idle_interval() const {
    ceph::timespan interval = std::chrono::seconds(cct->_conf->rgw_data_sync_poll_interval);
    if (!ceph::coarse_real_clock::is_zero(error_retry_time)) {
      auto now = ceph::coarse_real_clock::now();
      if (error_retry_time > now) {
        auto d = error_retry_time - now;
        if (interval > d) {
          interval = d;
        }
      }
    }
    // convert timespan -> time_point -> utime_t
    return utime_t(ceph::coarse_real_clock::zero() + interval);
  }


public:

  RGWDataIncSyncShardCR(
    RGWDataSyncCtx *const sc, const rgw_pool& pool, const uint32_t shard_id,
    rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
    const string& status_oid, const rgw_raw_obj& error_repo,
    boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
    const rgw_data_sync_status& sync_status,
    const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache,
    ceph::mutex& inc_lock,
    bc::flat_set<rgw_data_notify_entry>& modified_shards)
    : RGWDataBaseSyncShardCR(sc, pool, shard_id, sync_marker, tn,
                            status_oid, error_repo, std::move(lease_cr),
                            sync_status, bucket_shard_cache),
      inc_lock(inc_lock), modified_shards(modified_shards) {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      tn->log(10, "start incremental sync");
      marker_tracker.emplace(sc, status_oid, sync_marker, tn);
      do {
        if (!lease_cr->is_locked()) {
          drain_all();
          tn->log(1, "lease is lost, abort");
          return set_cr_error(-ECANCELED);
        }
       {
         // take a snapshot of the pending notifications under the lock
         current_modified.clear();
         std::unique_lock il(inc_lock);
         current_modified.swap(modified_shards);
         il.unlock();
       }

        if (current_modified.size() > 0) {
          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
        }
        /* process out of band updates */
        for (modified_iter = current_modified.begin();
            modified_iter != current_modified.end();
            ++modified_iter) {
          retcode = parse_bucket_key(modified_iter->key, source_bs);
          if (retcode < 0) {
            tn->log(1, SSTR("failed to parse bucket shard: "
                           << modified_iter->key));
           continue;
          }
          tn->log(20, SSTR("received async update notification: "
                          << modified_iter->key));
          spawn(data_sync_single_entry(sc, source_bs, modified_iter->gen, {},
                                      ceph::real_time{}, lease_cr,
                                      bucket_shard_cache, &*marker_tracker,
                                      error_repo, tn, false), false);
       }

        if (error_retry_time <= ceph::coarse_real_clock::now()) {
          /* process bucket shards that previously failed */
          omapvals = std::make_shared<RGWRadosGetOmapValsCR::Result>();
          yield call(new RGWRadosGetOmapValsCR(sc->env->driver, error_repo,
                                               error_marker, max_error_entries,
                                              omapvals));
          error_entries = std::move(omapvals->entries);
          tn->log(20, SSTR("read error repo, got " << error_entries.size()
                          << " entries"));
          iter = error_entries.begin();
          for (; iter != error_entries.end(); ++iter) {
            error_marker = iter->first;
            entry_timestamp = rgw::error_repo::decode_value(iter->second);
            retcode = rgw::error_repo::decode_key(iter->first, source_bs, gen);
            if (retcode == -EINVAL) {
              // backward compatibility for string keys that don't encode a gen
              retcode = parse_bucket_key(error_marker, source_bs);
            }
            if (retcode < 0) {
              // unparseable entry: drop it from the error repo
              tn->log(1, SSTR("failed to parse bucket shard: " << error_marker));
              spawn(rgw::error_repo::remove_cr(sc->env->driver->svc()->rados,
                                              error_repo, error_marker,
                                              entry_timestamp),
                   false);
              continue;
            }
            tn->log(10, SSTR("gen is " << gen));
            if (!gen) {
              // write all full sync obligations for the bucket to error repo
              spawn(new RGWDataIncrementalSyncFullObligationCR(sc, source_bs,
                     error_repo, error_marker, entry_timestamp, tn), false);
            } else {
              tn->log(20, SSTR("handle error entry key="
                              << to_string(source_bs, gen)
                              << " timestamp=" << entry_timestamp));
              spawn(data_sync_single_entry(sc, source_bs, gen, "",
                                          entry_timestamp, lease_cr,
                                          bucket_shard_cache, &*marker_tracker,
                                          error_repo, tn, true), false);
            }
          }
          if (!omapvals->more) {
            // finished a full scan of the error repo; back off and restart
            error_retry_time = ceph::coarse_real_clock::now() +
             make_timespan(retry_backoff_secs);
            error_marker.clear();
          }
        }
        omapvals.reset();

        // read the next page of the remote datalog for this shard
        tn->log(20, SSTR("shard_id=" << shard_id << " sync_marker="
                        << sync_marker.marker));
        yield call(new RGWReadRemoteDataLogShardCR(sc, shard_id,
                                                  sync_marker.marker,
                                                   &next_marker, &log_entries,
                                                  &truncated));
        if (retcode < 0 && retcode != -ENOENT) {
          tn->log(0, SSTR("ERROR: failed to read remote data log info: ret="
                         << retcode));
          drain_all();
          return set_cr_error(retcode);
        }

        if (log_entries.size() > 0) {
          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
        }

        for (log_iter = log_entries.begin();
            log_iter != log_entries.end();
            ++log_iter) {
          tn->log(20, SSTR("shard_id=" << shard_id << " log_entry: "
                          << log_iter->log_id << ":" << log_iter->log_timestamp
                          << ":" << log_iter->entry.key));
          retcode = parse_bucket_key(log_iter->entry.key, source_bs);
          if (retcode < 0) {
            tn->log(1, SSTR("failed to parse bucket shard: "
                           << log_iter->entry.key));
            marker_tracker->try_update_high_marker(log_iter->log_id, 0,
                                                  log_iter->log_timestamp);
            continue;
          }
          if (!marker_tracker->start(log_iter->log_id, 0,
                                    log_iter->log_timestamp)) {
            tn->log(0, SSTR("ERROR: cannot start syncing " << log_iter->log_id
                           << ". Duplicate entry?"));
          } else {
            tn->log(1, SSTR("incremental sync on " << log_iter->entry.key
                           << "shard: " << shard_id << "on gen "
                           << log_iter->entry.gen));
            yield_spawn_window(
             data_sync_single_entry(sc, source_bs,log_iter->entry.gen,
                                    log_iter->log_id, log_iter->log_timestamp,
                                    lease_cr,bucket_shard_cache,
                                    &*marker_tracker, error_repo, tn, false),
             cct->_conf->rgw_data_sync_spawn_window, std::nullopt);
          }
        }

        tn->log(20, SSTR("shard_id=" << shard_id <<
                        " sync_marker="<< sync_marker.marker
                        << " next_marker=" << next_marker
                        << " truncated=" << truncated));
        if (!next_marker.empty()) {
          sync_marker.marker = next_marker;
        } else if (!log_entries.empty()) {
          sync_marker.marker = log_entries.back().log_id;
        }
        if (!truncated) {
          // we reached the end, wait a while before checking for more
          tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
         yield wait(get_idle_interval());
       }
      } while (true);
    }
    return 0;
  }
};
-
-// Coroutine that drives sync for a single data-log shard. It takes a
-// continuous lease on the shard status object, then loops forever,
-// dispatching to the full-sync or incremental-sync sub-coroutine according
-// to the persisted sync_marker state.
-class RGWDataSyncShardCR : public RGWCoroutine {
-  RGWDataSyncCtx *const sc;
-  const rgw_pool pool;
-  const uint32_t shard_id;
-  rgw_data_sync_marker& sync_marker;
-  rgw_data_sync_status sync_status;
-  const RGWSyncTraceNodeRef tn;
-  bool *reset_backoff;
-
-  // guards modified_shards against concurrent append_modified_shards()
-  ceph::mutex inc_lock = ceph::make_mutex("RGWDataSyncShardCR::inc_lock");
-  // NOTE(review): inc_cond is declared but not used anywhere in this class
-  // body — confirm whether it can be removed
-  ceph::condition_variable inc_cond;
-
-  RGWDataSyncEnv *const sync_env{ sc->env };
-
-  const string status_oid{ RGWDataSyncStatusManager::shard_obj_name(
-      sc->source_zone, shard_id) };
-  // entries that failed to sync are parked in this rados object for retry
-  const rgw_raw_obj error_repo{ pool, status_oid + ".retry" };
-
-  // target number of entries to cache before recycling idle ones
-  static constexpr size_t target_cache_size = 256;
-  boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache {
-    rgw::bucket_sync::Cache::create(target_cache_size) };
-
-  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
-  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
-
-  // shard-change notifications received while running; consumed by the
-  // incremental sync sub-coroutine
-  bc::flat_set<rgw_data_notify_entry> modified_shards;
-
-public:
-  RGWDataSyncShardCR(RGWDataSyncCtx* const _sc, const rgw_pool& pool,
-                     const uint32_t shard_id, rgw_data_sync_marker& marker,
-                     const rgw_data_sync_status& sync_status,
-                     RGWSyncTraceNodeRef& tn, bool *reset_backoff)
-    : RGWCoroutine(_sc->cct), sc(_sc), pool(pool), shard_id(shard_id),
-      sync_marker(marker), sync_status(sync_status), tn(tn),
-      reset_backoff(reset_backoff) {
-    set_description() << "data sync shard source_zone=" << sc->source_zone
-                     << " shard_id=" << shard_id;
-  }
-
-  ~RGWDataSyncShardCR() override {
-    // abort (rather than gracefully release) any lease still held
-    if (lease_cr) {
-      lease_cr->abort();
-    }
-  }
-
-  // Called from outside the coroutine (notification path) to queue shard
-  // entries for incremental sync; thread-safe via inc_lock.
-  void append_modified_shards(bc::flat_set<rgw_data_notify_entry>& entries) {
-    std::lock_guard l{inc_lock};
-    modified_shards.insert(entries.begin(), entries.end());
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      yield init_lease_cr();
-      // spin (sleeping between wakeups) until the lease is acquired; if the
-      // lease coroutine finished without locking, acquisition failed
-      while (!lease_cr->is_locked()) {
-        if (lease_cr->is_done()) {
-          tn->log(5, "failed to take lease");
-          set_status("lease lock failed, early abort");
-          drain_all();
-          return set_cr_error(lease_cr->get_ret_status());
-        }
-        set_sleeping(true);
-        yield;
-      }
-      // tell the backoff controller to reset its delay now that we made progress
-      *reset_backoff = true;
-      tn->log(10, "took lease");
-
-      // dispatch on the persisted marker state; any state other than
-      // FullSync/IncrementalSync is treated as corruption (-EIO)
-      while (true) {
-       if (sync_marker.state == rgw_data_sync_marker::FullSync) {
-         yield call(new RGWDataFullSyncShardCR(sc, pool, shard_id,
-                                               sync_marker, tn,
-                                               status_oid, error_repo,
-                                               lease_cr, sync_status,
-                                               bucket_shard_cache));
-         if (retcode < 0) {
-           if (retcode != -EBUSY) {
-             tn->log(10, SSTR("full sync failed (retcode=" << retcode << ")"));
-           }
-           lease_cr->go_down();
-           drain_all();
-           return set_cr_error(retcode);
-         }
-       } else if (sync_marker.state == rgw_data_sync_marker::IncrementalSync) {
-         yield call(new RGWDataIncSyncShardCR(sc, pool, shard_id,
-                                              sync_marker, tn,
-                                              status_oid, error_repo,
-                                              lease_cr, sync_status,
-                                              bucket_shard_cache,
-                                              inc_lock, modified_shards));
-         if (retcode < 0) {
-           if (retcode != -EBUSY) {
-             tn->log(10, SSTR("incremental sync failed (retcode=" << retcode
-                              << ")"));
-           }
-           lease_cr->go_down();
-           drain_all();
-           return set_cr_error(retcode);
-         }
-       } else {
-         lease_cr->go_down();
-         drain_all();
-         return set_cr_error(-EIO);
-       }
-      }
-    }
-    return 0;
-  }
-
-  // (Re)spawn the continuous-lease coroutine on the shard status object,
-  // aborting any previous instance first.
-  void init_lease_cr() {
-    set_status("acquiring sync lock");
-    uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
-    string lock_name = "sync_lock";
-    if (lease_cr) {
-      lease_cr->abort();
-    }
-    auto driver = sync_env->driver;
-    lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, driver,
-                                            rgw_raw_obj(pool, status_oid),
-                                            lock_name, lock_duration, this));
-    lease_stack.reset(spawn(lease_cr.get(), false));
-  }
-};
-
-class RGWDataSyncShardControlCR : public RGWBackoffControlCR {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-
-  rgw_pool pool;
-
-  uint32_t shard_id;
-  rgw_data_sync_marker sync_marker;
-  rgw_data_sync_status sync_status;
-
-  RGWSyncTraceNodeRef tn;
-public:
-  RGWDataSyncShardControlCR(RGWDataSyncCtx *_sc, const rgw_pool& _pool,
-                    uint32_t _shard_id, rgw_data_sync_marker& _marker, const rgw_data_sync_status& sync_status,
-                     RGWSyncTraceNodeRef& _tn_parent) : RGWBackoffControlCR(_sc->cct, false),
-                                                      sc(_sc), sync_env(_sc->env),
-                                                     pool(_pool),
-                                                     shard_id(_shard_id),
-                                                     sync_marker(_marker) {
-    tn = sync_env->sync_tracer->add_node(_tn_parent, "shard", std::to_string(shard_id));
-  }
-
-  RGWCoroutine *alloc_cr() override {
-    return new RGWDataSyncShardCR(sc, pool, shard_id, sync_marker, sync_status, tn, backoff_ptr());
-  }
-
-  RGWCoroutine *alloc_finisher_cr() override {
-    return new RGWSimpleRadosReadCR<rgw_data_sync_marker>(sync_env->dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                                                          rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id)),
-                                                          &sync_marker);
-  }
-
-  void append_modified_shards(bc::flat_set<rgw_data_notify_entry>& keys) {
-    std::lock_guard l{cr_lock()};
-
-    RGWDataSyncShardCR *cr = static_cast<RGWDataSyncShardCR *>(get_cr());
-    if (!cr) {
-      return;
-    }
-
-    cr->append_modified_shards(keys);
-  }
-};
-
-// Top-level data sync coroutine for one source zone: reads (and if needed
-// initializes) the sync status, runs the sync-module init and full-sync map
-// building phases, then spawns one RGWDataSyncShardControlCR per shard.
-class RGWDataSyncCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  uint32_t num_shards;
-
-  rgw_data_sync_status sync_status;
-
-  // guards shard_crs; wakeup() is called from notification threads
-  ceph::mutex shard_crs_lock =
-    ceph::make_mutex("RGWDataSyncCR::shard_crs_lock");
-  map<int, RGWDataSyncShardControlCR *> shard_crs;
-
-  bool *reset_backoff;
-
-  RGWSyncTraceNodeRef tn;
-
-  RGWDataSyncModule *data_sync_module{nullptr};
-public:
-  RGWDataSyncCR(RGWDataSyncCtx *_sc, uint32_t _num_shards, RGWSyncTraceNodeRef& _tn, bool *_reset_backoff) : RGWCoroutine(_sc->cct),
-                                                      sc(_sc), sync_env(_sc->env),
-                                                      num_shards(_num_shards),
-                                                      reset_backoff(_reset_backoff), tn(_tn) {
-
-  }
-
-  ~RGWDataSyncCR() override {
-    // drop the refs taken in operate() when the shard CRs were spawned
-    for (auto iter : shard_crs) {
-      iter.second->put();
-    }
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-
-      /* read sync status */
-      yield call(new RGWReadDataSyncStatusCoroutine(sc, &sync_status));
-
-      data_sync_module = sync_env->sync_module->get_data_handler();
-
-      // retcode still holds the status of the read above; -ENOENT just means
-      // the status object does not exist yet and is handled by StateInit below
-      if (retcode < 0 && retcode != -ENOENT) {
-        tn->log(0, SSTR("ERROR: failed to fetch sync status, retcode=" << retcode));
-        return set_cr_error(retcode);
-      }
-
-      /* state: init status */
-      if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateInit) {
-        tn->log(20, SSTR("init"));
-        sync_status.sync_info.num_shards = num_shards;
-        uint64_t instance_id;
-        instance_id = ceph::util::generate_random_number<uint64_t>();
-        yield call(new RGWInitDataSyncStatusCoroutine(sc, num_shards, instance_id, tn, &sync_status));
-        if (retcode < 0) {
-          tn->log(0, SSTR("ERROR: failed to init sync, retcode=" << retcode));
-          return set_cr_error(retcode);
-        }
-        // sets state = StateBuildingFullSyncMaps
-
-        *reset_backoff = true;
-      }
-
-      data_sync_module->init(sc, sync_status.sync_info.instance_id);
-
-      if  ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateBuildingFullSyncMaps) {
-        tn->log(10, SSTR("building full sync maps"));
-        /* call sync module init here */
-        sync_status.sync_info.num_shards = num_shards;
-        yield call(data_sync_module->init_sync(dpp, sc));
-        if (retcode < 0) {
-          tn->log(0, SSTR("ERROR: sync module init_sync() failed, retcode=" << retcode));
-          return set_cr_error(retcode);
-        }
-        /* state: building full sync maps */
-        yield call(new RGWListBucketIndexesCR(sc, &sync_status));
-        if (retcode < 0) {
-          tn->log(0, SSTR("ERROR: failed to build full sync maps, retcode=" << retcode));
-          return set_cr_error(retcode);
-        }
-        sync_status.sync_info.state = rgw_data_sync_info::StateSync;
-
-        /* update new state */
-        yield call(set_sync_info_cr());
-        if (retcode < 0) {
-          tn->log(0, SSTR("ERROR: failed to write sync status, retcode=" << retcode));
-          return set_cr_error(retcode);
-        }
-
-        *reset_backoff = true;
-      }
-
-      yield call(data_sync_module->start_sync(dpp, sc));
-      if (retcode < 0) {
-        tn->log(0, SSTR("ERROR: failed to start sync, retcode=" << retcode));
-        return set_cr_error(retcode);
-      }
-      
-      yield {
-        if  ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateSync) {
-          tn->log(10, SSTR("spawning " << num_shards << " shards sync"));
-          for (map<uint32_t, rgw_data_sync_marker>::iterator iter = sync_status.sync_markers.begin();
-               iter != sync_status.sync_markers.end(); ++iter) {
-            RGWDataSyncShardControlCR *cr = new RGWDataSyncShardControlCR(sc, sync_env->svc->zone->get_zone_params().log_pool,
-                                                                          iter->first, iter->second, sync_status, tn);
-            // hold a ref so wakeup()/~RGWDataSyncCR can still reach the CR
-            cr->get();
-            shard_crs_lock.lock();
-            shard_crs[iter->first] = cr;
-            shard_crs_lock.unlock();
-            spawn(cr, true);
-          }
-        }
-      }
-
-      return set_cr_done();
-    }
-    return 0;
-  }
-
-  // Persist sync_status.sync_info to the sync status rados object.
-  RGWCoroutine *set_sync_info_cr() {
-    return new RGWSimpleRadosWriteCR<rgw_data_sync_info>(sync_env->dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                                                         rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone)),
-                                                         sync_status.sync_info);
-  }
-
-  // Route a change notification to the matching shard control CR and wake it.
-  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
-    std::lock_guard l{shard_crs_lock};
-    map<int, RGWDataSyncShardControlCR *>::iterator iter = shard_crs.find(shard_id);
-    if (iter == shard_crs.end()) {
-      return;
-    }
-    iter->second->append_modified_shards(entries);
-    iter->second->wakeup();
-  }
-};
-
-// Default data-sync handler: declares the per-object operations (fetch,
-// remove, delete-marker) used when no special sync module is configured.
-// Implementations are defined later in this file.
-class RGWDefaultDataSyncModule : public RGWDataSyncModule {
-public:
-  RGWDefaultDataSyncModule() {}
-
-  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace) override;
-  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
-  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
-                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
-};
-
-// Sync-module instance wrapping the default data handler; permits direct
-// user writes on the zone (supports_user_writes() == true).
-class RGWDefaultSyncModuleInstance : public RGWSyncModuleInstance {
-  RGWDefaultDataSyncModule data_handler;
-public:
-  RGWDefaultSyncModuleInstance() {}
-  RGWDataSyncModule *get_data_handler() override {
-    return &data_handler;
-  }
-  bool supports_user_writes() override {
-    return true;
-  }
-};
-
-// Instantiate the default sync module. The `config` formattable carries no
-// options for the default module and is ignored; always succeeds.
-int RGWDefaultSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance)
-{
-  *instance = RGWSyncModuleInstanceRef(new RGWDefaultSyncModuleInstance());
-  return 0;
-}
-
-// Loads a user's info, identity and ACL asynchronously (via the Init action
-// run on the async_rados thread) and evaluates bucket/object permissions for
-// that user through the nested Bucket helper.
-class RGWUserPermHandler {
-  friend struct Init;
-  friend class Bucket;
-
-  RGWDataSyncEnv *sync_env;
-  rgw_user uid;
-
-  // state shared (by shared_ptr) between the handler, the async Init action
-  // and any Bucket helpers created from it
-  struct _info {
-    RGWUserInfo user_info;
-    rgw::IAM::Environment env;
-    std::unique_ptr<rgw::auth::Identity> identity;
-    RGWAccessControlPolicy user_acl;
-  };
-
-  std::shared_ptr<_info> info;
-
-  struct Init;
-
-  std::shared_ptr<Init> init_action;
-
-  // Async action that populates _info; executed off the coroutine thread by
-  // RGWGenericAsyncCR (see init_cr()).
-  struct Init : public RGWGenericAsyncCR::Action {
-    RGWDataSyncEnv *sync_env;
-
-    rgw_user uid;
-    std::shared_ptr<RGWUserPermHandler::_info> info;
-
-    int ret{0};
-    
-    Init(RGWUserPermHandler *handler) : sync_env(handler->sync_env),
-                                        uid(handler->uid),
-                                        info(handler->info) {}
-    int operate() override {
-      auto user_ctl = sync_env->driver->getRados()->ctl.user;
-
-      ret = user_ctl->get_info_by_uid(sync_env->dpp, uid, &info->user_info, null_yield);
-      if (ret < 0) {
-        return ret;
-      }
-
-      info->identity = rgw::auth::transform_old_authinfo(sync_env->cct,
-                                                         uid,
-                                                         RGW_PERM_FULL_CONTROL,
-                                                         false, /* system_request? */
-                                                         TYPE_RGW);
-
-      map<string, bufferlist> uattrs;
-
-      // missing ACL attr (-ENOENT) falls back to a default ACL below
-      ret = user_ctl->get_attrs_by_uid(sync_env->dpp, uid, &uattrs, null_yield);
-      if (ret == 0) {
-        ret = RGWUserPermHandler::policy_from_attrs(sync_env->cct, uattrs, &info->user_acl);
-      }
-      if (ret == -ENOENT) {
-        info->user_acl.create_default(uid, info->user_info.display_name);
-      }
-
-      // NOTE(review): any attr/ACL error other than -ENOENT (e.g. -EIO from
-      // a corrupt ACL) is silently dropped here and 0 is returned, leaving
-      // user_acl empty — confirm whether that is intentional
-      return 0;
-    }
-  };
-
-public:
-  RGWUserPermHandler(RGWDataSyncEnv *_sync_env,
-                     const rgw_user& _uid) : sync_env(_sync_env),
-                                             uid(_uid) {}
-
-  // Returns a coroutine that runs Init asynchronously; must complete
-  // successfully before init_bucket()/Bucket helpers are used.
-  RGWCoroutine *init_cr() {
-    info = make_shared<_info>();
-    init_action = make_shared<Init>(this);
-
-    return new RGWGenericAsyncCR(sync_env->cct,
-                                 sync_env->async_rados,
-                                 init_action);
-  }
-
-  // Per-bucket permission evaluator bound to this user's identity/ACL.
-  class Bucket {
-    RGWDataSyncEnv *sync_env;
-    std::shared_ptr<_info> info;
-    RGWAccessControlPolicy bucket_acl;
-    std::optional<perm_state> ps;
-  public:
-    Bucket() {}
-
-    int init(RGWUserPermHandler *handler,
-             const RGWBucketInfo& bucket_info,
-             const map<string, bufferlist>& bucket_attrs);
-
-    bool verify_bucket_permission(int perm);
-    bool verify_object_permission(const map<string, bufferlist>& obj_attrs,
-                                  int perm);
-  };
-
-  // Decode an RGWAccessControlPolicy out of an attrs map.
-  // Returns -ENOENT if no ACL attr is present, -EIO on decode failure.
-  static int policy_from_attrs(CephContext *cct,
-                               const map<string, bufferlist>& attrs,
-                               RGWAccessControlPolicy *acl) {
-    acl->set_ctx(cct);
-
-    auto aiter = attrs.find(RGW_ATTR_ACL);
-    if (aiter == attrs.end()) {
-      return -ENOENT;
-    }
-    auto iter = aiter->second.begin();
-    try {
-      acl->decode(iter);
-    } catch (buffer::error& err) {
-      ldout(cct, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
-      return -EIO;
-    }
-
-    return 0;
-  }
-
-  // Convenience wrapper: initialize a Bucket evaluator against this handler.
-  int init_bucket(const RGWBucketInfo& bucket_info,
-                  const map<string, bufferlist>& bucket_attrs,
-                  Bucket *bs) {
-    return bs->init(this, bucket_info, bucket_attrs);
-  }
-};
-
-int RGWUserPermHandler::Bucket::init(RGWUserPermHandler *handler,
-                                     const RGWBucketInfo& bucket_info,
-                                     const map<string, bufferlist>& bucket_attrs)
-{
-  sync_env = handler->sync_env;
-  info = handler->info;
-
-  int r = RGWUserPermHandler::policy_from_attrs(sync_env->cct, bucket_attrs, &bucket_acl);
-  if (r < 0) {
-    return r;
-  }
-
-  ps.emplace(sync_env->cct,
-             info->env,
-             info->identity.get(),
-             bucket_info,
-             info->identity->get_perm_mask(),
-             false, /* defer to bucket acls */
-             nullptr, /* referer */
-             false); /* request_payer */
-
-  return 0;
-}
-
-bool RGWUserPermHandler::Bucket::verify_bucket_permission(int perm)
-{
-  return verify_bucket_permission_no_policy(sync_env->dpp,
-                                            &(*ps),
-                                            &info->user_acl,
-                                            &bucket_acl,
-                                            perm);
-}
-
-bool RGWUserPermHandler::Bucket::verify_object_permission(const map<string, bufferlist>& obj_attrs,
-                                                          int perm)
-{
-  RGWAccessControlPolicy obj_acl;
-
-  int r = policy_from_attrs(sync_env->cct, obj_attrs, &obj_acl);
-  if (r < 0) {
-    return r;
-  }
-
-  return verify_bucket_permission_no_policy(sync_env->dpp,
-                                            &(*ps),
-                                            &bucket_acl,
-                                            &obj_acl,
-                                            perm);
-}
-
-// Fetch filter applied while pulling an object from the source zone:
-// enforces the sync-pipe params (tag-based rule matching, user-mode
-// permission checks, ACL-translation owner override, storage class) before
-// delegating to the default filter. Sets *need_retry when the pipe params
-// raced with a concurrent change.
-class RGWFetchObjFilter_Sync : public RGWFetchObjFilter_Default {
-  rgw_bucket_sync_pipe sync_pipe;
-
-  std::shared_ptr<RGWUserPermHandler::Bucket> bucket_perms;
-  // the dest params the caller computed earlier; filter() re-checks them
-  // against the current pipe config to detect races
-  std::optional<rgw_sync_pipe_dest_params> verify_dest_params;
-
-  // NOTE(review): mtime/etag/obj_size/identity appear unused in the visible
-  // class body — confirm against the full file
-  std::optional<ceph::real_time> mtime;
-  std::optional<string> etag;
-  std::optional<uint64_t> obj_size;
-
-  std::unique_ptr<rgw::auth::Identity> identity;
-
-  std::shared_ptr<bool> need_retry;
-
-public:
-  RGWFetchObjFilter_Sync(rgw_bucket_sync_pipe& _sync_pipe,
-                         std::shared_ptr<RGWUserPermHandler::Bucket>& _bucket_perms,
-                         std::optional<rgw_sync_pipe_dest_params>&& _verify_dest_params,
-                         std::shared_ptr<bool>& _need_retry) : sync_pipe(_sync_pipe),
-                                         bucket_perms(_bucket_perms),
-                                         verify_dest_params(std::move(_verify_dest_params)),
-                                         need_retry(_need_retry) {
-    *need_retry = false;
-  }
-
-  int filter(CephContext *cct,
-             const rgw_obj_key& source_key,
-             const RGWBucketInfo& dest_bucket_info,
-             std::optional<rgw_placement_rule> dest_placement_rule,
-             const map<string, bufferlist>& obj_attrs,
-             std::optional<rgw_user> *poverride_owner,
-             const rgw_placement_rule **prule) override;
-};
-
-// Apply sync-pipe policy to a fetched object. Decodes the object's tags to
-// resolve the matching pipe params, re-verifies them against the params the
-// caller saw (signalling retry on a race), applies ACL translation and
-// user-mode permission checks, and overrides the placement rule when the
-// pipe specifies a storage class.
-int RGWFetchObjFilter_Sync::filter(CephContext *cct,
-                                   const rgw_obj_key& source_key,
-                                   const RGWBucketInfo& dest_bucket_info,
-                                   std::optional<rgw_placement_rule> dest_placement_rule,
-                                   const map<string, bufferlist>& obj_attrs,
-                                   std::optional<rgw_user> *poverride_owner,
-                                   const rgw_placement_rule **prule)
-{
-  int abort_err = -ERR_PRECONDITION_FAILED;
-
-  rgw_sync_pipe_params params;
-
-  RGWObjTags obj_tags;
-
-  // tags participate in pipe-rule matching; a decode failure is logged and
-  // treated as "no tags"
-  auto iter = obj_attrs.find(RGW_ATTR_TAGS);
-  if (iter != obj_attrs.end()) {
-    try {
-      auto it = iter->second.cbegin();
-      obj_tags.decode(it);
-    } catch (buffer::error &err) {
-      ldout(cct, 0) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
-    }
-  }
-
-  if (!sync_pipe.info.handler.find_obj_params(source_key,
-                                              obj_tags.get_tags(),
-                                              &params)) {
-    return abort_err;
-  }
-
-  if (verify_dest_params &&
-      !(*verify_dest_params == params.dest)) {
-    /* raced! original dest params were different, will need to retry */
-    ldout(cct, 0) << "WARNING: " << __func__ << ": pipe dest params are different than original params, must have raced with object rewrite, retrying" << dendl;
-    *need_retry = true;
-    return -ECANCELED;
-  }
-
-  std::optional<std::map<string, bufferlist> > new_attrs;
-
-  // ACL translation: rewrite object ownership to the configured owner; in
-  // user mode the owner must match the destination bucket owner
-  if (params.dest.acl_translation) {
-    rgw_user& acl_translation_owner = params.dest.acl_translation->owner;
-    if (!acl_translation_owner.empty()) {
-      if (params.mode == rgw_sync_pipe_params::MODE_USER &&
-          acl_translation_owner != dest_bucket_info.owner) {
-        ldout(cct, 0) << "ERROR: " << __func__ << ": acl translation was requested, but user (" << acl_translation_owner
-          << ") is not dest bucket owner (" << dest_bucket_info.owner << ")" << dendl;
-        return -EPERM;
-      }
-      *poverride_owner = acl_translation_owner;
-    }
-  }
-  if (params.mode == rgw_sync_pipe_params::MODE_USER) {
-    if (!bucket_perms->verify_object_permission(obj_attrs, RGW_PERM_READ)) {
-      ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to fetch object" << dendl;
-      return -EPERM;
-    }
-  }
-
-  // NOTE(review): dest_rule is not declared in this function or in the
-  // visible class body — presumably a member of this filter or its base;
-  // since *prule escapes to the caller it must outlive this call. Verify.
-  if (!dest_placement_rule &&
-      params.dest.storage_class) {
-    dest_rule.storage_class = *params.dest.storage_class;
-    dest_rule.inherit_from(dest_bucket_info.placement_rule);
-    dest_placement_rule = dest_rule;
-    *prule = &dest_rule;
-  }
-
-  return RGWFetchObjFilter_Default::filter(cct,
-                                           source_key,
-                                           dest_bucket_info,
-                                           dest_placement_rule,
-                                           obj_attrs,
-                                           poverride_owner,
-                                           prule);
-}
-
-class RGWObjFetchCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  rgw_bucket_sync_pipe& sync_pipe;
-  rgw_obj_key& key;
-  std::optional<rgw_obj_key> dest_key;
-  std::optional<uint64_t> versioned_epoch;
-  rgw_zone_set *zones_trace;
-
-  bool need_more_info{false};
-  bool check_change{false};
-
-  ceph::real_time src_mtime;
-  uint64_t src_size;
-  string src_etag;
-  map<string, bufferlist> src_attrs;
-  map<string, string> src_headers;
-
-  std::optional<rgw_user> param_user;
-  rgw_sync_pipe_params::Mode param_mode;
-
-  std::optional<RGWUserPermHandler> user_perms;
-  std::shared_ptr<RGWUserPermHandler::Bucket> source_bucket_perms;
-  RGWUserPermHandler::Bucket dest_bucket_perms;
-
-  std::optional<rgw_sync_pipe_dest_params> dest_params;
-
-  int try_num{0};
-  std::shared_ptr<bool> need_retry;
-public:
-  RGWObjFetchCR(RGWDataSyncCtx *_sc,
-                rgw_bucket_sync_pipe& _sync_pipe,
-                rgw_obj_key& _key,
-                std::optional<rgw_obj_key> _dest_key,
-                std::optional<uint64_t> _versioned_epoch,
-                rgw_zone_set *_zones_trace) : RGWCoroutine(_sc->cct),
-                                              sc(_sc), sync_env(_sc->env),
-                                              sync_pipe(_sync_pipe),
-                                              key(_key),
-                                              dest_key(_dest_key),
-                                              versioned_epoch(_versioned_epoch),
-                                              zones_trace(_zones_trace) {
-  }
-
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-
-#define MAX_RACE_RETRIES_OBJ_FETCH 10
-      for (try_num = 0; try_num < MAX_RACE_RETRIES_OBJ_FETCH; ++try_num) {
-
-        {
-          std::optional<rgw_user> param_acl_translation;
-          std::optional<string> param_storage_class;
-
-          if (!sync_pipe.info.handler.find_basic_info_without_tags(key,
-                                                                   &param_user,
-                                                                   &param_acl_translation,
-                                                                   &param_storage_class,
-                                                                   &param_mode,
-                                                                   &need_more_info)) {
-            if (!need_more_info) {
-              return set_cr_error(-ERR_PRECONDITION_FAILED);
-            }
-          }
-        }
-
-        if (need_more_info) {
-          ldout(cct, 20) << "Could not determine exact policy rule for obj=" << key << ", will read source object attributes" << dendl;
-          /*
-           * we need to fetch info about source object, so that we can determine
-           * the correct policy configuration. This can happen if there are multiple
-           * policy rules, and some depend on the object tagging */
-          yield call(new RGWStatRemoteObjCR(sync_env->async_rados,
-                                            sync_env->driver,
-                                            sc->source_zone,
-                                            sync_pipe.info.source_bs.bucket,
-                                            key,
-                                            &src_mtime,
-                                            &src_size,
-                                            &src_etag,
-                                            &src_attrs,
-                                            &src_headers));
-          if (retcode < 0) {
-            return set_cr_error(retcode);
-          }
-
-          RGWObjTags obj_tags;
-
-          auto iter = src_attrs.find(RGW_ATTR_TAGS);
-          if (iter != src_attrs.end()) {
-            try {
-              auto it = iter->second.cbegin();
-              obj_tags.decode(it);
-            } catch (buffer::error &err) {
-              ldout(cct, 0) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
-            }
-          }
-
-          rgw_sync_pipe_params params;
-          if (!sync_pipe.info.handler.find_obj_params(key,
-                                                      obj_tags.get_tags(),
-                                                      &params)) {
-            return set_cr_error(-ERR_PRECONDITION_FAILED);
-          }
-
-          param_user = params.user;
-          param_mode = params.mode;
-
-          dest_params = params.dest;
-        }
-
-        if (param_mode == rgw_sync_pipe_params::MODE_USER) {
-          if (!param_user) {
-            ldout(cct, 20) << "ERROR: " << __func__ << ": user level sync but user param not set" << dendl;
-            return set_cr_error(-EPERM);
-          }
-          user_perms.emplace(sync_env, *param_user);
-
-          yield call(user_perms->init_cr());
-          if (retcode < 0) {
-            ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init user perms manager for uid=" << *param_user << dendl;
-            return set_cr_error(retcode);
-          }
-
-          /* verify that user is allowed to write at the target bucket */
-          int r = user_perms->init_bucket(sync_pipe.dest_bucket_info,
-                                          sync_pipe.dest_bucket_attrs,
-                                          &dest_bucket_perms);
-          if (r < 0) {
-            ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl;
-            return set_cr_error(retcode);
-          }
-
-          if (!dest_bucket_perms.verify_bucket_permission(RGW_PERM_WRITE)) {
-            ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to write into bucket (bucket=" << sync_pipe.info.dest_bucket.get_key() << ")" << dendl;
-            return -EPERM;
-          }
-
-          /* init source bucket permission structure */
-          source_bucket_perms = make_shared<RGWUserPermHandler::Bucket>();
-          r = user_perms->init_bucket(sync_pipe.source_bucket_info,
-                                      sync_pipe.source_bucket_attrs,
-                                      source_bucket_perms.get());
-          if (r < 0) {
-            ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl;
-            return set_cr_error(retcode);
-          }
-        }
-
-        yield {
-          if (!need_retry) {
-            need_retry = make_shared<bool>();
-          }
-          auto filter = make_shared<RGWFetchObjFilter_Sync>(sync_pipe,
-                                                            source_bucket_perms,
-                                                            std::move(dest_params),
-                                                            need_retry);
-
-          call(new RGWFetchRemoteObjCR(sync_env->async_rados, sync_env->driver, sc->source_zone,
-                                       nullopt,
-                                       sync_pipe.info.source_bs.bucket,
-                                       std::nullopt, sync_pipe.dest_bucket_info,
-                                       key, dest_key, versioned_epoch,
-                                       true,
-                                       std::static_pointer_cast<RGWFetchObjFilter>(filter),
-                                       zones_trace, sync_env->counters, dpp));
-        }
-        if (retcode < 0) {
-          if (*need_retry) {
-            continue;
-          }
-          return set_cr_error(retcode);
-        }
-
-        return set_cr_done();
-      }
-
-      ldout(cct, 0) << "ERROR: " << __func__ << ": Too many retries trying to fetch object, possibly a bug: bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << " key=" << key << dendl;
-
-      return set_cr_error(-EIO);
-    }
-    return 0;
-  }
-};
-
-// Default-module object sync: fetch the object via RGWObjFetchCR. No
-// explicit destination key is supplied, so the object keeps its source key.
-RGWCoroutine *RGWDefaultDataSyncModule::sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace)
-{
-  return new RGWObjFetchCR(sc, sync_pipe, key, std::nullopt,
-                           versioned_epoch, zones_trace);
-}
-
-RGWCoroutine *RGWDefaultDataSyncModule::remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
-                                                      real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
-{
-  auto sync_env = sc->env;
-  return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
-                            sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
-                            NULL, NULL, false, &mtime, zones_trace);
-}
-
-RGWCoroutine *RGWDefaultDataSyncModule::create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
-                                                             rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
-{
-  auto sync_env = sc->env;
-  return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
-                            sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
-                            &owner.id, &owner.display_name, true, &mtime, zones_trace);
-}
-
// Data sync module for the archive zone. Derives from the default module but
// overrides the object-level operations so that synced data is preserved:
// see the implementations below (removals are ignored, versioning is forced
// on destination buckets).
class RGWArchiveDataSyncModule : public RGWDefaultDataSyncModule {
public:
  RGWArchiveDataSyncModule() {}

  // fetch an object, forcing versioning on the destination bucket
  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace) override;
  // removals are logged and ignored (archive keeps everything)
  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
  // delete markers are still replicated, preserving version history
  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
};
-
// Sync module instance for the archive zone: supplies the archive data
// handler and archive-specific bucket metadata handlers.
class RGWArchiveSyncModuleInstance : public RGWDefaultSyncModuleInstance {
  RGWArchiveDataSyncModule data_handler;
public:
  RGWArchiveSyncModuleInstance() {}
  RGWDataSyncModule *get_data_handler() override {
    return &data_handler;
  }
  RGWMetadataHandler *alloc_bucket_meta_handler() override {
    return RGWArchiveBucketMetaHandlerAllocator::alloc();
  }
  RGWBucketInstanceMetadataHandlerBase *alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver) override {
    return RGWArchiveBucketInstanceMetaHandlerAllocator::alloc(driver);
  }
};
-
-int RGWArchiveSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance)
-{
-  instance->reset(new RGWArchiveSyncModuleInstance());
-  return 0;
-}
-
// Archive-zone object sync. Before fetching, make sure the destination
// bucket has versioning enabled (the archive zone keeps every version); if
// the source didn't assign a versioned epoch, force one and generate a
// random version instance for null-versioned objects.
RGWCoroutine *RGWArchiveDataSyncModule::sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace)
{
  auto sync_env = sc->env;
  ldout(sc->cct, 5) << "SYNC_ARCHIVE: sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
  // enable versioning on the destination bucket if it is unversioned or
  // versioning was suspended; the updated bucket info is written back
  // synchronously before the fetch coroutine is created
  if (!sync_pipe.dest_bucket_info.versioned() ||
     (sync_pipe.dest_bucket_info.flags & BUCKET_VERSIONS_SUSPENDED)) {
      ldout(sc->cct, 0) << "SYNC_ARCHIVE: sync_object: enabling object versioning for archive bucket" << dendl;
      sync_pipe.dest_bucket_info.flags = (sync_pipe.dest_bucket_info.flags & ~BUCKET_VERSIONS_SUSPENDED) | BUCKET_VERSIONED;
      int op_ret = sync_env->driver->getRados()->put_bucket_instance_info(sync_pipe.dest_bucket_info, false, real_time(), NULL, sync_env->dpp);
      if (op_ret < 0) {
         ldpp_dout(sync_env->dpp, 0) << "SYNC_ARCHIVE: sync_object: error versioning archive bucket" << dendl;
         return NULL;
      }
  }

  std::optional<rgw_obj_key> dest_key;

  if (versioned_epoch.value_or(0) == 0) { /* force version if not set */
    versioned_epoch = 0;
    dest_key = key;
    if (key.instance.empty()) {
      // null-version source object: give the archived copy a random instance
      sync_env->driver->getRados()->gen_rand_obj_instance_name(&(*dest_key));
    }
  }

  return new RGWObjFetchCR(sc, sync_pipe, key, dest_key, versioned_epoch, zones_trace);
}
-
-RGWCoroutine *RGWArchiveDataSyncModule::remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
-                                                     real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
-{
-  ldout(sc->cct, 0) << "SYNC_ARCHIVE: remove_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch << dendl;
-  return NULL;
-}
-
-RGWCoroutine *RGWArchiveDataSyncModule::create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
-                                                            rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
-{
-  ldout(sc->cct, 0) << "SYNC_ARCHIVE: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
-                                   << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
-  auto sync_env = sc->env;
-  return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
-                            sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
-                            &owner.id, &owner.display_name, true, &mtime, zones_trace);
-}
-
-class RGWDataSyncControlCR : public RGWBackoffControlCR
-{
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  uint32_t num_shards;
-
-  RGWSyncTraceNodeRef tn;
-
-  static constexpr bool exit_on_error = false; // retry on all errors
-public:
-  RGWDataSyncControlCR(RGWDataSyncCtx *_sc, uint32_t _num_shards,
-                       RGWSyncTraceNodeRef& _tn_parent) : RGWBackoffControlCR(_sc->cct, exit_on_error),
-                                                          sc(_sc), sync_env(_sc->env), num_shards(_num_shards) {
-    tn = sync_env->sync_tracer->add_node(_tn_parent, "sync");
-  }
-
-  RGWCoroutine *alloc_cr() override {
-    return new RGWDataSyncCR(sc, num_shards, tn, backoff_ptr());
-  }
-
-  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
-    ceph::mutex& m = cr_lock();
-
-    m.lock();
-    RGWDataSyncCR *cr = static_cast<RGWDataSyncCR *>(get_cr());
-    if (!cr) {
-      m.unlock();
-      return;
-    }
-
-    cr->get();
-    m.unlock();
-
-    if (cr) {
-      cr->wakeup(shard_id, entries);
-    }
-
-    cr->put();
-  }
-};
-
-void RGWRemoteDataLog::wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
-  std::shared_lock rl{lock};
-  if (!data_sync_cr) {
-    return;
-  }
-  data_sync_cr->wakeup(shard_id, entries);
-}
-
-int RGWRemoteDataLog::run_sync(const DoutPrefixProvider *dpp, int num_shards)
-{
-  lock.lock();
-  data_sync_cr = new RGWDataSyncControlCR(&sc, num_shards, tn);
-  data_sync_cr->get(); // run() will drop a ref, so take another
-  lock.unlock();
-
-  int r = run(dpp, data_sync_cr);
-
-  lock.lock();
-  data_sync_cr->put();
-  data_sync_cr = NULL;
-  lock.unlock();
-
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to run sync" << dendl;
-    return r;
-  }
-  return 0;
-}
-
// DoutPrefixProvider requirement: expose the driver's CephContext for logging.
CephContext *RGWDataSyncStatusManager::get_cct() const
{
  return driver->ctx();
}
-
// Initialize data sync from the configured source zone: resolve the zone
// config, verify the source's tier type exports data, pick up the sync
// module and REST connection, initialize the remote datalog (source_log),
// and set up one status object per datalog shard.
// Returns 0 on success or a negative error code.
int RGWDataSyncStatusManager::init(const DoutPrefixProvider *dpp)
{
  RGWZone *zone_def;

  if (!(zone_def = driver->svc()->zone->find_zone(source_zone))) {
    ldpp_dout(this, 0) << "ERROR: failed to find zone config info for zone=" << source_zone << dendl;
    return -EIO;
  }

  // the source zone's tier type must support exporting data
  if (!driver->svc()->sync_modules->get_manager()->supports_data_export(zone_def->tier_type)) {
    return -ENOTSUP;
  }

  const RGWZoneParams& zone_params = driver->svc()->zone->get_zone_params();

  // fall back to the driver's sync module unless one was injected earlier
  if (sync_module == nullptr) {
    sync_module = driver->get_sync_module();
  }

  conn = driver->svc()->zone->get_zone_conn(source_zone);
  if (!conn) {
    ldpp_dout(this, 0) << "connection object to zone " << source_zone << " does not exist" << dendl;
    return -EINVAL;
  }

  // owned by this manager; released in finalize()
  error_logger = new RGWSyncErrorLogger(driver, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);

  int r = source_log.init(source_zone, conn, error_logger, driver->getRados()->get_sync_tracer(),
                          sync_module, counters);
  if (r < 0) {
    ldpp_dout(this, 0) << "ERROR: failed to init remote log, r=" << r << dendl;
    finalize();
    return r;
  }

  // learn the source's datalog shard count
  rgw_datalog_info datalog_info;
  r = source_log.read_log_info(dpp, &datalog_info);
  if (r < 0) {
    ldpp_dout(this, 5) << "ERROR: master.read_log_info() returned r=" << r << dendl;
    finalize();
    return r;
  }

  num_shards = datalog_info.num_shards;

  // one sync-status object per datalog shard, all in the zone's log pool
  for (int i = 0; i < num_shards; i++) {
    shard_objs[i] = rgw_raw_obj(zone_params.log_pool, shard_obj_name(source_zone, i));
  }

  return 0;
}
-
// Release resources allocated by init(). Safe to call more than once:
// error_logger is nulled after deletion.
void RGWDataSyncStatusManager::finalize()
{
  delete error_logger;
  error_logger = nullptr;
}
-
// DoutPrefixProvider requirement: log under this file's dout subsystem.
unsigned RGWDataSyncStatusManager::get_subsys() const
{
  return dout_subsys;
}
-
-std::ostream& RGWDataSyncStatusManager::gen_prefix(std::ostream& out) const
-{
-  auto zone = std::string_view{source_zone.id};
-  return out << "data sync zone:" << zone.substr(0, 8) << ' ';
-}
-
-string RGWDataSyncStatusManager::sync_status_oid(const rgw_zone_id& source_zone)
-{
-  char buf[datalog_sync_status_oid_prefix.size() + source_zone.id.size() + 16];
-  snprintf(buf, sizeof(buf), "%s.%s", datalog_sync_status_oid_prefix.c_str(), source_zone.id.c_str());
-
-  return string(buf);
-}
-
-string RGWDataSyncStatusManager::shard_obj_name(const rgw_zone_id& source_zone, int shard_id)
-{
-  char buf[datalog_sync_status_shard_prefix.size() + source_zone.id.size() + 16];
-  snprintf(buf, sizeof(buf), "%s.%s.%d", datalog_sync_status_shard_prefix.c_str(), source_zone.id.c_str(), shard_id);
-
-  return string(buf);
-}
-
// Write the initial incremental-sync status object for one bucket shard.
// The incremental marker is positioned at the shard's current max bilog
// marker when the sync module requests full sync (incremental sync then
// resumes from there after full sync completes); the shard state is set
// straight to StateIncrementalSync.
class RGWInitBucketShardSyncStatusCoroutine : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;

  const rgw_bucket_sync_pair_info& sync_pair;
  const string sync_status_oid;

  rgw_bucket_shard_sync_info& status;
  RGWObjVersionTracker& objv_tracker;
  const BucketIndexShardsManager& marker_mgr;
  bool exclusive;  // exclusive-create of the status object vs overwrite
public:
  RGWInitBucketShardSyncStatusCoroutine(RGWDataSyncCtx *_sc,
                                        const rgw_bucket_sync_pair_info& _sync_pair,
                                        rgw_bucket_shard_sync_info& _status,
                                        uint64_t gen,
                                        const BucketIndexShardsManager& _marker_mgr,
                                        RGWObjVersionTracker& objv_tracker,
                                        bool exclusive)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      sync_pair(_sync_pair),
      sync_status_oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, _sync_pair, gen)),
      status(_status), objv_tracker(objv_tracker), marker_mgr(_marker_mgr), exclusive(exclusive)
  {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      yield {
        rgw_raw_obj obj(sync_env->svc->zone->get_zone_params().log_pool, sync_status_oid);

        // whether or not to do full sync, incremental sync will follow anyway
        if (sync_env->sync_module->should_full_sync()) {
          const auto max_marker = marker_mgr.get(sync_pair.source_bs.shard_id, "");
          status.inc_marker.position = max_marker;
        }
        status.inc_marker.timestamp = ceph::real_clock::now();
        status.state = rgw_bucket_shard_sync_info::StateIncrementalSync;

        // status is stored as object attrs (xattrs), not object data
        map<string, bufferlist> attrs;
        status.encode_all_attrs(attrs);
        call(new RGWSimpleRadosWriteAttrsCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
                                            obj, attrs, &objv_tracker, exclusive));
      }

      if (retcode < 0) {
        ldout(cct, 20) << "ERROR: init marker position failed. error: " << retcode << dendl;
        return set_cr_error(retcode);
      }
      ldout(cct, 20) << "init marker position: " << status.inc_marker.position << 
        ". written to shard status object: " << sync_status_oid << dendl;
      return set_cr_done();
    }
    return 0;
  }
};
-
-#define BUCKET_SYNC_ATTR_PREFIX RGW_ATTR_PREFIX "bucket-sync."
-
-template <class T>
-static bool decode_attr(CephContext *cct, map<string, bufferlist>& attrs, const string& attr_name, T *val)
-{
-  map<string, bufferlist>::iterator iter = attrs.find(attr_name);
-  if (iter == attrs.end()) {
-    *val = T();
-    return false;
-  }
-
-  auto biter = iter->second.cbegin();
-  try {
-    decode(*val, biter);
-  } catch (buffer::error& err) {
-    ldout(cct, 0) << "ERROR: failed to decode attribute: " << attr_name << dendl;
-    return false;
-  }
-  return true;
-}
-
// Load sync state and incremental marker from object attrs. Falls back to
// the legacy un-prefixed attribute names ("state"/"inc_marker") when the
// BUCKET_SYNC_ATTR_PREFIX-qualified names are absent, for status objects
// written by older versions.
void rgw_bucket_shard_sync_info::decode_from_attrs(CephContext *cct, map<string, bufferlist>& attrs)
{
  if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "state", &state)) {
    decode_attr(cct, attrs, "state", &state);
  }
  if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "inc_marker", &inc_marker)) {
    decode_attr(cct, attrs, "inc_marker", &inc_marker);
  }
}
-
// Serialize the full shard sync status (state + incremental marker) into
// the attrs map for storage as object xattrs.
void rgw_bucket_shard_sync_info::encode_all_attrs(map<string, bufferlist>& attrs)
{
  encode_state_attr(attrs);
  inc_marker.encode_attr(attrs);
}
-
-void rgw_bucket_shard_sync_info::encode_state_attr(map<string, bufferlist>& attrs)
-{
-  using ceph::encode;
-  encode(state, attrs[BUCKET_SYNC_ATTR_PREFIX "state"]);
-}
-
-void rgw_bucket_shard_full_sync_marker::encode_attr(map<string, bufferlist>& attrs)
-{
-  using ceph::encode;
-  encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "full_marker"]);
-}
-
-void rgw_bucket_shard_inc_sync_marker::encode_attr(map<string, bufferlist>& attrs)
-{
-  using ceph::encode;
-  encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "inc_marker"]);
-}
-
// Read one bucket shard's incremental sync status object (stored as object
// attrs) and decode it into *status. A missing object yields a
// default-constructed status rather than an error (see operate() below).
class RGWReadBucketPipeSyncStatusCoroutine : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  string oid;
  rgw_bucket_shard_sync_info *status;
  RGWObjVersionTracker* objv_tracker;  // optional; may be nullptr
  map<string, bufferlist> attrs;       // raw attrs read from the status object
public:
  RGWReadBucketPipeSyncStatusCoroutine(RGWDataSyncCtx *_sc,
                                   const rgw_bucket_sync_pair_info& sync_pair,
                                   rgw_bucket_shard_sync_info *_status,
                                   RGWObjVersionTracker* objv_tracker,
                                   uint64_t gen)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, gen)),
      status(_status), objv_tracker(objv_tracker)
  {}
  int operate(const DoutPrefixProvider *dpp) override;
};
-
// Fetch the status object's attrs and decode them. -ENOENT is not an error:
// the caller gets a default-constructed (never-synced) status.
int RGWReadBucketPipeSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
{
  reenter(this) {
    yield call(new RGWSimpleRadosReadAttrsCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
                                             rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, oid),
                                             &attrs, true, objv_tracker));
    if (retcode == -ENOENT) {
      // no status object yet: report a fresh, empty status
      *status = rgw_bucket_shard_sync_info();
      return set_cr_done();
    }
    if (retcode < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to call fetch bucket shard info oid=" << oid << " ret=" << retcode << dendl;
      return set_cr_error(retcode);
    }
    status->decode_from_attrs(sync_env->cct, attrs);
    return set_cr_done();
  }
  return 0;
}
-
// Wrap RGWReadBucketPipeSyncStatusCoroutine and clear *result if the shard's
// status is not in incremental sync. Used by the backward-compatibility
// check: gen 0 shard status has no generation suffix, hence gen=0 below.
class CheckBucketShardStatusIsIncremental : public RGWReadBucketPipeSyncStatusCoroutine {
  bool* result;
  rgw_bucket_shard_sync_info status;  // local storage for the read status
 public:
  CheckBucketShardStatusIsIncremental(RGWDataSyncCtx* sc,
                                      const rgw_bucket_sync_pair_info& sync_pair,
                                      bool* result)
    : RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &status, nullptr, 0 /*no gen in compat mode*/),
      result(result)
  {}

  int operate(const DoutPrefixProvider *dpp) override {
    // run the base read, then inspect the decoded state once it completes
    int r = RGWReadBucketPipeSyncStatusCoroutine::operate(dpp);
    if (state == RGWCoroutine_Done &&
        status.state != rgw_bucket_shard_sync_info::StateIncrementalSync) {
      *result = false;
    }
    return r;
  }
};
-
// Check every shard of a bucket (in compat/gen-0 mode) and clear *result if
// any shard is not in incremental sync. Spawning stops as soon as an error
// or a non-incremental shard is seen.
class CheckAllBucketShardStatusIsIncremental : public RGWShardCollectCR {
  // start with 1 shard, and only spawn more if we detect an existing shard.
  // this makes the backward compatibility check far less expensive in the
  // general case where no shards exist
  static constexpr int initial_concurrent_shards = 1;
  static constexpr int max_concurrent_shards = 16;

  RGWDataSyncCtx* sc;
  rgw_bucket_sync_pair_info sync_pair;
  const int num_shards;
  bool* result;
  int shard = 0;  // next shard id to spawn
 public:
  CheckAllBucketShardStatusIsIncremental(RGWDataSyncCtx* sc,
                                         const rgw_bucket_sync_pair_info& sync_pair,
                                         int num_shards, bool* result)
    : RGWShardCollectCR(sc->cct, initial_concurrent_shards),
      sc(sc), sync_pair(sync_pair), num_shards(num_shards), result(result)
  {}

  bool spawn_next() override {
    // stop spawning if we saw any errors or non-incremental shards
    if (shard >= num_shards || status < 0 || !*result) {
      return false;
    }
    sync_pair.source_bs.shard_id = shard++;
    spawn(new CheckBucketShardStatusIsIncremental(sc, sync_pair, result), false);
    return true;
  }

 private:
  int handle_result(int r) override {
    if (r < 0) {
      ldout(cct, 4) << "failed to read bucket shard status: "
          << cpp_strerror(r) << dendl;
    } else if (shard == 0) {
      // enable concurrency once the first shard succeeds
      max_concurrent = max_concurrent_shards;
    }
    return r;
  }
};
-
-// wrap InitBucketShardSyncStatus with local storage for 'status' and 'objv'
-// and a loop to retry on racing writes
-class InitBucketShardStatusCR : public RGWCoroutine {
-  RGWDataSyncCtx* sc;
-  rgw_bucket_sync_pair_info pair;
-  rgw_bucket_shard_sync_info status;
-  RGWObjVersionTracker objv;
-  const uint64_t gen;
-  const BucketIndexShardsManager& marker_mgr;
-
- public:
-  InitBucketShardStatusCR(RGWDataSyncCtx* sc,
-                         const rgw_bucket_sync_pair_info& pair,
-                         uint64_t gen,
-                         const BucketIndexShardsManager& marker_mgr)
-    : RGWCoroutine(sc->cct), sc(sc), pair(pair), gen(gen), marker_mgr(marker_mgr)
-  {}
-  int operate(const DoutPrefixProvider *dpp) {
-    reenter(this) {
-      // non exclusive create with empty status
-      objv.generate_new_write_ver(cct);
-      yield call(new RGWInitBucketShardSyncStatusCoroutine(sc, pair, status, gen, marker_mgr, objv, false));
-      if (retcode < 0) {
-        return set_cr_error(retcode);
-      }
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-
// Initialize the per-shard sync status objects for all shards of a bucket,
// up to max_concurrent_shards at a time. Stops spawning on the first error.
class InitBucketShardStatusCollectCR : public RGWShardCollectCR {
  static constexpr int max_concurrent_shards = 16;
  RGWDataSyncCtx* sc;
  rgw_bucket_sync_pair_info sync_pair;
  const uint64_t gen;
  const BucketIndexShardsManager& marker_mgr;

  const int num_shards;
  int shard = 0;  // next shard id to spawn

  int handle_result(int r) override {
    if (r < 0) {
      ldout(cct, 4) << "failed to init bucket shard status: "
          << cpp_strerror(r) << dendl;
    }
    return r;
  }
 public:
  InitBucketShardStatusCollectCR(RGWDataSyncCtx* sc,
                                 const rgw_bucket_sync_pair_info& sync_pair,
                                 uint64_t gen,
                                 const BucketIndexShardsManager& marker_mgr,
                                 int num_shards)
    : RGWShardCollectCR(sc->cct, max_concurrent_shards),
      sc(sc), sync_pair(sync_pair), gen(gen), marker_mgr(marker_mgr), num_shards(num_shards)
  {}

  bool spawn_next() override {
    if (shard >= num_shards || status < 0) { // stop spawning on any errors
      return false;
    }
    sync_pair.source_bs.shard_id = shard++;
    spawn(new InitBucketShardStatusCR(sc, sync_pair, gen, marker_mgr), false);
    return true;
  }
};
-
-class RemoveBucketShardStatusCR : public RGWCoroutine {
-  RGWDataSyncCtx* const sc;
-  RGWDataSyncEnv* const sync_env;
-
-  rgw_bucket_sync_pair_info sync_pair;
-  rgw_raw_obj obj;
-  RGWObjVersionTracker objv;
-
-public:
-  RemoveBucketShardStatusCR(RGWDataSyncCtx* sc,
-                             const rgw_bucket_sync_pair_info& sync_pair, uint64_t gen)
-    : RGWCoroutine(sc->cct), sc(sc), sync_env(sc->env),
-      sync_pair(sync_pair),
-      obj(sync_env->svc->zone->get_zone_params().log_pool, 
-          RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, gen))
-  {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      yield call(new RGWRadosRemoveCR(sync_env->driver, obj, &objv));
-                       if (retcode < 0 && retcode != -ENOENT) {
-        ldout(cct, 20) << "ERROR: failed to remove bucket shard status for: " << sync_pair << 
-          ". with error: " << retcode << dendl;
-        return set_cr_error(retcode);
-      }
-      ldout(cct, 20) << "removed bucket shard status object: " << obj.oid << dendl;
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-
// Remove the per-shard sync status objects for all shards of a bucket at
// the given generation, up to max_concurrent_shards at a time. Unlike the
// init collector, errors do not stop spawning: removal is best-effort per
// shard (failures are only logged via handle_result).
class RemoveBucketShardStatusCollectCR : public RGWShardCollectCR {
  static constexpr int max_concurrent_shards = 16;
  RGWDataSyncCtx* const sc;
  RGWDataSyncEnv* const sync_env;
  rgw_bucket_sync_pair_info sync_pair;
  const uint64_t gen;

  const int num_shards;
  int shard = 0;  // next shard id to spawn

  int handle_result(int r) override {
    if (r < 0) {
      ldout(cct, 4) << "failed to remove bucket shard status object: "
          << cpp_strerror(r) << dendl;
    }
    return r;
  }
 public:
  RemoveBucketShardStatusCollectCR(RGWDataSyncCtx* sc,
                                 const rgw_bucket_sync_pair_info& sync_pair,
                                 uint64_t gen,
                                 int num_shards)
    : RGWShardCollectCR(sc->cct, max_concurrent_shards),
      sc(sc), sync_env(sc->env), sync_pair(sync_pair), gen(gen), num_shards(num_shards)
  {}

  bool spawn_next() override {
    if (shard >= num_shards) {
      return false;
    }
    sync_pair.source_bs.shard_id = shard++;
    spawn(new RemoveBucketShardStatusCR(sc, sync_pair, gen),  false);
    return true;
  }
};
-
// Initialize the per-bucket full-sync status object. Decides whether the
// bucket can resume incremental sync directly (backward-compat path: all
// per-shard status objects written by older versions at generation 0 are
// already incremental) or must go through full sync; initializes all
// per-shard status objects; then persists the resulting
// rgw_bucket_sync_status.
class InitBucketFullSyncStatusCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;

  const rgw_bucket_sync_pair_info& sync_pair;
  const rgw_raw_obj& status_obj;
  rgw_bucket_sync_status& status;
  RGWObjVersionTracker& objv;
  const RGWBucketInfo& source_info;
  const bool check_compat;  // whether to attempt the gen-0 compat conversion

  const rgw_bucket_index_marker_info& info;
  BucketIndexShardsManager marker_mgr;  // per-shard max markers from the source

  bool all_incremental = true;
  bool no_zero = false;  // set when no usable generation-0 log exists

public:
  InitBucketFullSyncStatusCR(RGWDataSyncCtx* sc,
                             const rgw_bucket_sync_pair_info& sync_pair,
                             const rgw_raw_obj& status_obj,
                             rgw_bucket_sync_status& status,
                             RGWObjVersionTracker& objv,
                            const RGWBucketInfo& source_info,
                             bool check_compat,
                             const rgw_bucket_index_marker_info& info)
    : RGWCoroutine(sc->cct), sc(sc), sync_env(sc->env),
      sync_pair(sync_pair), status_obj(status_obj),
      status(status), objv(objv), source_info(source_info),
      check_compat(check_compat), info(info)
  {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      // parse the per-shard max markers reported by the source
      retcode = marker_mgr.from_string(info.max_marker, -1);
      if (retcode < 0) {
        lderr(cct) << "failed to parse bilog shard markers: "
            << cpp_strerror(retcode) << dendl;
        return set_cr_error(retcode);
      }

      status.state = BucketSyncState::Init;

      if (info.oldest_gen == 0) {
       if (check_compat) {
         // use shard count from our log gen=0
         // try to convert existing per-shard incremental status for backward compatibility
         if (source_info.layout.logs.empty() ||
             source_info.layout.logs.front().gen > 0) {
           ldpp_dout(dpp, 20) << "no generation zero when checking compatibility" << dendl;
           no_zero = true;
         } else if (auto& log = source_info.layout.logs.front();
                     log.layout.type != rgw::BucketLogType::InIndex) {
           ldpp_dout(dpp, 20) << "unrecognized log layout type when checking compatibility " << log.layout.type << dendl;
           no_zero = true;
         }
         if (!no_zero) {
           yield {
             const int num_shards0 =
               source_info.layout.logs.front().layout.in_index.layout.num_shards;
             call(new CheckAllBucketShardStatusIsIncremental(sc, sync_pair,
                                                             num_shards0,
                                                             &all_incremental));
           }
           if (retcode < 0) {
             return set_cr_error(retcode);
           }
           if (all_incremental) {
             // we can use existing status and resume incremental sync
             status.state = BucketSyncState::Incremental;
           }
         } else {
           all_incremental = false;
         }
       }
      }

      if (status.state != BucketSyncState::Incremental) {
       // initialize all shard sync status. this will populate the log marker
        // positions where incremental sync will resume after full sync
       yield {
         const int num_shards = marker_mgr.get().size();
         call(new InitBucketShardStatusCollectCR(sc, sync_pair, info.latest_gen, marker_mgr, num_shards));
       }
       if (retcode < 0) {
          ldout(cct, 20) << "failed to init bucket shard status: "
                        << cpp_strerror(retcode) << dendl;
         return set_cr_error(retcode);
        }

        if (sync_env->sync_module->should_full_sync()) {
          status.state = BucketSyncState::Full;
        } else {
          status.state = BucketSyncState::Incremental;
        }
      }

      status.shards_done_with_gen.resize(marker_mgr.get().size());
      status.incremental_gen = info.latest_gen;

      ldout(cct, 20) << "writing bucket sync status during init. state=" << status.state << ". marker=" << status.full.position.to_str() << dendl;

      // write bucket sync status
      using CR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
      yield call(new CR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
                       status_obj, status, &objv, false));
      if (retcode < 0) {
        ldout(cct, 20) << "failed to write bucket shard status: "
            << cpp_strerror(retcode) << dendl;
        return set_cr_error(retcode);
      }
      return set_cr_done();
    }
    return 0;
  }
};
-
-#define OMAP_READ_MAX_ENTRIES 10
// List the bucket shards currently in error-retry ("recovering") state for
// one datalog shard, by reading the omap keys of that shard's ".retry"
// error object.
class RGWReadRecoveringBucketShardsCoroutine : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw::sal::RadosStore* driver;
  
  const int shard_id;
  int max_entries;   // overall cap on collected entries

  set<string>& recovering_buckets;  // output: recovering bucket shard keys
  string marker;                    // omap listing resume position
  string error_oid;                 // "<shard status oid>.retry"

  RGWRadosGetOmapKeysCR::ResultPtr omapkeys;
  set<string> error_entries;
  int max_omap_entries;  // page size per omap read (OMAP_READ_MAX_ENTRIES)
  int count;             // entries collected so far

public:
  RGWReadRecoveringBucketShardsCoroutine(RGWDataSyncCtx *_sc, const int _shard_id,
                                      set<string>& _recovering_buckets, const int _max_entries) 
  : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
  driver(sync_env->driver), shard_id(_shard_id), max_entries(_max_entries),
  recovering_buckets(_recovering_buckets), max_omap_entries(OMAP_READ_MAX_ENTRIES)
  {
    error_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id) + ".retry";
  }

  int operate(const DoutPrefixProvider *dpp) override;
};
-
// Page through the error object's omap keys until exhausted or max_entries
// collected. A missing error object (-ENOENT) means nothing is recovering.
int RGWReadRecoveringBucketShardsCoroutine::operate(const DoutPrefixProvider *dpp)
{
  reenter(this){
    //read recovering bucket shards
    count = 0;
    do {
      omapkeys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
      yield call(new RGWRadosGetOmapKeysCR(driver, rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, error_oid),
            marker, max_omap_entries, omapkeys));

      if (retcode == -ENOENT) {
        // no error object: no recovering shards for this datalog shard
        break;
      }

      if (retcode < 0) {
        ldpp_dout(dpp, 0) << "failed to read recovering bucket shards with " 
          << cpp_strerror(retcode) << dendl;
        return set_cr_error(retcode);
      }

      error_entries = std::move(omapkeys->entries);
      if (error_entries.empty()) {
        break;
      }

      count += error_entries.size();
      // resume the next page after the last key seen
      marker = *error_entries.rbegin();
      recovering_buckets.insert(std::make_move_iterator(error_entries.begin()),
                                std::make_move_iterator(error_entries.end()));
    } while (omapkeys->more && count < max_entries);
  
    return set_cr_done();
  }

  return 0;
}
-
// List the bucket shards with pending (not yet processed) changes for one
// datalog shard, by replaying the remote datalog from the current sync
// marker position.
class RGWReadPendingBucketShardsCoroutine : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw::sal::RadosStore* driver;

  const int shard_id;
  int max_entries;  // overall cap on collected entries

  set<string>& pending_buckets;  // output: pending bucket shard keys
  string marker;                 // datalog resume position
  string status_oid;             // per-shard sync status object name

  rgw_data_sync_marker* sync_marker;  // output: current sync marker (also read)
  int count;                          // entries collected so far

  std::string next_marker;
  vector<rgw_data_change_log_entry> log_entries;
  bool truncated;  // whether the remote log has more entries

public:
  RGWReadPendingBucketShardsCoroutine(RGWDataSyncCtx *_sc, const int _shard_id,
                                      set<string>& _pending_buckets,
                                      rgw_data_sync_marker* _sync_marker, const int _max_entries) 
  : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
  driver(sync_env->driver), shard_id(_shard_id), max_entries(_max_entries),
  pending_buckets(_pending_buckets), sync_marker(_sync_marker)
  {
    status_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id);
  }

  int operate(const DoutPrefixProvider *dpp) override;
};
-
// Read this shard's sync marker, then page through the remote datalog from
// that position, collecting the keys of entries not yet synced.
int RGWReadPendingBucketShardsCoroutine::operate(const DoutPrefixProvider *dpp)
{
  reenter(this){
    //read sync status marker
    using CR = RGWSimpleRadosReadCR<rgw_data_sync_marker>;
    yield call(new CR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
                      rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, status_oid),
                      sync_marker));
    if (retcode < 0) {
      ldpp_dout(dpp, 0) << "failed to read sync status marker with " 
        << cpp_strerror(retcode) << dendl;
      return set_cr_error(retcode);
    }

    //read pending bucket shards
    marker = sync_marker->marker;
    count = 0;
    do{
      yield call(new RGWReadRemoteDataLogShardCR(sc, shard_id, marker,
                                                 &next_marker, &log_entries, &truncated));

      if (retcode == -ENOENT) {
        // remote log shard doesn't exist: nothing pending
        break;
      }

      if (retcode < 0) {
        ldpp_dout(dpp, 0) << "failed to read remote data log info with " 
          << cpp_strerror(retcode) << dendl;
        return set_cr_error(retcode);
      }

      if (log_entries.empty()) {
        break;
      }

      count += log_entries.size();
      for (const auto& entry : log_entries) {
        pending_buckets.insert(entry.entry.key);
      }
    }while(truncated && count < max_entries);

    return set_cr_done();
  }

  return 0;
}
-
-// Read both the recovering and pending bucket-shard sets for one datalog
-// shard. Runs its own RGWCoroutinesManager (with a private HTTP manager and a
-// copy of the sync env/ctx) because it cannot run concurrently with
-// run_sync(), which owns the primary manager.
-int RGWRemoteDataLog::read_shard_status(const DoutPrefixProvider *dpp, int shard_id, set<string>& pending_buckets, set<string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries)
-{
-  // cannot run concurrently with run_sync(), so run in a separate manager
-  RGWCoroutinesManager crs(driver->ctx(), driver->getRados()->get_cr_registry());
-  RGWHTTPManager http_manager(driver->ctx(), crs.get_completion_mgr());
-  int ret = http_manager.start();
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
-    return ret;
-  }
-  // local copies so the private http_manager is used by both coroutines
-  RGWDataSyncEnv sync_env_local = sync_env;
-  sync_env_local.http_manager = &http_manager;
-  RGWDataSyncCtx sc_local = sc;
-  sc_local.env = &sync_env_local;
-  // run the 'recovering' and 'pending' queries on two parallel stacks
-  list<RGWCoroutinesStack *> stacks;
-  RGWCoroutinesStack* recovering_stack = new RGWCoroutinesStack(driver->ctx(), &crs);
-  recovering_stack->call(new RGWReadRecoveringBucketShardsCoroutine(&sc_local, shard_id, recovering_buckets, max_entries));
-  stacks.push_back(recovering_stack);
-  RGWCoroutinesStack* pending_stack = new RGWCoroutinesStack(driver->ctx(), &crs);
-  pending_stack->call(new RGWReadPendingBucketShardsCoroutine(&sc_local, shard_id, pending_buckets, sync_marker, max_entries));
-  stacks.push_back(pending_stack);
-  ret = crs.run(dpp, stacks);
-  http_manager.stop();
-  return ret;
-}
-
-// Accessor: the CephContext is owned by the driver this manager wraps.
-CephContext *RGWBucketPipeSyncStatusManager::get_cct() const
-{
-  return driver->ctx();
-}
-
-// Decode an S3-style owner element ("ID" / "DisplayName") from JSON.
-void rgw_bucket_entry_owner::decode_json(JSONObj *obj)
-{
-  JSONDecoder::decode_json("ID", id, obj);
-  JSONDecoder::decode_json("DisplayName", display_name, obj);
-}
-
-// One entry of a remote bucket listing (versions listing), as returned by the
-// peer zone's REST API and decoded from JSON. Also maps the entry onto the
-// bilog operation that full sync should replay for it.
-struct bucket_list_entry {
-  bool delete_marker;
-  rgw_obj_key key;
-  bool is_latest;
-  real_time mtime;
-  string etag;
-  uint64_t size;
-  string storage_class;
-  rgw_bucket_entry_owner owner;
-  uint64_t versioned_epoch;
-  string rgw_tag;
-
-  bucket_list_entry() : delete_marker(false), is_latest(false), size(0), versioned_epoch(0) {}
-
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("IsDeleteMarker", delete_marker, obj);
-    JSONDecoder::decode_json("Key", key.name, obj);
-    JSONDecoder::decode_json("VersionId", key.instance, obj);
-    JSONDecoder::decode_json("IsLatest", is_latest, obj);
-    string mtime_str;
-    JSONDecoder::decode_json("RgwxMtime", mtime_str, obj);
-
-    // "RgwxMtime" is an ISO8601 string; on parse failure mtime stays default
-    struct tm t;
-    uint32_t nsec;
-    if (parse_iso8601(mtime_str.c_str(), &t, &nsec)) {
-      ceph_timespec ts;
-      ts.tv_sec = (uint64_t)internal_timegm(&t);
-      ts.tv_nsec = nsec;
-      mtime = real_clock::from_ceph_timespec(ts);
-    }
-    JSONDecoder::decode_json("ETag", etag, obj);
-    JSONDecoder::decode_json("Size", size, obj);
-    JSONDecoder::decode_json("StorageClass", storage_class, obj);
-    JSONDecoder::decode_json("Owner", owner, obj);
-    JSONDecoder::decode_json("VersionedEpoch", versioned_epoch, obj);
-    JSONDecoder::decode_json("RgwxTag", rgw_tag, obj);
-    // a "null" version id with no versioned epoch means an unversioned
-    // object; normalize it to an empty instance
-    if (key.instance == "null" && !versioned_epoch) {
-      key.instance.clear();
-    }
-  }
-
-  // Map this listing entry onto the modify op full sync should apply:
-  // delete marker -> OLH delete-marker link, a real version id -> OLH link,
-  // otherwise a plain object add.
-  RGWModifyOp get_modify_op() const {
-    if (delete_marker) {
-      return CLS_RGW_OP_LINK_OLH_DM;
-    } else if (!key.instance.empty() && key.instance != "null") {
-      return CLS_RGW_OP_LINK_OLH;
-    } else {
-      return CLS_RGW_OP_ADD;
-    }
-  }
-};
-
-// One page of a remote bucket (versions) listing decoded from JSON:
-// pagination markers, truncation flag and the entries themselves.
-struct bucket_list_result {
-  string name;
-  string prefix;
-  string key_marker;
-  string version_id_marker;
-  int max_keys;
-  bool is_truncated;
-  list<bucket_list_entry> entries;
-
-  bucket_list_result() : max_keys(0), is_truncated(false) {}
-
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("Name", name, obj);
-    JSONDecoder::decode_json("Prefix", prefix, obj);
-    JSONDecoder::decode_json("KeyMarker", key_marker, obj);
-    JSONDecoder::decode_json("VersionIdMarker", version_id_marker, obj);
-    JSONDecoder::decode_json("MaxKeys", max_keys, obj);
-    JSONDecoder::decode_json("IsTruncated", is_truncated, obj);
-    JSONDecoder::decode_json("Entries", entries, obj);
-  }
-};
-
-// Coroutine that lists one page of the remote bucket's object versions over
-// REST, starting at 'marker_position', and decodes the response into
-// '*result'. Used by full sync to enumerate the source bucket.
-class RGWListRemoteBucketCR: public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  const rgw_bucket_shard& bs;
-  rgw_obj_key marker_position;  // resume point: key + version id
-
-  bucket_list_result *result;   // out: one decoded listing page
-
-public:
-  RGWListRemoteBucketCR(RGWDataSyncCtx *_sc, const rgw_bucket_shard& bs,
-                        rgw_obj_key& _marker_position, bucket_list_result *_result)
-    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), bs(bs),
-      marker_position(_marker_position), result(_result) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      yield {
-        // GET /<bucket>?versions, resuming from key/version-id markers
-        rgw_http_param_pair pairs[] = { { "versions" , NULL },
-                                       { "format" , "json" },
-                                       { "objs-container" , "true" },
-                                       { "key-marker" , marker_position.name.c_str() },
-                                       { "version-id-marker" , marker_position.instance.c_str() },
-                                       { NULL, NULL } };
-        string p = string("/") + bs.bucket.get_key(':', 0);
-        call(new RGWReadRESTResourceCR<bucket_list_result>(sync_env->cct, sc->conn, sync_env->http_manager, p, pairs, result));
-      }
-      if (retcode < 0) {
-        return set_cr_error(retcode);
-      }
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-
-// Pointer to the next bilog generation, as reported by the remote log
-// listing: its generation number and how many shards it has.
-struct next_bilog_result {
-  uint64_t generation = 0;
-  int num_shards = 0;
-
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("generation", generation, obj);
-    JSONDecoder::decode_json("num_shards", num_shards, obj);
-  }
-};
-
-// One page of a bucket-index log listing: the entries, a truncation flag,
-// and (optionally) the next log generation to move to once this one is done.
-struct bilog_list_result {
-  list<rgw_bi_log_entry> entries;
-  bool truncated{false};
-  std::optional<next_bilog_result> next_log;
-
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("entries", entries, obj);
-    JSONDecoder::decode_json("truncated", truncated, obj);
-    JSONDecoder::decode_json("next_log", next_log, obj);
-  }
-};
-
-// Coroutine that lists one page of the remote bucket-index log (bilog) for a
-// given shard and generation via /admin/log, decoding into '*result'. Polling
-// latency is tracked through the optional perf counters.
-class RGWListBucketIndexLogCR: public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  const string instance_key;
-  string marker;
-
-  bilog_list_result *result;
-  std::optional<PerfGuard> timer;
-  uint64_t generation;
-  // default member initializer runs after 'generation' (declared just above),
-  // so this sees the ctor-initialized value
-  std::string gen_str = std::to_string(generation);
-  // NOTE(review): format_ver{1} is never used; the request below hardcodes
-  // "format-ver" = "2" — confirm whether this member is dead or should be sent
-  uint32_t format_ver{1};
-
-public:
-  RGWListBucketIndexLogCR(RGWDataSyncCtx *_sc, const rgw_bucket_shard& bs, string& _marker,
-                          uint64_t _generation, bilog_list_result *_result)
-    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
-      instance_key(bs.get_key()), marker(_marker), result(_result), generation(_generation) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      // measure poll latency if perf counters are enabled
-      if (sync_env->counters) {
-        timer.emplace(sync_env->counters, sync_counters::l_poll);
-      }
-      yield {
-        rgw_http_param_pair pairs[] = { { "bucket-instance", instance_key.c_str() },
-                                       { "format" , "json" },
-                                       { "marker" , marker.c_str() },
-                                       { "type", "bucket-index" },
-                                       { "generation", gen_str.c_str() },
-                                       { "format-ver", "2"},
-                                       { NULL, NULL } };
-
-        call(new RGWReadRESTResourceCR<bilog_list_result>(sync_env->cct, sc->conn, sync_env->http_manager,
-                                                      "/admin/log", pairs, result));
-      }
-      timer.reset();
-      if (retcode < 0) {
-        if (sync_env->counters) {
-          sync_env->counters->inc(sync_counters::l_poll_err);
-        }
-        return set_cr_error(retcode);
-      }
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-
-// flush the sync marker at most once per this many processed entries
-#define BUCKET_SYNC_UPDATE_MARKER_WINDOW 10
-
-// Marker tracker for bucket full sync: persists the full-sync position and
-// entry count inside the bucket sync status object, guarded by its object
-// version tracker.
-class RGWBucketFullSyncMarkerTrack : public RGWSyncShardMarkerTrack<rgw_obj_key, rgw_obj_key> {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-
-  const rgw_raw_obj& status_obj;
-  rgw_bucket_sync_status& sync_status;
-  RGWSyncTraceNodeRef tn;
-  RGWObjVersionTracker& objv_tracker;
-
-public:
-  RGWBucketFullSyncMarkerTrack(RGWDataSyncCtx *_sc,
-                               const rgw_raw_obj& status_obj,
-                               rgw_bucket_sync_status& sync_status,
-                               RGWSyncTraceNodeRef tn,
-                               RGWObjVersionTracker& objv_tracker)
-    : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW),
-      sc(_sc), sync_env(_sc->env), status_obj(status_obj),
-      sync_status(sync_status), tn(std::move(tn)), objv_tracker(objv_tracker)
-  {}
-
-
-  // Record the new position/count in the in-memory status, then write the
-  // whole status object back to rados.
-  RGWCoroutine *store_marker(const rgw_obj_key& new_marker, uint64_t index_pos, const real_time& timestamp) override {
-    sync_status.full.position = new_marker;
-    sync_status.full.count = index_pos;
-
-    tn->log(20, SSTR("updating marker oid=" << status_obj.oid << " marker=" << new_marker));
-    return new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
-        sync_env->dpp, sync_env->async_rados, sync_env->svc->sysobj,
-       status_obj, sync_status, &objv_tracker);
-  }
-
-  // Concurrent marker writes: the last caller's value wins.
-  RGWOrderCallCR *allocate_order_control_cr() override {
-    return new RGWLastCallerWinsCR(sync_env->cct);
-  }
-};
-
-// write the incremental sync status and update 'stable_timestamp' on success
-class RGWWriteBucketShardIncSyncStatus : public RGWCoroutine {
-  RGWDataSyncEnv *sync_env;
-  rgw_raw_obj obj;
-  rgw_bucket_shard_inc_sync_marker sync_marker;
-  ceph::real_time* stable_timestamp;  // optional out-param, may be null
-  RGWObjVersionTracker& objv_tracker;
-  std::map<std::string, bufferlist> attrs;
- public:
-  RGWWriteBucketShardIncSyncStatus(RGWDataSyncEnv *sync_env,
-                                   const rgw_raw_obj& obj,
-                                   const rgw_bucket_shard_inc_sync_marker& sync_marker,
-                                   ceph::real_time* stable_timestamp,
-                                   RGWObjVersionTracker& objv_tracker)
-    : RGWCoroutine(sync_env->cct), sync_env(sync_env), obj(obj),
-      sync_marker(sync_marker), stable_timestamp(stable_timestamp),
-      objv_tracker(objv_tracker)
-  {}
-  // NOTE(review): missing 'override' on this virtual — worth adding upstream
-  int operate(const DoutPrefixProvider *dpp) {
-    reenter(this) {
-      // serialize the marker into xattrs and write them on the status object
-      sync_marker.encode_attr(attrs);
-
-      yield call(new RGWSimpleRadosWriteAttrsCR(sync_env->dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                                                obj, attrs, &objv_tracker));
-      if (retcode < 0) {
-        return set_cr_error(retcode);
-      }
-      // only report the timestamp as stable once the write succeeded
-      if (stable_timestamp) {
-        *stable_timestamp = sync_marker.timestamp;
-      }
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-
-// Marker tracker for bucket incremental sync. Besides persisting the shard's
-// position, it serializes operations: at most one in-flight op per object
-// key, and OLH (versioning) ops on the same object name are serialized too.
-class RGWBucketIncSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, rgw_obj_key> {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-
-  rgw_raw_obj obj;
-  rgw_bucket_shard_inc_sync_marker sync_marker;
-
-  map<rgw_obj_key, string> key_to_marker;
-
-  struct operation {
-    rgw_obj_key key;
-    bool is_olh;
-  };
-  map<string, operation> marker_to_op;
-  std::set<std::string> pending_olh; // object names with pending olh operations
-
-  RGWSyncTraceNodeRef tn;
-  RGWObjVersionTracker& objv_tracker;
-  ceph::real_time* stable_timestamp;
-
-  // Called when the op at 'marker' completes: drop both index entries and
-  // release the per-name OLH serialization slot if this was an OLH op.
-  void handle_finish(const string& marker) override {
-    auto iter = marker_to_op.find(marker);
-    if (iter == marker_to_op.end()) {
-      return;
-    }
-    auto& op = iter->second;
-    key_to_marker.erase(op.key);
-    reset_need_retry(op.key);
-    if (op.is_olh) {
-      pending_olh.erase(op.key.name);
-    }
-    marker_to_op.erase(iter);
-  }
-
-public:
-  RGWBucketIncSyncShardMarkerTrack(RGWDataSyncCtx *_sc,
-                         const string& _marker_oid,
-                         const rgw_bucket_shard_inc_sync_marker& _marker,
-                         RGWSyncTraceNodeRef tn,
-                         RGWObjVersionTracker& objv_tracker,
-                         ceph::real_time* stable_timestamp)
-    : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW),
-      sc(_sc), sync_env(_sc->env),
-      obj(sync_env->svc->zone->get_zone_params().log_pool, _marker_oid),
-      sync_marker(_marker), tn(std::move(tn)), objv_tracker(objv_tracker),
-      stable_timestamp(stable_timestamp)
-  {}
-
-  const rgw_raw_obj& get_obj() const { return obj; }
-
-  // Persist the new incremental position/timestamp via
-  // RGWWriteBucketShardIncSyncStatus (which also updates stable_timestamp).
-  RGWCoroutine* store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
-    sync_marker.position = new_marker;
-    sync_marker.timestamp = timestamp;
-
-    tn->log(20, SSTR("updating marker marker_oid=" << obj.oid << " marker=" << new_marker << " timestamp=" << timestamp));
-    return new RGWWriteBucketShardIncSyncStatus(sync_env, obj, sync_marker,
-                                                stable_timestamp, objv_tracker);
-  }
-
-  /*
-   * Create an index from key -> marker, and from marker -> <key, op>.
-   * This ensures that we only ever have one in-flight entry per key,
-   * which is needed during incremental data sync so that we don't run
-   * multiple concurrent sync operations for the same bucket shard.
-   * It also makes sure we don't run concurrent operations on the same
-   * key with different ops.
-   */
-  bool index_key_to_marker(const rgw_obj_key& key, const string& marker, bool is_olh) {
-    auto result = key_to_marker.emplace(key, marker);
-    if (!result.second) { // exists
-      set_need_retry(key);
-      return false;
-    }
-    marker_to_op[marker] = operation{key, is_olh};
-    if (is_olh) {
-      // prevent other olh ops from starting on this object name
-      pending_olh.insert(key.name);
-    }
-    return true;
-  }
-
-  // True if an op on 'key' may start now (no in-flight op on the key, and
-  // for OLH ops no other OLH op pending on the same object name).
-  bool can_do_op(const rgw_obj_key& key, bool is_olh) {
-    // serialize olh ops on the same object name
-    if (is_olh && pending_olh.count(key.name)) {
-      tn->log(20, SSTR("sync of " << key << " waiting for pending olh op"));
-      return false;
-    }
-    return (key_to_marker.find(key) == key_to_marker.end());
-  }
-
-  // Concurrent marker writes: the last caller's value wins.
-  RGWOrderCallCR *allocate_order_control_cr() override {
-    return new RGWLastCallerWinsCR(sync_env->cct);
-  }
-};
-
-// Errors that should not be recorded in the error log / error repo:
-// the object vanished (-ENOENT) or sync is not permitted (-EPERM).
-static bool ignore_sync_error(int err) {
-  switch (err) {
-    case -ENOENT:
-    case -EPERM:
-      return true;
-    default:
-      break;
-  }
-  return false;
-}
-
-// Coroutine that replays a single bucket-sync entry (one object op) on the
-// local zone: dispatches to the data-sync module's sync_object /
-// remove_object / create_delete_marker based on the op type, retries while
-// the marker tracker asks for it, logs failures to the error logger, and
-// finally completes the entry in the marker tracker. T/K are the marker
-// tracker's marker and key types (full vs incremental sync use different T).
-template <class T, class K>
-class RGWBucketSyncSingleEntryCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-
-  rgw_bucket_sync_pipe& sync_pipe;
-  rgw_bucket_shard& bs;
-
-  rgw_obj_key key;
-  bool versioned;
-  std::optional<uint64_t> versioned_epoch;
-  rgw_bucket_entry_owner owner;
-  real_time timestamp;
-  RGWModifyOp op;
-  RGWPendingState op_state;
-
-  T entry_marker;
-  RGWSyncShardMarkerTrack<T, K> *marker_tracker;
-
-  int sync_status;
-
-  stringstream error_ss;
-
-  bool error_injection;  // set from rgw_sync_data_inject_err_probability
-
-  RGWDataSyncModule *data_sync_module;
-  
-  rgw_zone_set zones_trace;  // zones already visited, to break sync loops
-
-  RGWSyncTraceNodeRef tn;
-  std::string zone_name;     // source zone display name, for pretty_print
-
-public:
-  RGWBucketSyncSingleEntryCR(RGWDataSyncCtx *_sc,
-                             rgw_bucket_sync_pipe& _sync_pipe,
-                             const rgw_obj_key& _key, bool _versioned,
-                             std::optional<uint64_t> _versioned_epoch,
-                             real_time& _timestamp,
-                             const rgw_bucket_entry_owner& _owner,
-                             RGWModifyOp _op, RGWPendingState _op_state,
-                            const T& _entry_marker, RGWSyncShardMarkerTrack<T, K> *_marker_tracker, rgw_zone_set& _zones_trace,
-                             RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sc->cct),
-                                                     sc(_sc), sync_env(_sc->env),
-                                                      sync_pipe(_sync_pipe), bs(_sync_pipe.info.source_bs),
-                                                      key(_key), versioned(_versioned), versioned_epoch(_versioned_epoch),
-                                                      owner(_owner),
-                                                      timestamp(_timestamp), op(_op),
-                                                      op_state(_op_state),
-                                                      entry_marker(_entry_marker),
-                                                      marker_tracker(_marker_tracker),
-                                                      sync_status(0){
-    stringstream ss;
-    ss << bucket_shard_str{bs} << "/" << key << "[" << versioned_epoch.value_or(0) << "]";
-    set_description() << "bucket sync single entry (source_zone=" << sc->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state;
-    set_status("init");
-
-    tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", SSTR(key));
-
-    tn->log(20, SSTR("bucket sync single entry (source_zone=" << sc->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state));
-    error_injection = (sync_env->cct->_conf->rgw_sync_data_inject_err_probability > 0);
-
-    data_sync_module = sync_env->sync_module->get_data_handler();
-    
-    // mark our own zone in the trace so peers won't sync this change back
-    zones_trace = _zones_trace;
-    zones_trace.insert(sync_env->svc->zone->get_zone().id, _sync_pipe.info.dest_bucket.get_key());
-
-    // resolve the source zone's display name for operator-facing output
-    if (sc->env->ostr) {
-      RGWZone* z;
-      if ((z = sc->env->driver->svc()->zone->find_zone(sc->source_zone))) {
-       zone_name = z->name;
-      }
-    }
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      /* skip entries that are not complete */
-      if (op_state != CLS_RGW_STATE_COMPLETE) {
-        goto done;
-      }
-      tn->set_flag(RGW_SNS_FLAG_ACTIVE);
-      // retry the op as long as the marker tracker flags this key for retry
-      do {
-        yield {
-          marker_tracker->reset_need_retry(key);
-          if (key.name.empty()) {
-            /* shouldn't happen */
-            set_status("skipping empty entry");
-            tn->log(0, "entry with empty obj name, skipping");
-            goto done;
-          }
-          if (error_injection &&
-              rand() % 10000 < cct->_conf->rgw_sync_data_inject_err_probability * 10000.0) {
-            tn->log(0, SSTR(": injecting data sync error on key=" << key.name));
-            retcode = -EIO;
-          } else if (op == CLS_RGW_OP_ADD ||
-                     op == CLS_RGW_OP_LINK_OLH) {
-            // create/overwrite (plain or versioned): fetch the object
-            set_status("syncing obj");
-            tn->log(5, SSTR("bucket sync: sync obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
-           if (versioned_epoch) {
-             pretty_print(sc->env, "Syncing object s3://{}/{} version {} in sync from zone {}\n", 
-                          bs.bucket.name, key, *versioned_epoch, zone_name);
-           } else {
-             pretty_print(sc->env, "Syncing object s3://{}/{} in sync from zone {}\n",
-                          bs.bucket.name, key, zone_name);
-           }
-            call(data_sync_module->sync_object(dpp, sc, sync_pipe, key, versioned_epoch, &zones_trace));
-          } else if (op == CLS_RGW_OP_DEL || op == CLS_RGW_OP_UNLINK_INSTANCE) {
-            // deletion (plain delete or unlink of a specific version)
-            set_status("removing obj");
-           if (versioned_epoch) {
-             pretty_print(sc->env, "Deleting object s3://{}/{} version {} in sync from zone {}\n",
-                          bs.bucket.name, key, *versioned_epoch, zone_name);
-           } else {
-             pretty_print(sc->env, "Deleting object s3://{}/{} in sync from zone {}\n",
-                          bs.bucket.name, key, zone_name);
-           }
-            if (op == CLS_RGW_OP_UNLINK_INSTANCE) {
-              versioned = true;
-            }
-            tn->log(10, SSTR("removing obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
-            call(data_sync_module->remove_object(dpp, sc, sync_pipe, key, timestamp, versioned, versioned_epoch.value_or(0), &zones_trace));
-            // our copy of the object is more recent, continue as if it succeeded
-          } else if (op == CLS_RGW_OP_LINK_OLH_DM) {
-            set_status("creating delete marker");
-            tn->log(10, SSTR("creating delete marker: obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
-            call(data_sync_module->create_delete_marker(dpp, sc, sync_pipe, key, timestamp, owner, versioned, versioned_epoch.value_or(0), &zones_trace));
-          }
-          tn->set_resource_name(SSTR(bucket_str_noinstance(bs.bucket) << "/" << key));
-        }
-        // a precondition failure means the destination already has a newer
-        // change (or policy forbids sync) — treat as success and move on
-        if (retcode == -ERR_PRECONDITION_FAILED) {
-         pretty_print(sc->env, "Skipping object s3://{}/{} in sync from zone {}\n",
-                      bs.bucket.name, key, zone_name);
-          set_status("Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)");
-          tn->log(0, "Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)");
-          retcode = 0;
-        }
-      } while (marker_tracker->need_retry(key));
-      {
-        tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
-        if (retcode >= 0) {
-          tn->log(10, "success");
-        } else {
-          tn->log(10, SSTR("failed, retcode=" << retcode << " (" << cpp_strerror(-retcode) << ")"));
-        }
-      }
-
-      if (retcode < 0 && retcode != -ENOENT) {
-        set_status() << "failed to sync obj; retcode=" << retcode;
-        tn->log(0, SSTR("ERROR: failed to sync object: "
-            << bucket_shard_str{bs} << "/" << key.name));
-        if (!ignore_sync_error(retcode)) {
-          error_ss << bucket_shard_str{bs} << "/" << key.name;
-          sync_status = retcode;
-        }
-      }
-      if (!error_ss.str().empty()) {
-        yield call(sync_env->error_logger->log_error_cr(dpp, sc->conn->get_remote_id(), "data", error_ss.str(), -retcode, string("failed to sync object") + cpp_strerror(-sync_status)));
-      }
-done:
-      // on success, mark this entry finished so the shard marker can advance
-      if (sync_status == 0) {
-        /* update marker */
-        set_status() << "calling marker_tracker->finish(" << entry_marker << ")";
-        yield call(marker_tracker->finish(entry_marker));
-        sync_status = retcode;
-      }
-      if (sync_status < 0) {
-        return set_cr_error(sync_status);
-      }
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-
-// Coroutine driving full sync of one bucket: lists the remote bucket prefix
-// by prefix (per the pipe's policy rules) and spawns a
-// RGWBucketSyncSingleEntryCR per object, tracking progress in the bucket sync
-// status object. State and helpers only here; the body is in operate() below.
-class RGWBucketFullSyncCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  rgw_bucket_sync_pipe& sync_pipe;
-  rgw_bucket_sync_status& sync_status;
-  rgw_bucket_shard& bs;
-  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
-  bucket_list_result list_result;
-  list<bucket_list_entry>::iterator entries_iter;
-  rgw_obj_key list_marker;
-  bucket_list_entry *entry{nullptr};
-
-  int total_entries{0};
-
-  int sync_result{0};
-
-  const rgw_raw_obj& status_obj;
-  RGWObjVersionTracker& objv;
-
-  rgw_zone_set zones_trace;
-
-  RGWSyncTraceNodeRef tn;
-  RGWBucketFullSyncMarkerTrack marker_tracker;
-
-  // Tracks which policy prefix the listing is currently walking, so listing
-  // can skip directly from one allowed prefix to the next.
-  struct _prefix_handler {
-    RGWBucketSyncFlowManager::pipe_rules_ref rules;
-    RGWBucketSyncFlowManager::pipe_rules::prefix_map_t::const_iterator iter;
-    std::optional<string> cur_prefix;
-
-    void set_rules(RGWBucketSyncFlowManager::pipe_rules_ref& _rules) {
-      rules = _rules;
-    }
-
-    // Ensure 'marker' lies inside some allowed prefix; if it fell past the
-    // current prefix, jump it forward to the start of the next matching one.
-    // Returns false when there are no further prefixes to list.
-    bool revalidate_marker(rgw_obj_key *marker) {
-      if (cur_prefix &&
-          boost::starts_with(marker->name, *cur_prefix)) {
-        return true;
-      }
-      if (!rules) {
-        return false;
-      }
-      iter = rules->prefix_search(marker->name);
-      if (iter == rules->prefix_end()) {
-        return false;
-      }
-      cur_prefix = iter->first;
-      marker->name = *cur_prefix;
-      marker->instance.clear();
-      return true;
-    }
-
-    // True if 'key' falls under a prefix the pipe's policy rules allow.
-    bool check_key_handled(const rgw_obj_key& key) {
-      if (!rules) {
-        return false;
-      }
-      if (cur_prefix &&
-          boost::starts_with(key.name, *cur_prefix)) {
-        return true;
-      }
-      iter = rules->prefix_search(key.name);
-      if (iter == rules->prefix_end()) {
-        return false;
-      }
-      cur_prefix = iter->first;
-      return boost::starts_with(key.name, iter->first);
-    }
-  } prefix_handler;
-
-public:
-  RGWBucketFullSyncCR(RGWDataSyncCtx *_sc,
-                      rgw_bucket_sync_pipe& _sync_pipe,
-                      const rgw_raw_obj& status_obj,
-                      boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
-                      rgw_bucket_sync_status& sync_status,
-                      RGWSyncTraceNodeRef tn_parent,
-                      RGWObjVersionTracker& objv_tracker)
-    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
-      sync_pipe(_sync_pipe), sync_status(sync_status),
-      bs(_sync_pipe.info.source_bs),
-      lease_cr(std::move(lease_cr)), status_obj(status_obj), objv(objv_tracker),
-      tn(sync_env->sync_tracer->add_node(tn_parent, "full_sync",
-                                         SSTR(bucket_shard_str{bs}))),
-      marker_tracker(sc, status_obj, sync_status, tn, objv_tracker)
-  {
-    zones_trace.insert(sc->source_zone.id, sync_pipe.info.dest_bucket.get_key());
-    prefix_handler.set_rules(sync_pipe.get_rules());
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-// Full-sync main loop: resume from the persisted position, list the remote
-// bucket page by page (bounded by the spawn window), sync each allowed entry,
-// then flush markers and flip the bucket state to Incremental on success.
-int RGWBucketFullSyncCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    list_marker = sync_status.full.position;
-
-    total_entries = sync_status.full.count;
-    do {
-      // bail out if we no longer hold the sync lease
-      if (lease_cr && !lease_cr->is_locked()) {
-        drain_all();
-        tn->log(1, "no lease or lease is lost, abort");
-        return set_cr_error(-ECANCELED);
-      }
-      set_status("listing remote bucket");
-      tn->log(20, "listing bucket for full sync");
-
-      // skip ahead to the next policy-allowed prefix; done when none remain
-      if (!prefix_handler.revalidate_marker(&list_marker)) {
-        set_status() << "finished iterating over all available prefixes: last marker=" << list_marker;
-        tn->log(20, SSTR("finished iterating over all available prefixes: last marker=" << list_marker));
-        break;
-      }
-
-      yield call(new RGWListRemoteBucketCR(sc, bs, list_marker, &list_result));
-      if (retcode < 0 && retcode != -ENOENT) {
-        set_status("failed bucket listing, going down");
-        drain_all();
-        return set_cr_error(retcode);
-      }
-      if (list_result.entries.size() > 0) {
-        tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
-      }
-      entries_iter = list_result.entries.begin();
-      for (; entries_iter != list_result.entries.end(); ++entries_iter) {
-        if (lease_cr && !lease_cr->is_locked()) {
-          drain_all();
-          tn->log(1, "no lease or lease is lost, abort");
-          return set_cr_error(-ECANCELED);
-        }
-        tn->log(20, SSTR("[full sync] syncing object: "
-            << bucket_shard_str{bs} << "/" << entries_iter->key));
-        entry = &(*entries_iter);
-        list_marker = entries_iter->key;
-        if (!prefix_handler.check_key_handled(entries_iter->key)) {
-          set_status() << "skipping entry due to policy rules: " << entries_iter->key;
-          tn->log(20, SSTR("skipping entry due to policy rules: " << entries_iter->key));
-          continue;
-        }
-        total_entries++;
-        if (!marker_tracker.start(entry->key, total_entries, real_time())) {
-          tn->log(0, SSTR("ERROR: cannot start syncing " << entry->key << ". Duplicate entry?"));
-        } else {
-          using SyncCR = RGWBucketSyncSingleEntryCR<rgw_obj_key, rgw_obj_key>;
-          yield spawn(new SyncCR(sc, sync_pipe, entry->key,
-                                 false, /* versioned, only matters for object removal */
-                                 entry->versioned_epoch, entry->mtime,
-                                 entry->owner, entry->get_modify_op(), CLS_RGW_STATE_COMPLETE,
-                                 entry->key, &marker_tracker, zones_trace, tn),
-                      false);
-        }
-        // cap concurrency at the configured spawn window; record any failure
-        drain_with_cb(cct->_conf->rgw_bucket_sync_spawn_window,
-                      [&](uint64_t stack_id, int ret) {
-                if (ret < 0) {
-                  tn->log(10, "a sync operation returned error");
-                  sync_result = ret;
-                }
-                return 0;
-              });
-      }
-    } while (list_result.is_truncated && sync_result == 0);
-    set_status("done iterating over all objects");
-
-    /* wait for all operations to complete */
-    drain_all_cb([&](uint64_t stack_id, int ret) {
-      if (ret < 0) {
-        tn->log(10, "a sync operation returned error");
-        sync_result = ret;
-      }
-      return 0;
-    });
-    tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
-    if (lease_cr && !lease_cr->is_locked()) {
-      tn->log(1, "no lease or lease is lost, abort");
-      return set_cr_error(-ECANCELED);
-    }
-    // persist the final marker before changing state
-    yield call(marker_tracker.flush());
-    if (retcode < 0) {
-      tn->log(0, SSTR("ERROR: marker_tracker.flush() returned retcode=" << retcode));
-      return set_cr_error(retcode);
-    }
-    /* update sync state to incremental */
-    if (sync_result == 0) {
-      sync_status.state = BucketSyncState::Incremental;
-      tn->log(5, SSTR("set bucket state=" << sync_status.state));
-      yield call(new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
-             dpp, sync_env->async_rados, sync_env->svc->sysobj,
-              status_obj, sync_status, &objv));
-      tn->log(5, SSTR("bucket status objv=" << objv));
-    } else {
-      tn->log(10, SSTR("backing out with sync_status=" << sync_result));
-    }
-    if (retcode < 0 && sync_result == 0) { /* actually tried to set incremental state and failed */
-      tn->log(0, SSTR("ERROR: failed to set sync state on bucket "
-          << bucket_shard_str{bs} << " retcode=" << retcode));
-      return set_cr_error(retcode);
-    }
-    if (sync_result < 0) {
-      return set_cr_error(sync_result);
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
-
-// True for the bilog ops that carry an OLH (versioning) epoch.
-static bool has_olh_epoch(RGWModifyOp op) {
-  return op == CLS_RGW_OP_LINK_OLH || op == CLS_RGW_OP_UNLINK_INSTANCE;
-}
-
-// Coroutine run when a bucket shard finishes its current bilog generation:
-// marks the shard done in the bucket sync status, and once every shard is
-// done, advances incremental_gen to the next generation. Uses an objv-guarded
-// read-modify-write loop, retrying on -ECANCELED races.
-class RGWBucketShardIsDoneCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  rgw_bucket_sync_status bucket_status;
-  const rgw_raw_obj& bucket_status_obj;
-  const int shard_id;
-  RGWObjVersionTracker objv_tracker;
-  const next_bilog_result& next_log;  // generation to move to when all shards done
-  const uint64_t generation;          // generation this shard just completed
-
-public:
-  RGWBucketShardIsDoneCR(RGWDataSyncCtx *_sc, const rgw_raw_obj& _bucket_status_obj,
-                         int _shard_id, const next_bilog_result& _next_log, const uint64_t _gen)
-    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
-      bucket_status_obj(_bucket_status_obj),
-      shard_id(_shard_id), next_log(_next_log), generation(_gen) {}
-
-  int operate(const DoutPrefixProvider* dpp) override
-  {
-    reenter(this) {
-      do {
-        // read bucket sync status
-        objv_tracker.clear();
-        using ReadCR = RGWSimpleRadosReadCR<rgw_bucket_sync_status>;
-        yield call(new ReadCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                              bucket_status_obj, &bucket_status, false, &objv_tracker));
-        if (retcode < 0) {
-          ldpp_dout(dpp, 20) << "failed to read bucket shard status: "
-              << cpp_strerror(retcode) << dendl;
-          return set_cr_error(retcode);
-        }
-
-        if (bucket_status.state != BucketSyncState::Incremental) {
-          // exit with success to avoid stale shard being
-          // retried in error repo if we lost a race
-          ldpp_dout(dpp, 20) << "RGWBucketShardIsDoneCR found sync state = " << bucket_status.state << dendl;
-          return set_cr_done();
-        }
-
-        if (bucket_status.incremental_gen != generation) {
-          // exit with success to avoid stale shard being
-          // retried in error repo if we lost a race
-          ldpp_dout(dpp, 20) << "RGWBucketShardIsDoneCR expected gen: " << generation
-              << ", got: " << bucket_status.incremental_gen << dendl;
-          return set_cr_done();
-        }
-
-        yield {
-          // update bucket_status after a shard is done with current gen
-          auto& done = bucket_status.shards_done_with_gen;
-          done[shard_id] = true;
-
-          // increment gen if all shards are already done with current gen
-          if (std::all_of(done.begin(), done.end(),
-            [] (const bool done){return done; } )) {
-            bucket_status.incremental_gen = next_log.generation;
-            done.clear();
-            done.resize(next_log.num_shards, false);
-          }
-          ldpp_dout(dpp, 20) << "bucket status incremental gen is " << bucket_status.incremental_gen << dendl;
-          using WriteCR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
-          call(new WriteCR(dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                            bucket_status_obj, bucket_status, &objv_tracker, false));
-        }
-        if (retcode < 0 && retcode != -ECANCELED) {
-          ldpp_dout(dpp, 20) << "failed to write bucket sync status: " << cpp_strerror(retcode) << dendl;
-          return set_cr_error(retcode);
-        } else if (retcode >= 0) {
-          return set_cr_done();
-        }
-        // -ECANCELED: another writer raced us; re-read and try again
-      } while (retcode == -ECANCELED);
-    }
-    return 0;
-  }
-};
-
-class RGWBucketShardIncrementalSyncCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  rgw_bucket_sync_pipe& sync_pipe;
-  RGWBucketSyncFlowManager::pipe_rules_ref rules;
-  rgw_bucket_shard& bs;
-  const rgw_raw_obj& bucket_status_obj;
-  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
-  bilog_list_result extended_result;
-  list<rgw_bi_log_entry> list_result;
-  int next_num_shards;
-  uint64_t next_gen;
-  bool truncated;
-
-  list<rgw_bi_log_entry>::iterator entries_iter, entries_end;
-  map<pair<string, string>, pair<real_time, RGWModifyOp> > squash_map;
-  rgw_bucket_shard_sync_info& sync_info;
-  uint64_t generation;
-  rgw_obj_key key;
-  rgw_bi_log_entry *entry{nullptr};
-  bool updated_status{false};
-  rgw_zone_id zone_id;
-  string target_location_key;
-
-  string cur_id;
-
-  int sync_status{0};
-  bool syncstopped{false};
-
-  RGWSyncTraceNodeRef tn;
-  RGWBucketIncSyncShardMarkerTrack marker_tracker;
-
-public:
-  RGWBucketShardIncrementalSyncCR(RGWDataSyncCtx *_sc,
-                                  rgw_bucket_sync_pipe& _sync_pipe,
-                                  const std::string& shard_status_oid,
-                                  const rgw_raw_obj& _bucket_status_obj,
-                                  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
-                                  rgw_bucket_shard_sync_info& sync_info,
-                                  uint64_t generation,
-                                  RGWSyncTraceNodeRef& _tn_parent,
-                                  RGWObjVersionTracker& objv_tracker,
-                                  ceph::real_time* stable_timestamp)
-    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
-      sync_pipe(_sync_pipe), bs(_sync_pipe.info.source_bs),
-      bucket_status_obj(_bucket_status_obj), lease_cr(std::move(lease_cr)),
-      sync_info(sync_info), generation(generation), zone_id(sync_env->svc->zone->get_zone().id),
-      tn(sync_env->sync_tracer->add_node(_tn_parent, "inc_sync",
-                                         SSTR(bucket_shard_str{bs}))),
-      marker_tracker(sc, shard_status_oid, sync_info.inc_marker, tn,
-                     objv_tracker, stable_timestamp)
-  {
-    set_description() << "bucket shard incremental sync bucket="
-        << bucket_shard_str{bs};
-    set_status("init");
-    rules = sync_pipe.get_rules();
-    target_location_key = sync_pipe.info.dest_bucket.get_key();
-  }
-
-  bool check_key_handled(const rgw_obj_key& key) {
-    if (!rules) {
-      return false;
-    }
-    auto iter = rules->prefix_search(key.name);
-    if (iter == rules->prefix_end()) {
-      return false;
-    }
-    return boost::starts_with(key.name, iter->first);
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int RGWBucketShardIncrementalSyncCR::operate(const DoutPrefixProvider *dpp)
-{
-  int ret;
-  reenter(this) {
-    do {
-      if (lease_cr && !lease_cr->is_locked()) {
-        drain_all();
-        tn->log(1, "no lease or lease is lost, abort");
-        return set_cr_error(-ECANCELED);
-      }
-      tn->log(20, SSTR("listing bilog for incremental sync; position=" << sync_info.inc_marker.position));
-      set_status() << "listing bilog; position=" << sync_info.inc_marker.position;
-      yield call(new RGWListBucketIndexLogCR(sc, bs, sync_info.inc_marker.position, generation, &extended_result));
-      if (retcode < 0 && retcode != -ENOENT) {
-        /* wait for all operations to complete */
-        drain_all();
-        return set_cr_error(retcode);
-      }
-      list_result = std::move(extended_result.entries);
-      truncated = extended_result.truncated;
-      if (extended_result.next_log) {
-        next_gen = extended_result.next_log->generation;
-        next_num_shards = extended_result.next_log->num_shards;
-      }
-
-      squash_map.clear();
-      entries_iter = list_result.begin();
-      entries_end = list_result.end();
-      for (; entries_iter != entries_end; ++entries_iter) {
-        auto e = *entries_iter;
-        if (e.op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP) {
-          ldpp_dout(dpp, 20) << "syncstop at: " << e.timestamp << ". marker: " << e.id << dendl;
-          syncstopped = true;
-          entries_end = std::next(entries_iter); // stop after this entry
-          break;
-        }
-        if (e.op == RGWModifyOp::CLS_RGW_OP_RESYNC) {
-          ldpp_dout(dpp, 20) << "syncstart at: " << e.timestamp << ". marker: " << e.id << dendl;
-          continue;
-        }
-        if (e.op == CLS_RGW_OP_CANCEL) {
-          continue;
-        }
-        if (e.state != CLS_RGW_STATE_COMPLETE) {
-          continue;
-        }
-        if (e.zones_trace.exists(zone_id.id, target_location_key)) {
-          continue;
-        }
-        auto& squash_entry = squash_map[make_pair(e.object, e.instance)];
-        // don't squash over olh entries - we need to apply their olh_epoch
-        if (has_olh_epoch(squash_entry.second) && !has_olh_epoch(e.op)) {
-          continue;
-        }
-        if (squash_entry.first <= e.timestamp) {
-          squash_entry = make_pair<>(e.timestamp, e.op);
-        }
-      }
-
-      entries_iter = list_result.begin();
-      for (; entries_iter != entries_end; ++entries_iter) {
-        if (lease_cr && !lease_cr->is_locked()) {
-          drain_all();
-          tn->log(1, "no lease or lease is lost, abort");
-          return set_cr_error(-ECANCELED);
-        }
-        entry = &(*entries_iter);
-        {
-          ssize_t p = entry->id.find('#'); /* entries might have explicit shard info in them, e.g., 6#00000000004.94.3 */
-          if (p < 0) {
-            cur_id = entry->id;
-          } else {
-            cur_id = entry->id.substr(p + 1);
-          }
-        }
-        sync_info.inc_marker.position = cur_id;
-
-        if (entry->op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP || entry->op == RGWModifyOp::CLS_RGW_OP_RESYNC) {
-          ldpp_dout(dpp, 20) << "detected syncstop or resync on " << entries_iter->timestamp << ", skipping entry" << dendl;
-          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
-          continue;
-        }
-
-        if (!key.set(rgw_obj_index_key{entry->object, entry->instance})) {
-          set_status() << "parse_raw_oid() on " << entry->object << " returned false, skipping entry";
-          tn->log(20, SSTR("parse_raw_oid() on " << entry->object << " returned false, skipping entry"));
-          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
-          continue;
-        }
-
-        tn->log(20, SSTR("parsed entry: id=" << cur_id << " iter->object=" << entry->object << " iter->instance=" << entry->instance << " name=" << key.name << " instance=" << key.instance << " ns=" << key.ns));
-
-        if (!key.ns.empty()) {
-          set_status() << "skipping entry in namespace: " << entry->object;
-          tn->log(20, SSTR("skipping entry in namespace: " << entry->object));
-          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
-          continue;
-        }
-
-        if (!check_key_handled(key)) {
-          set_status() << "skipping entry due to policy rules: " << entry->object;
-          tn->log(20, SSTR("skipping entry due to policy rules: " << entry->object));
-          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
-          continue;
-        }
-
-        set_status() << "got entry.id=" << cur_id << " key=" << key << " op=" << (int)entry->op;
-        if (entry->op == CLS_RGW_OP_CANCEL) {
-          set_status() << "canceled operation, skipping";
-          tn->log(20, SSTR("skipping object: "
-              << bucket_shard_str{bs} << "/" << key << ": canceled operation"));
-          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
-          continue;
-        }
-        if (entry->state != CLS_RGW_STATE_COMPLETE) {
-          set_status() << "non-complete operation, skipping";
-          tn->log(20, SSTR("skipping object: "
-              << bucket_shard_str{bs} << "/" << key << ": non-complete operation"));
-          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
-          continue;
-        }
-        if (entry->zones_trace.exists(zone_id.id, target_location_key)) {
-          set_status() << "redundant operation, skipping";
-          tn->log(20, SSTR("skipping object: "
-              <<bucket_shard_str{bs} <<"/"<<key<<": redundant operation"));
-          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
-          continue;
-        }
-        if (make_pair<>(entry->timestamp, entry->op) != squash_map[make_pair(entry->object, entry->instance)]) {
-          set_status() << "squashed operation, skipping";
-          tn->log(20, SSTR("skipping object: "
-              << bucket_shard_str{bs} << "/" << key << ": squashed operation"));
-          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
-          continue;
-        }
-        tn->set_flag(RGW_SNS_FLAG_ACTIVE);
-        tn->log(20, SSTR("syncing object: "
-            << bucket_shard_str{bs} << "/" << key));
-        updated_status = false;
-        while (!marker_tracker.can_do_op(key, has_olh_epoch(entry->op))) {
-          if (!updated_status) {
-            set_status() << "can't do op, conflicting inflight operation";
-            updated_status = true;
-          }
-          tn->log(5, SSTR("can't do op on key=" << key << " need to wait for conflicting operation to complete"));
-          yield wait_for_child();
-          bool again = true;
-          while (again) {
-            again = collect(&ret, nullptr);
-            if (ret < 0) {
-              tn->log(0, SSTR("ERROR: a child operation returned error (ret=" << ret << ")"));
-              sync_status = ret;
-              /* we have reported this error */
-            }
-          }
-          if (sync_status != 0)
-            break;
-        }
-        if (sync_status != 0) {
-          /* get error, stop */
-          break;
-        }
-        if (!marker_tracker.index_key_to_marker(key, cur_id, has_olh_epoch(entry->op))) {
-          set_status() << "can't do op, sync already in progress for object";
-          tn->log(20, SSTR("skipping sync of entry: " << cur_id << ":" << key << " sync already in progress for object"));
-          marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
-          continue;
-        }
-        // yield {
-          set_status() << "start object sync";
-          if (!marker_tracker.start(cur_id, 0, entry->timestamp)) {
-            tn->log(0, SSTR("ERROR: cannot start syncing " << cur_id << ". Duplicate entry?"));
-          } else {
-            std::optional<uint64_t> versioned_epoch;
-            rgw_bucket_entry_owner owner(entry->owner, entry->owner_display_name);
-            if (entry->ver.pool < 0) {
-              versioned_epoch = entry->ver.epoch;
-            }
-            tn->log(20, SSTR("entry->timestamp=" << entry->timestamp));
-            using SyncCR = RGWBucketSyncSingleEntryCR<string, rgw_obj_key>;
-            spawn(new SyncCR(sc, sync_pipe, key,
-                             entry->is_versioned(), versioned_epoch,
-                             entry->timestamp, owner, entry->op, entry->state,
-                             cur_id, &marker_tracker, entry->zones_trace, tn),
-                  false);
-          }
-        // }
-        drain_with_cb(cct->_conf->rgw_bucket_sync_spawn_window,
-                      [&](uint64_t stack_id, int ret) {
-                if (ret < 0) {
-                  tn->log(10, "a sync operation returned error");
-                  sync_status = ret;
-                }
-                return 0;
-              });
-      }
-
-    } while (!list_result.empty() && sync_status == 0 && !syncstopped);
-
-    drain_all_cb([&](uint64_t stack_id, int ret) {
-      if (ret < 0) {
-        tn->log(10, "a sync operation returned error");
-        sync_status = ret;
-      }
-      return 0;
-    });
-    tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
-
-    if (syncstopped) {
-      // transition to StateStopped in RGWSyncBucketShardCR. if sync is
-      // still disabled, we'll delete the sync status object. otherwise we'll
-      // restart full sync to catch any changes that happened while sync was
-      // disabled
-      sync_info.state = rgw_bucket_shard_sync_info::StateStopped;
-      return set_cr_done();
-    }
-
-    yield call(marker_tracker.flush());
-    if (retcode < 0) {
-      tn->log(0, SSTR("ERROR: marker_tracker.flush() returned retcode=" << retcode));
-      return set_cr_error(retcode);
-    }
-    if (sync_status < 0) {
-      tn->log(10, SSTR("backing out with sync_status=" << sync_status));
-      return set_cr_error(sync_status);
-    }
-
-    if (!truncated && extended_result.next_log) {
-      yield call(new RGWBucketShardIsDoneCR(sc, bucket_status_obj, bs.shard_id, *extended_result.next_log, generation));
-      if (retcode < 0) {
-        ldout(cct, 20) << "failed to update bucket sync status: "
-            << cpp_strerror(retcode) << dendl;
-        return set_cr_error(retcode);
-      }
-      yield {
-        // delete the shard status object
-        auto status_obj = sync_env->svc->rados->obj(marker_tracker.get_obj());
-        retcode = status_obj.open(dpp);
-        if (retcode < 0) {
-          return set_cr_error(retcode);
-        }
-        call(new RGWRadosRemoveOidCR(sync_env->driver, std::move(status_obj)));
-        if (retcode < 0) {
-          ldpp_dout(dpp, 20) << "failed to remove shard status object: " << cpp_strerror(retcode) << dendl;
-          return set_cr_error(retcode);
-        }
-      }
-    }
-
-    return set_cr_done();
-  }
-  return 0;
-}
-
-class RGWGetBucketPeersCR : public RGWCoroutine {
-  RGWDataSyncEnv *sync_env;
-
-  std::optional<rgw_bucket> target_bucket;
-  std::optional<rgw_zone_id> source_zone;
-  std::optional<rgw_bucket> source_bucket;
-
-  rgw_sync_pipe_info_set *pipes;
-  map<rgw_bucket, all_bucket_info> buckets_info;
-  map<rgw_bucket, all_bucket_info>::iterator siiter;
-  std::optional<all_bucket_info> target_bucket_info;
-  std::optional<all_bucket_info> source_bucket_info;
-
-  rgw_sync_pipe_info_set::iterator siter;
-
-  std::shared_ptr<rgw_bucket_get_sync_policy_result> source_policy;
-  std::shared_ptr<rgw_bucket_get_sync_policy_result> target_policy;
-
-  RGWSyncTraceNodeRef tn;
-
-  using pipe_const_iter = map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>::const_iterator;
-
-  static pair<pipe_const_iter, pipe_const_iter> get_pipe_iters(const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& m, std::optional<rgw_zone_id> zone) {
-    if (!zone) {
-      return { m.begin(), m.end() };
-    }
-
-    auto b = m.find(*zone);
-    if (b == m.end()) {
-      return { b, b };
-    }
-    return { b, std::next(b) };
-  }
-
-  void filter_sources(std::optional<rgw_zone_id> source_zone,
-                      std::optional<rgw_bucket> source_bucket,
-                      const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& all_sources,
-                      rgw_sync_pipe_info_set *result) {
-    ldpp_dout(sync_env->dpp, 20) << __func__ << ": source_zone=" << source_zone.value_or(rgw_zone_id("*")).id
-                                << " source_bucket=" << source_bucket.value_or(rgw_bucket())
-                                << " all_sources.size()=" << all_sources.size() << dendl;
-    auto iters = get_pipe_iters(all_sources, source_zone);
-    for (auto i = iters.first; i != iters.second; ++i) {
-      for (auto& handler : i->second) {
-        if (!handler.specific()) {
-          ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": skipping" << dendl;
-          continue;
-        }
-        if (source_bucket &&
-            !source_bucket->match(*handler.source.bucket)) {
-          continue;
-        }
-        ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": adding" << dendl;
-        result->insert(handler, source_bucket_info, target_bucket_info);
-      }
-    }
-  }
-
-  void filter_targets(std::optional<rgw_zone_id> target_zone,
-                      std::optional<rgw_bucket> target_bucket,
-                      const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& all_targets,
-                      rgw_sync_pipe_info_set *result) {
-    ldpp_dout(sync_env->dpp, 20) << __func__ << ": target_zone=" << source_zone.value_or(rgw_zone_id("*")).id
-                                << " target_bucket=" << source_bucket.value_or(rgw_bucket())
-                                << " all_targets.size()=" << all_targets.size() << dendl;
-    auto iters = get_pipe_iters(all_targets, target_zone);
-    for (auto i = iters.first; i != iters.second; ++i) {
-      for (auto& handler : i->second) {
-        if (target_bucket &&
-            handler.dest.bucket &&
-            !target_bucket->match(*handler.dest.bucket)) {
-          ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": skipping" << dendl;
-          continue;
-        }
-        ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": adding" << dendl;
-        result->insert(handler, source_bucket_info, target_bucket_info);
-      }
-    }
-  }
-
-  void update_from_target_bucket_policy();
-  void update_from_source_bucket_policy();
-
-  struct GetHintTargets : public RGWGenericAsyncCR::Action {
-    RGWDataSyncEnv *sync_env;
-    rgw_bucket source_bucket;
-    std::set<rgw_bucket> targets;
-    
-    GetHintTargets(RGWDataSyncEnv *_sync_env,
-                   const rgw_bucket& _source_bucket) : sync_env(_sync_env),
-                                                       source_bucket(_source_bucket) {}
-    int operate() override {
-      int r = sync_env->svc->bucket_sync->get_bucket_sync_hints(sync_env->dpp, 
-                                                                source_bucket,
-                                                                nullptr,
-                                                                &targets,
-                                                                null_yield);
-      if (r < 0) {
-        ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): failed to fetch bucket sync hints for bucket=" << source_bucket << dendl;
-        return r;
-      }
-
-      return 0;
-    }
-  };
-
-  std::shared_ptr<GetHintTargets> get_hint_targets_action;
-  std::set<rgw_bucket>::iterator hiter;
-
-public:
-  RGWGetBucketPeersCR(RGWDataSyncEnv *_sync_env,
-                      std::optional<rgw_bucket> _target_bucket,
-                      std::optional<rgw_zone_id> _source_zone,
-                      std::optional<rgw_bucket> _source_bucket,
-                      rgw_sync_pipe_info_set *_pipes,
-                      const RGWSyncTraceNodeRef& _tn_parent)
-    : RGWCoroutine(_sync_env->cct),
-      sync_env(_sync_env),
-      target_bucket(_target_bucket),
-      source_zone(_source_zone),
-      source_bucket(_source_bucket),
-      pipes(_pipes),
-      tn(sync_env->sync_tracer->add_node(_tn_parent, "get_bucket_peers",
-                                         SSTR( "target=" << target_bucket.value_or(rgw_bucket())
-                                               << ":source=" << target_bucket.value_or(rgw_bucket())
-                                               << ":source_zone=" << source_zone.value_or(rgw_zone_id("*")).id))) {
-      }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-std::ostream& operator<<(std::ostream& out, std::optional<rgw_bucket_shard>& bs) {
-  if (!bs) {
-    out << "*";
-  } else {
-    out << *bs;
-  }
-  return out;
-}
-
-static RGWCoroutine* sync_bucket_shard_cr(RGWDataSyncCtx* sc,
-                                          boost::intrusive_ptr<const RGWContinuousLeaseCR> lease,
-                                          const rgw_bucket_sync_pair_info& sync_pair,
-                                          std::optional<uint64_t> gen,
-                                          const RGWSyncTraceNodeRef& tn,
-                                          ceph::real_time* progress);
-
-RGWRunBucketSourcesSyncCR::RGWRunBucketSourcesSyncCR(RGWDataSyncCtx *_sc,
-                                                     boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
-                                                     const rgw_bucket_shard& source_bs,
-                                                     const RGWSyncTraceNodeRef& _tn_parent,
-                                                    std::optional<uint64_t> gen,
-                                                     ceph::real_time* progress)
-  : RGWCoroutine(_sc->env->cct), sc(_sc), sync_env(_sc->env),
-    lease_cr(std::move(lease_cr)),
-    tn(sync_env->sync_tracer->add_node(
-        _tn_parent, "bucket_sync_sources",
-        SSTR( "source=" << source_bs << ":source_zone=" << sc->source_zone))),
-    progress(progress),
-    gen(gen)
-{
-  sync_pair.source_bs = source_bs;
-}
-
-int RGWRunBucketSourcesSyncCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    yield call(new RGWGetBucketPeersCR(sync_env, std::nullopt, sc->source_zone,
-                                       sync_pair.source_bs.bucket, &pipes, tn));
-    if (retcode < 0 && retcode != -ENOENT) {
-      tn->log(0, SSTR("ERROR: failed to read sync status for bucket. error: " << retcode));
-      return set_cr_error(retcode);
-    }
-
-    ldpp_dout(dpp, 20) << __func__ << "(): requested source_bs=" << sync_pair.source_bs << dendl;
-
-    if (pipes.empty()) {
-      ldpp_dout(dpp, 20) << __func__ << "(): no relevant sync pipes found" << dendl;
-      return set_cr_done();
-    }
-
-    shard_progress.resize(pipes.size());
-    cur_shard_progress = shard_progress.begin();
-
-    for (siter = pipes.begin(); siter != pipes.end(); ++siter, ++cur_shard_progress) {
-      ldpp_dout(dpp, 20) << __func__ << "(): sync pipe=" << *siter << dendl;
-
-      sync_pair.dest_bucket = siter->target.get_bucket();
-      sync_pair.handler = siter->handler;
-
-      ldpp_dout(dpp, 20) << __func__ << "(): sync_pair=" << sync_pair << dendl;
-
-      yield_spawn_window(sync_bucket_shard_cr(sc, lease_cr, sync_pair,
-                                              gen, tn, &*cur_shard_progress),
-                         cct->_conf->rgw_bucket_sync_spawn_window,
-                         [&](uint64_t stack_id, int ret) {
-                           if (ret < 0) {
-                             tn->log(10, SSTR("ERROR: a sync operation returned error: " << ret));
-                           }
-                           return ret;
-                         });
-    }
-    drain_all_cb([&](uint64_t stack_id, int ret) {
-                   if (ret < 0) {
-                     tn->log(10, SSTR("a sync operation returned error: " << ret));
-                   }
-                   return ret;
-                 });
-    if (progress) {
-      *progress = *std::min_element(shard_progress.begin(), shard_progress.end());
-    }
-    return set_cr_done();
-  }
-
-  return 0;
-}
-
-class RGWSyncGetBucketInfoCR : public RGWCoroutine {
-  RGWDataSyncEnv *sync_env;
-  rgw_bucket bucket;
-  RGWBucketInfo *pbucket_info;
-  map<string, bufferlist> *pattrs;
-  RGWMetaSyncEnv meta_sync_env;
-
-  RGWSyncTraceNodeRef tn;
-
-public:
-  RGWSyncGetBucketInfoCR(RGWDataSyncEnv *_sync_env,
-                         const rgw_bucket& _bucket,
-                         RGWBucketInfo *_pbucket_info,
-                         map<string, bufferlist> *_pattrs,
-                         const RGWSyncTraceNodeRef& _tn_parent)
-    : RGWCoroutine(_sync_env->cct),
-      sync_env(_sync_env),
-      bucket(_bucket),
-      pbucket_info(_pbucket_info),
-      pattrs(_pattrs),
-      tn(sync_env->sync_tracer->add_node(_tn_parent, "get_bucket_info",
-                                         SSTR(bucket))) {
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int RGWSyncGetBucketInfoCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->driver, bucket, pbucket_info, pattrs, dpp));
-    if (retcode == -ENOENT) {
-      /* bucket instance info has not been synced in yet, fetch it now */
-      yield {
-        tn->log(10, SSTR("no local info for bucket:" << ": fetching metadata"));
-        string raw_key = string("bucket.instance:") + bucket.get_key();
-
-        meta_sync_env.init(dpp, cct, sync_env->driver, sync_env->svc->zone->get_master_conn(), sync_env->async_rados,
-                           sync_env->http_manager, sync_env->error_logger, sync_env->sync_tracer);
-
-        call(new RGWMetaSyncSingleEntryCR(&meta_sync_env, raw_key,
-                                          string() /* no marker */,
-                                          MDLOG_STATUS_COMPLETE,
-                                          NULL /* no marker tracker */,
-                                          tn));
-      }
-      if (retcode < 0) {
-        tn->log(0, SSTR("ERROR: failed to fetch bucket instance info for " << bucket_str{bucket}));
-        return set_cr_error(retcode);
-      }
-
-      yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->driver, bucket, pbucket_info, pattrs, dpp));
-    }
-    if (retcode < 0) {
-      tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{bucket}));
-      return set_cr_error(retcode);
-    }
-
-    return set_cr_done();
-  }
-
-  return 0;
-}
-
-void RGWGetBucketPeersCR::update_from_target_bucket_policy()
-{
-  if (!target_policy ||
-      !target_policy->policy_handler ||
-      !pipes) {
-    return;
-  }
-
-  auto handler = target_policy->policy_handler.get();
-
-  filter_sources(source_zone,
-                 source_bucket,
-                 handler->get_sources(),
-                 pipes);
-
-  for (siter = pipes->begin(); siter != pipes->end(); ++siter) {
-    if (!siter->source.has_bucket_info()) {
-      buckets_info.emplace(siter->source.get_bucket(), all_bucket_info());
-    }
-    if (!siter->target.has_bucket_info()) {
-      buckets_info.emplace(siter->target.get_bucket(), all_bucket_info());
-    }
-  }
-}
-
-void RGWGetBucketPeersCR::update_from_source_bucket_policy()
-{
-  if (!source_policy ||
-      !source_policy->policy_handler ||
-      !pipes) {
-    return;
-  }
-
-  auto handler = source_policy->policy_handler.get();
-
-  filter_targets(sync_env->svc->zone->get_zone().id,
-                 target_bucket,
-                 handler->get_targets(),
-                 pipes);
-
-  for (siter = pipes->begin(); siter != pipes->end(); ++siter) {
-    if (!siter->source.has_bucket_info()) {
-      buckets_info.emplace(siter->source.get_bucket(), all_bucket_info());
-    }
-    if (!siter->target.has_bucket_info()) {
-      buckets_info.emplace(siter->target.get_bucket(), all_bucket_info());
-    }
-  }
-}
-
-
-class RGWSyncGetBucketSyncPolicyHandlerCR : public RGWCoroutine {
-  RGWDataSyncEnv *sync_env;
-  rgw_bucket bucket;
-  rgw_bucket_get_sync_policy_params get_policy_params;
-
-  std::shared_ptr<rgw_bucket_get_sync_policy_result> policy;
-
-  RGWSyncTraceNodeRef tn;
-
-  int i;
-
-public:
-  RGWSyncGetBucketSyncPolicyHandlerCR(RGWDataSyncEnv *_sync_env,
-                         std::optional<rgw_zone_id> zone,
-                         const rgw_bucket& _bucket,
-                         std::shared_ptr<rgw_bucket_get_sync_policy_result>& _policy,
-                         const RGWSyncTraceNodeRef& _tn_parent)
-    : RGWCoroutine(_sync_env->cct),
-      sync_env(_sync_env),
-      bucket(_bucket),
-      policy(_policy),
-      tn(sync_env->sync_tracer->add_node(_tn_parent, "get_sync_policy_handler",
-                                         SSTR(bucket))) {
-    get_policy_params.zone = zone;
-    get_policy_params.bucket = bucket;
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      for (i = 0; i < 2; ++i) {
-        yield call(new RGWBucketGetSyncPolicyHandlerCR(sync_env->async_rados,
-                                                       sync_env->driver,
-                                                       get_policy_params,
-                                                       policy,
-                                                       dpp));
-        if (retcode < 0 &&
-            retcode != -ENOENT) {
-          return set_cr_error(retcode);
-        }
-
-        if (retcode == 0) {
-          return set_cr_done();
-        }
-
-        /* bucket instance was not found,
-         * try to get bucket instance info, can trigger
-         * metadata sync of bucket instance
-         */
-        yield call(new RGWSyncGetBucketInfoCR(sync_env, 
-                                              bucket, 
-                                              nullptr,
-                                              nullptr,
-                                              tn));
-        if (retcode < 0) {
-          return set_cr_error(retcode);
-        }
-      }
-    }
-
-    return 0;
-  }
-};
-
-
-int RGWGetBucketPeersCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    if (pipes) {
-      pipes->clear();
-    }
-    if (target_bucket) {
-      target_policy = make_shared<rgw_bucket_get_sync_policy_result>();
-      yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
-                                                         nullopt,
-                                                         *target_bucket,
-                                                         target_policy,
-                                                         tn));
-      if (retcode < 0 &&
-          retcode != -ENOENT) {
-        return set_cr_error(retcode);
-      }
-
-      update_from_target_bucket_policy();
-    }
-
-    if (source_bucket && source_zone) {
-      source_policy = make_shared<rgw_bucket_get_sync_policy_result>();
-      yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
-                                                         source_zone,
-                                                         *source_bucket,
-                                                         source_policy,
-                                                         tn));
-      if (retcode < 0 &&
-          retcode != -ENOENT) {
-        return set_cr_error(retcode);
-      }
-
-      if (source_policy->policy_handler) {
-        auto& opt_bucket_info = source_policy->policy_handler->get_bucket_info();
-        auto& opt_attrs = source_policy->policy_handler->get_bucket_attrs();
-        if (opt_bucket_info && opt_attrs) {
-          source_bucket_info.emplace();
-          source_bucket_info->bucket_info = *opt_bucket_info;
-          source_bucket_info->attrs = *opt_attrs;
-        }
-      }
-
-      if (!target_bucket) {
-        get_hint_targets_action = make_shared<GetHintTargets>(sync_env, *source_bucket);
-
-        yield call(new RGWGenericAsyncCR(cct, sync_env->async_rados,
-                                         get_hint_targets_action));
-        if (retcode < 0) {
-          return set_cr_error(retcode);
-        }
-
-        /* hints might have incomplete bucket ids,
-         * in which case we need to figure out the current
-         * bucket_id
-         */
-        for (hiter = get_hint_targets_action->targets.begin();
-             hiter != get_hint_targets_action->targets.end();
-             ++hiter) {
-          ldpp_dout(dpp, 20) << "Got sync hint for bucket=" << *source_bucket << ": " << hiter->get_key() << dendl;
-
-          target_policy = make_shared<rgw_bucket_get_sync_policy_result>();
-          yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
-                                                             nullopt,
-                                                             *hiter,
-                                                             target_policy,
-                                                             tn));
-          if (retcode < 0 &&
-              retcode != -ENOENT) {
-            return set_cr_error(retcode);
-          }
-          update_from_target_bucket_policy();
-        }
-      }
-    }
-
-    update_from_source_bucket_policy();
-
-    for (siiter = buckets_info.begin(); siiter != buckets_info.end(); ++siiter) {
-      if (siiter->second.bucket_info.bucket.name.empty()) {
-        yield call(new RGWSyncGetBucketInfoCR(sync_env, siiter->first,
-                                              &siiter->second.bucket_info,
-                                              &siiter->second.attrs,
-                                              tn));
-      }
-    }
-
-    if (pipes) {
-      pipes->update_empty_bucket_info(buckets_info);
-    }
-
-    return set_cr_done();
-  }
-
-  return 0;
-}
-
-class RGWSyncBucketShardCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
-  rgw_bucket_sync_pair_info sync_pair;
-  rgw_bucket_sync_pipe& sync_pipe;
-  bool& bucket_stopped;
-  uint64_t generation;
-  ceph::real_time* progress;
-
-  const std::string shard_status_oid;
-  const rgw_raw_obj bucket_status_obj;
-  rgw_bucket_shard_sync_info sync_status;
-  RGWObjVersionTracker objv_tracker;
-
-  RGWSyncTraceNodeRef tn;
-
-public:
-  RGWSyncBucketShardCR(RGWDataSyncCtx *_sc,
-                       boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
-                       const rgw_bucket_sync_pair_info& _sync_pair,
-                       rgw_bucket_sync_pipe& sync_pipe,
-                       bool& bucket_stopped,
-                       uint64_t generation,
-                       const RGWSyncTraceNodeRef& tn,
-                       ceph::real_time* progress)
-    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
-      lease_cr(std::move(lease_cr)), sync_pair(_sync_pair),
-      sync_pipe(sync_pipe), bucket_stopped(bucket_stopped), generation(generation), progress(progress),
-      shard_status_oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, generation)),
-      bucket_status_obj(sc->env->svc->zone->get_zone_params().log_pool,
-                 RGWBucketPipeSyncStatusManager::full_status_oid(sc->source_zone,
-                                                                 sync_pair.source_bs.bucket,
-                                                                 sync_pair.dest_bucket)),
-      tn(tn) {
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int RGWSyncBucketShardCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    yield call(new RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &sync_status, &objv_tracker, generation));
-    if (retcode < 0 && retcode != -ENOENT) {
-      tn->log(0, SSTR("ERROR: failed to read sync status for bucket. error: " << retcode));
-      return set_cr_error(retcode);
-    }
-
-    tn->log(20, SSTR("sync status for source bucket shard: " << sync_status.state));
-    sync_status.state = rgw_bucket_shard_sync_info::StateIncrementalSync;
-    if (progress) {
-      *progress = sync_status.inc_marker.timestamp;
-    }
-
-    yield call(new RGWBucketShardIncrementalSyncCR(sc, sync_pipe,
-                                                   shard_status_oid, bucket_status_obj, lease_cr,
-                                                   sync_status, generation, tn,
-                                                   objv_tracker, progress));
-    if (retcode < 0) {
-      tn->log(5, SSTR("incremental sync on bucket failed, retcode=" << retcode));
-      return set_cr_error(retcode);
-    }
-
-    if (sync_status.state == rgw_bucket_shard_sync_info::StateStopped) {
-      tn->log(20, SSTR("syncstopped indication for source bucket shard"));
-      bucket_stopped = true;
-    }
-
-    return set_cr_done();
-  }
-
-  return 0;
-}
-
-class RGWSyncBucketCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *env;
-  boost::intrusive_ptr<const RGWContinuousLeaseCR> data_lease_cr;
-  boost::intrusive_ptr<RGWContinuousLeaseCR> bucket_lease_cr;
-  rgw_bucket_sync_pair_info sync_pair;
-  rgw_bucket_sync_pipe sync_pipe;
-  std::optional<uint64_t> gen;
-  ceph::real_time* progress;
-
-  const std::string lock_name = "bucket sync";
-  const uint32_t lock_duration;
-  const rgw_raw_obj status_obj;
-  rgw_bucket_sync_status bucket_status;
-  bool bucket_stopped = false;
-  RGWObjVersionTracker objv;
-  bool init_check_compat = false;
-  rgw_bucket_index_marker_info info;
-
-  RGWSyncTraceNodeRef tn;
-
-public:
-  RGWSyncBucketCR(RGWDataSyncCtx *_sc,
-                  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
-                  const rgw_bucket_sync_pair_info& _sync_pair,
-                  std::optional<uint64_t> gen,
-                  const RGWSyncTraceNodeRef& _tn_parent,
-                  ceph::real_time* progress)
-    : RGWCoroutine(_sc->cct), sc(_sc), env(_sc->env),
-      data_lease_cr(std::move(lease_cr)), sync_pair(_sync_pair),
-      gen(gen), progress(progress),
-      lock_duration(cct->_conf->rgw_sync_lease_period),
-      status_obj(env->svc->zone->get_zone_params().log_pool,
-                 RGWBucketPipeSyncStatusManager::full_status_oid(sc->source_zone,
-                                                                 sync_pair.source_bs.bucket,
-                                                                 sync_pair.dest_bucket)),
-      tn(env->sync_tracer->add_node(_tn_parent, "bucket",
-                                    SSTR(bucket_str{_sync_pair.dest_bucket} << "<-" << bucket_shard_str{_sync_pair.source_bs} ))) {
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-static RGWCoroutine* sync_bucket_shard_cr(RGWDataSyncCtx* sc,
-                                          boost::intrusive_ptr<const RGWContinuousLeaseCR> lease,
-                                          const rgw_bucket_sync_pair_info& sync_pair,
-                                          std::optional<uint64_t> gen,
-                                          const RGWSyncTraceNodeRef& tn,
-                                          ceph::real_time* progress)
-{
-  return new RGWSyncBucketCR(sc, std::move(lease), sync_pair,
-                             gen, tn, progress);
-}
-
-#define RELEASE_LOCK(cr) \
-       if (cr) {cr->go_down(); drain_all(); cr.reset();}
-
-int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    // read source/destination bucket info
-    yield call(new RGWSyncGetBucketInfoCR(env, sync_pair.source_bs.bucket, &sync_pipe.source_bucket_info,
-                                          &sync_pipe.source_bucket_attrs, tn));
-    if (retcode < 0) {
-      tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{sync_pair.source_bs.bucket}));
-      return set_cr_error(retcode);
-    }
-
-    yield call(new RGWSyncGetBucketInfoCR(env, sync_pair.dest_bucket, &sync_pipe.dest_bucket_info,
-                                          &sync_pipe.dest_bucket_attrs, tn));
-    if (retcode < 0) {
-      tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{sync_pair.source_bs.bucket}));
-      return set_cr_error(retcode);
-    }
-
-    sync_pipe.info = sync_pair;
-
-    // read bucket sync status
-    using ReadCR = RGWSimpleRadosReadCR<rgw_bucket_sync_status>;
-    using WriteCR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
-
-    yield call(new ReadCR(dpp, env->async_rados, env->svc->sysobj,
-                          status_obj, &bucket_status, false, &objv));
-    if (retcode == -ENOENT) {
-      // use exclusive create to set state=Init
-      objv.generate_new_write_ver(cct);
-      yield call(new WriteCR(dpp, env->async_rados, env->svc->sysobj,
-                             status_obj, bucket_status, &objv, true));
-      tn->log(20, "bucket status object does not exist, create a new one");
-      if (retcode == -EEXIST) {
-        // raced with another create, read its status
-        tn->log(20, "raced with another create, read its status");
-        yield call(new ReadCR(dpp, env->async_rados, env->svc->sysobj,
-                              status_obj, &bucket_status, false, &objv));
-      }
-    }
-    if (retcode < 0) {
-      tn->log(20, SSTR("ERROR: failed to read bucket status object. error: " << retcode));
-      return set_cr_error(retcode);
-    }
-
-    do {
-      tn->log(20, SSTR("sync status for source bucket: " << bucket_status.state << 
-            ". lease is: " << (bucket_lease_cr ? "taken" : "not taken") << ". stop indications is: " << bucket_stopped));
-
-      if (bucket_status.state != BucketSyncState::Incremental ||
-          bucket_stopped) { 
-        // if state is Init or Stopped, we query the remote RGW for ther state
-        yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, sync_pair.source_bs.bucket, &info));
-        if (retcode < 0) {
-          return set_cr_error(retcode);
-        }
-        if (info.syncstopped) {
-          // remote indicates stopped state
-          tn->log(20, "remote bilog indicates that sync was stopped");
-          if (!bucket_lease_cr) {
-            bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
-                                                         lock_name, lock_duration, this));
-            yield spawn(bucket_lease_cr.get(), false);
-            while (!bucket_lease_cr->is_locked()) {
-              if (bucket_lease_cr->is_done()) {
-                tn->log(5, "failed to take lease");
-                set_status("lease lock failed, early abort");
-                drain_all();
-                return set_cr_error(bucket_lease_cr->get_ret_status());
-              }
-              tn->log(5, "waiting on bucket lease");
-              yield set_sleeping(true);
-            }
-          }
-
-          // if state was incremental, remove all per-shard status objects
-          if (bucket_status.state == BucketSyncState::Incremental) {
-            yield {
-              const auto num_shards = bucket_status.shards_done_with_gen.size();
-              const auto gen = bucket_status.incremental_gen;
-              call(new RemoveBucketShardStatusCollectCR(sc, sync_pair, gen, num_shards));
-            }
-          }
-
-          // check if local state is "stopped"
-          yield call(new ReadCR(dpp, env->async_rados, env->svc->sysobj,
-                status_obj, &bucket_status, false, &objv));
-          if (retcode < 0) {
-            tn->log(20, SSTR("ERROR: failed to read status before writing 'stopped'. error: " << retcode));
-            RELEASE_LOCK(bucket_lease_cr);
-            return set_cr_error(retcode);
-          }
-          if (bucket_status.state != BucketSyncState::Stopped) {
-            // make sure that state is changed to stopped localy
-            bucket_status.state = BucketSyncState::Stopped;
-            yield call(new WriteCR(dpp, env->async_rados, env->svc->sysobj,
-                  status_obj, bucket_status, &objv, false));
-            if (retcode < 0) {
-              tn->log(20, SSTR("ERROR: failed to write 'stopped' status. error: " << retcode));
-              RELEASE_LOCK(bucket_lease_cr);
-              return set_cr_error(retcode);
-            }
-          }
-          RELEASE_LOCK(bucket_lease_cr);
-          return set_cr_done();
-        }
-        if (bucket_stopped) {
-          tn->log(20, SSTR("ERROR: switched from 'stop' to 'start' sync. while state is: " << bucket_status.state));
-          bucket_stopped = false;
-          bucket_status.state = BucketSyncState::Init;
-        }
-      }
-
-      if (bucket_status.state != BucketSyncState::Incremental) {
-        // if the state wasn't Incremental, take a bucket-wide lease to prevent
-        // different shards from duplicating the init and full sync
-        if (!bucket_lease_cr) {
-          bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
-                                                       lock_name, lock_duration, this));
-          yield spawn(bucket_lease_cr.get(), false);
-          while (!bucket_lease_cr->is_locked()) {
-            if (bucket_lease_cr->is_done()) {
-              tn->log(5, "failed to take lease");
-              set_status("lease lock failed, early abort");
-              drain_all();
-              return set_cr_error(bucket_lease_cr->get_ret_status());
-            }
-            tn->log(5, "waiting on bucket lease");
-            yield set_sleeping(true);
-          }
-        }
-
-        // reread the status after acquiring the lock
-        yield call(new ReadCR(dpp, env->async_rados, env->svc->sysobj,
-                            status_obj, &bucket_status, false, &objv));
-        if (retcode < 0) {
-          RELEASE_LOCK(bucket_lease_cr);
-          tn->log(20, SSTR("ERROR: reading the status after acquiring the lock failed. error: " << retcode));
-          return set_cr_error(retcode);
-        }
-        tn->log(20, SSTR("status after acquiring the lock is: " << bucket_status.state));
-
-       yield call(new InitBucketFullSyncStatusCR(sc, sync_pair, status_obj,
-                                                 bucket_status, objv,
-                                                 sync_pipe.source_bucket_info,
-                                                 init_check_compat, info));
-
-        if (retcode < 0) {
-          tn->log(20, SSTR("ERROR: init full sync failed. error: " << retcode));
-          RELEASE_LOCK(bucket_lease_cr);
-          return set_cr_error(retcode);
-        }
-      }
-
-      assert(bucket_status.state == BucketSyncState::Incremental || 
-          bucket_status.state == BucketSyncState::Full);
-
-      if (bucket_status.state == BucketSyncState::Full) {
-        assert(bucket_lease_cr);
-        yield call(new RGWBucketFullSyncCR(sc, sync_pipe, status_obj,
-                                           bucket_lease_cr, bucket_status,
-                                           tn, objv));
-        if (retcode < 0) {
-          tn->log(20, SSTR("ERROR: full sync failed. error: " << retcode));
-          RELEASE_LOCK(bucket_lease_cr);
-          return set_cr_error(retcode);
-        }
-      }
-
-      if (bucket_status.state == BucketSyncState::Incremental) {
-        // lease not required for incremental sync
-        RELEASE_LOCK(bucket_lease_cr);
-
-        // if a specific gen was requested, compare that to the sync status
-        if (gen) {
-          const auto current_gen = bucket_status.incremental_gen;
-          if (*gen > current_gen) {
-            retcode = -EAGAIN;
-            tn->log(10, SSTR("ERROR: requested sync of future generation "
-                             << *gen << " > " << current_gen
-                             << ", returning " << retcode << " for later retry"));
-            return set_cr_error(retcode);
-          } else if (*gen < current_gen) {
-            tn->log(10, SSTR("WARNING: requested sync of past generation "
-                             << *gen << " < " << current_gen
-                             << ", returning success"));
-            return set_cr_done();
-          }
-        }
-
-        assert(sync_pair.source_bs.shard_id >= 0);
-        if (static_cast<std::size_t>(sync_pair.source_bs.shard_id) >= bucket_status.shards_done_with_gen.size()) {
-          tn->log(1, SSTR("bucket shard " << sync_pair.source_bs << " index out of bounds"));
-          return set_cr_done(); // return success so we don't retry
-        }
-        if (bucket_status.shards_done_with_gen[sync_pair.source_bs.shard_id]) {
-          tn->log(10, SSTR("bucket shard " << sync_pair.source_bs << " of gen " <<
-                          gen << " already synced."));
-          return set_cr_done();
-        }
-
-        yield call(new RGWSyncBucketShardCR(sc, data_lease_cr, sync_pair,
-                                            sync_pipe, bucket_stopped,
-                                            bucket_status.incremental_gen, tn, progress));
-        if (retcode < 0) {
-          tn->log(20, SSTR("ERROR: incremental sync failed. error: " << retcode));
-          return set_cr_error(retcode);
-        }
-      }
-      // loop back to previous states unless incremental sync returns normally
-    } while (bucket_status.state != BucketSyncState::Incremental || bucket_stopped);
-
-    return set_cr_done();
-  }
-
-  return 0;
-}
-
-int RGWBucketPipeSyncStatusManager::do_init(const DoutPrefixProvider *dpp,
-                                           std::ostream* ostr)
-{
-  int ret = http_manager.start();
-  if (ret < 0) {
-    ldpp_dout(this, 0) << "failed in http_manager.start() ret=" << ret << dendl;
-    return ret;
-  }
-
-  sync_module.reset(new RGWDefaultSyncModuleInstance());
-  auto async_rados = driver->svc()->rados->get_async_processor();
-
-  sync_env.init(this, driver->ctx(), driver,
-                driver->svc(), async_rados, &http_manager,
-                error_logger.get(), driver->getRados()->get_sync_tracer(),
-                sync_module, nullptr);
-
-  sync_env.ostr = ostr;
-
-  rgw_sync_pipe_info_set pipes;
-
-  ret = cr_mgr.run(dpp, new RGWGetBucketPeersCR(&sync_env,
-                                           dest_bucket,
-                                           source_zone,
-                                           source_bucket,
-                                           &pipes,
-                                           sync_env.sync_tracer->root_node));
-  if (ret < 0) {
-    ldpp_dout(this, 0) << "failed to get bucket source peers info: (ret=" << ret << "): " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  if (pipes.empty()) {
-    ldpp_dout(this, 0) << "No peers. This is not a valid multisite configuration." << dendl;
-    return -EINVAL;
-  }
-
-  for (auto& pipe : pipes) {
-    auto& szone = pipe.source.zone;
-
-    auto conn = driver->svc()->zone->get_zone_conn(szone);
-    if (!conn) {
-      ldpp_dout(this, 0) << "connection object to zone " << szone << " does not exist" << dendl;
-      return -EINVAL;
-    }
-
-    RGWZone* z;
-    if (!(z = driver->svc()->zone->find_zone(szone))) {
-      ldpp_dout(this, 0) << "zone " << szone << " does not exist" << dendl;
-      return -EINVAL;
-    }
-    sources.emplace_back(&sync_env, szone, conn,
-                        pipe.source.get_bucket_info(),
-                        pipe.target.get_bucket(),
-                        pipe.handler, z->name);
-  }
-
-  return 0;
-}
-
-int RGWBucketPipeSyncStatusManager::remote_info(const DoutPrefixProvider *dpp,
-                                               source& s,
-                                               uint64_t* oldest_gen,
-                                               uint64_t* latest_gen,
-                                               uint64_t* num_shards)
-{
-  rgw_bucket_index_marker_info remote_info;
-  BucketIndexShardsManager remote_markers;
-  auto r = rgw_read_remote_bilog_info(dpp, s.sc.conn, s.info.bucket,
-                                     remote_info, remote_markers,
-                                     null_yield);
-
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                     << " rgw_read_remote_bilog_info: r="
-                     << r << dendl;
-    return r;
-  }
-  if (oldest_gen)
-    *oldest_gen = remote_info.oldest_gen;
-
-  if (latest_gen)
-    *latest_gen = remote_info.latest_gen;
-
-  if (num_shards)
-    *num_shards = remote_markers.get().size();
-
-  return 0;
-}
-
-tl::expected<std::unique_ptr<RGWBucketPipeSyncStatusManager>, int>
-RGWBucketPipeSyncStatusManager::construct(
-  const DoutPrefixProvider* dpp,
-  rgw::sal::RadosStore* driver,
-  std::optional<rgw_zone_id> source_zone,
-  std::optional<rgw_bucket> source_bucket,
-  const rgw_bucket& dest_bucket,
-  std::ostream* ostr)
-{
-  std::unique_ptr<RGWBucketPipeSyncStatusManager> self{
-    new RGWBucketPipeSyncStatusManager(driver, source_zone, source_bucket,
-                                      dest_bucket)};
-  auto r = self->do_init(dpp, ostr);
-  if (r < 0) {
-    return tl::unexpected(r);
-  }
-  return self;
-}
-
-int RGWBucketPipeSyncStatusManager::init_sync_status(
-  const DoutPrefixProvider *dpp)
-{
-  // Just running one at a time saves us from buildup/teardown and in
-  // practice we only do one zone at a time.
-  for (auto& source : sources) {
-    list<RGWCoroutinesStack*> stacks;
-    RGWCoroutinesStack *stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
-    pretty_print(source.sc.env, "Initializing sync state of bucket {} with zone {}.\n",
-                source.info.bucket.name, source.zone_name);
-    stack->call(new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
-                 dpp, source.sc.env->async_rados, source.sc.env->svc->sysobj,
-                 {sync_env.svc->zone->get_zone_params().log_pool,
-                   full_status_oid(source.sc.source_zone,
-                                  source.info.bucket,
-                                  source.dest)},
-                 rgw_bucket_sync_status{}));
-    stacks.push_back(stack);
-    auto r = cr_mgr.run(dpp, stacks);
-    if (r < 0) {
-      pretty_print(source.sc.env,
-                  "Initialization of sync state for bucket {} with zone {} "
-                  "failed with error {}\n",
-                  source.info.bucket.name, source.zone_name, cpp_strerror(r));
-    }
-  }
-  return 0;
-}
-
-tl::expected<std::map<int, rgw_bucket_shard_sync_info>, int>
-RGWBucketPipeSyncStatusManager::read_sync_status(
-  const DoutPrefixProvider *dpp)
-{
-  std::map<int, rgw_bucket_shard_sync_info> sync_status;
-  list<RGWCoroutinesStack *> stacks;
-
-  auto sz = sources.begin();
-
-  if (source_zone) {
-    sz = std::find_if(sources.begin(), sources.end(),
-                     [this](const source& s) {
-                       return s.sc.source_zone == *source_zone;
-                     }
-      );
-    if (sz == sources.end()) {
-      ldpp_dout(this, 0) << "ERROR: failed to find source zone: "
-                        << *source_zone << dendl;
-      return tl::unexpected(-ENOENT);
-    }
-  } else {
-    ldpp_dout(this, 5) << "No source zone specified, using source zone: "
-                      << sz->sc.source_zone << dendl;
-    return tl::unexpected(-ENOENT);
-  }
-  uint64_t num_shards, latest_gen;
-  auto ret = remote_info(dpp, *sz, nullptr, &latest_gen, &num_shards);
-  if (!ret) {
-    ldpp_dout(this, 5) << "Unable to get remote info: "
-                      << ret << dendl;
-    return tl::unexpected(ret);
-  }
-  auto stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
-  std::vector<rgw_bucket_sync_pair_info> pairs(num_shards);
-  for (auto shard = 0u; shard < num_shards; ++shard) {
-    auto& pair = pairs[shard];
-    pair.source_bs.bucket = sz->info.bucket;
-    pair.dest_bucket = sz->dest;
-    pair.source_bs.shard_id = shard;
-    stack->call(new RGWReadBucketPipeSyncStatusCoroutine(
-                 &sz->sc, pair, &sync_status[shard],
-                 nullptr, latest_gen));
-  }
-
-  stacks.push_back(stack);
-
-  ret = cr_mgr.run(dpp, stacks);
-  if (ret < 0) {
-    ldpp_dout(this, 0) << "ERROR: failed to read sync status for "
-                      << bucket_str{dest_bucket} << dendl;
-    return tl::unexpected(ret);
-  }
-
-  return sync_status;
-}
-
-namespace rgw::bucket_sync_run {
-// Retry-loop over calls to sync_bucket_shard_cr
-class ShardCR : public RGWCoroutine {
-  static constexpr auto allowed_retries = 10u;
-
-  RGWDataSyncCtx& sc;
-  const rgw_bucket_sync_pair_info& pair;
-  const uint64_t gen;
-  unsigned retries = 0;
-
-  ceph::real_time prev_progress;
-  ceph::real_time progress;
-
-public:
-
-  ShardCR(RGWDataSyncCtx& sc, const rgw_bucket_sync_pair_info& pair,
-         const uint64_t gen)
-    : RGWCoroutine(sc.cct), sc(sc), pair(pair), gen(gen) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      // Since all errors (except ECANCELED) are considered retryable,
-      // retry other errors so long as we're making progress.
-      for (retries = 0u, retcode = -EDOM;
-          (retries < allowed_retries) && (retcode != 0);
-          ++retries) {
-       ldpp_dout(dpp, 5) << "ShardCR: syncing bucket shard on: "
-                         << "zone=" << sc.source_zone
-                         << ", bucket=" << pair.source_bs.bucket.name
-                         << ", shard=" << pair.source_bs.shard_id
-                         << ", gen=" << gen
-                         << dendl;
-       yield call(sync_bucket_shard_cr(&sc, nullptr, pair, gen,
-                                       sc.env->sync_tracer->root_node,
-                                       &progress));
-
-       if (retcode == -ECANCELED) {
-         ldpp_dout(dpp, -1) << "ERROR: Got -ECANCELED for "
-                            << pair.source_bs << dendl;
-         drain_all();
-         return set_cr_error(retcode);
-       } else if (retcode < 0) {
-         ldpp_dout(dpp, 5) << "WARNING: Got error, retcode=" << retcode << " for "
-                           << pair.source_bs << "on retry "
-                           << retries + 1 << " of " << allowed_retries
-                           << " allowed" << dendl;
-         // Reset the retry counter if we made any progress
-         if (progress != prev_progress) {
-           retries = 0;
-         }
-         prev_progress = progress;
-       }
-      }
-      if (retcode < 0) {
-       ldpp_dout(dpp, -1) << "ERROR: Exhausted retries for "
-                          << pair.source_bs << " retcode="
-                          << retcode << dendl;
-       drain_all();
-       return set_cr_error(retcode);
-      }
-
-      drain_all();
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-
-// Loop over calls to ShardCR with limited concurrency
-class GenCR : public RGWShardCollectCR {
-  static constexpr auto MAX_CONCURRENT_SHARDS = 64;
-
-  RGWDataSyncCtx& sc;
-  const uint64_t gen;
-
-  std::vector<rgw_bucket_sync_pair_info> pairs;
-  decltype(pairs)::const_iterator iter;
-
-public:
-  GenCR(RGWDataSyncCtx& sc, const rgw_bucket& source, const rgw_bucket& dest,
-       const uint64_t gen, const uint64_t shards,
-       const RGWBucketSyncFlowManager::pipe_handler& handler)
-    : RGWShardCollectCR(sc.cct, MAX_CONCURRENT_SHARDS),
-      sc(sc), gen(gen) {
-    pairs.resize(shards);
-    for (auto shard = 0u; shard < shards; ++shard) {
-      auto& pair = pairs[shard];
-      pair.handler = handler;
-      pair.source_bs.bucket = source;
-      pair.dest_bucket = dest;
-      pair.source_bs.shard_id = shard;
-    }
-    iter = pairs.cbegin();
-    assert(pairs.size() == shards);
-  }
-
-  virtual bool spawn_next() override {
-    if (iter == pairs.cend()) {
-      return false;
-    }
-    spawn(new ShardCR(sc, *iter, gen), false);
-    ++iter;
-    return true;
-  }
-
-  int handle_result(int r) override {
-    if (r < 0) {
-      ldpp_dout(sc.env->dpp, 4) << "ERROR: Error syncing shard: "
-                               << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
-};
-
-// Read sync status, loop over calls to GenCR
-class SourceCR : public RGWCoroutine {
-  RGWDataSyncCtx& sc;
-  const RGWBucketInfo& info;
-  const rgw_bucket& dest;
-  const RGWBucketSyncFlowManager::pipe_handler& handler;
-  const rgw_raw_obj status_obj{
-    sc.env->svc->zone->get_zone_params().log_pool,
-    RGWBucketPipeSyncStatusManager::full_status_oid(sc.source_zone, info.bucket,
-                                                   dest)};
-
-  BucketSyncState state = BucketSyncState::Incremental;
-  uint64_t gen = 0;
-  uint64_t num_shards = 0;
-  rgw_bucket_sync_status status;
-  std::string zone_name;
-
-public:
-
-  SourceCR(RGWDataSyncCtx& sc, const RGWBucketInfo& info,
-          const rgw_bucket& dest,
-          const RGWBucketSyncFlowManager::pipe_handler& handler,
-          const std::string& zone_name)
-    : RGWCoroutine(sc.cct), sc(sc), info(info), dest(dest), handler(handler),
-      zone_name(zone_name) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      // Get the source's status. In incremental sync, this gives us
-      // the generation and shard count that is next needed to be run.
-      yield call(new RGWSimpleRadosReadCR<rgw_bucket_sync_status>(
-                  dpp, sc.env->async_rados, sc.env->svc->sysobj,
-                  status_obj, &status));
-      if (retcode < 0) {
-       ldpp_dout(dpp, -1) << "ERROR: Unable to fetch status for zone="
-                          << sc.source_zone << " retcode="
-                          << retcode << dendl;
-       drain_all();
-       return set_cr_error(retcode);
-      }
-
-      if (status.state == BucketSyncState::Stopped) {
-       // Nothing to do.
-       pretty_print(sc.env, "Sync of bucket {} from source zone {} is in state Stopped. "
-                    "Nothing to do.\n", dest.name, zone_name);
-       ldpp_dout(dpp, 5) << "SourceCR: Bucket is in state Stopped, returning."
-                         << dendl;
-       drain_all();
-       return set_cr_done();
-      }
-
-      do {
-       state = status.state;
-       gen = status.incremental_gen;
-       num_shards = status.shards_done_with_gen.size();
-
-       ldpp_dout(dpp, 5) << "SourceCR: "
-                         << "state=" << state
-                         << ", gen=" << gen
-                         << ", num_shards=" << num_shards
-                         << dendl;
-
-       // Special case to handle full sync. Since full sync no longer
-       // uses shards and has no generations, we sync shard zero,
-       // though use the current generation so a following
-       // incremental sync can carry on.
-       if (state != BucketSyncState::Incremental) {
-         pretty_print(sc.env, "Beginning full sync of bucket {} from source zone {}.\n",
-                      dest.name, zone_name);
-         ldpp_dout(dpp, 5)  << "SourceCR: Calling GenCR with "
-                            << "gen=" << gen
-                            << ", num_shards=" << 1
-                            << dendl;
-         yield call(new GenCR(sc, info.bucket, dest, gen, 1, handler));
-       } else {
-         pretty_print(sc.env, "Beginning incremental sync of bucket {}, generation {} from source zone {}.\n",
-                      dest.name, gen, zone_name);
-         ldpp_dout(dpp, 5) << "SourceCR: Calling GenCR with "
-                           << "gen=" << gen
-                           << ", num_shards=" << num_shards
-                           << dendl;
-         yield call(new GenCR(sc, info.bucket, dest, gen, num_shards,
-                              handler));
-       }
-       if (retcode < 0) {
-         ldpp_dout(dpp, -1) << "ERROR: Giving up syncing from "
-                            << sc.source_zone << " retcode="
-                            << retcode << dendl;
-         drain_all();
-         return set_cr_error(retcode);
-       }
-
-       pretty_print(sc.env, "Completed.\n");
-
-       yield call(new RGWSimpleRadosReadCR<rgw_bucket_sync_status>(
-                    dpp, sc.env->async_rados, sc.env->svc->sysobj,
-                    status_obj, &status));
-       if (retcode < 0) {
-         ldpp_dout(dpp, -1) << "ERROR: Unable to fetch status for zone="
-                            << sc.source_zone << " retcode="
-                            << retcode << dendl;
-         drain_all();
-         return set_cr_error(retcode);
-       }
-       // Repeat until we have done an incremental run and the
-       // generation remains unchanged.
-       ldpp_dout(dpp, 5) << "SourceCR: "
-                         << "state=" << state
-                         << ", gen=" << gen
-                         << ", num_shards=" << num_shards
-                         << ", status.state=" << status.state
-                         << ", status.incremental_gen=" << status.incremental_gen
-                         << ", status.shards_done_with_gen.size()=" << status.shards_done_with_gen.size()
-                         << dendl;
-      } while (state != BucketSyncState::Incremental ||
-              gen != status.incremental_gen);
-      drain_all();
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-} // namespace rgw::bucket_sync_run
-
-int RGWBucketPipeSyncStatusManager::run(const DoutPrefixProvider *dpp)
-{
-  list<RGWCoroutinesStack *> stacks;
-  for (auto& source : sources) {
-    auto stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
-    stack->call(new rgw::bucket_sync_run::SourceCR(
-                 source.sc, source.info, source.dest, source.handler,
-                 source.zone_name));
-    stacks.push_back(stack);
-  }
-  auto ret = cr_mgr.run(dpp, stacks);
-  if (ret < 0) {
-    ldpp_dout(this, 0) << "ERROR: Sync unsuccessful on bucket "
-                      << bucket_str{dest_bucket} << dendl;
-  }
-  return ret;
-}
-
-unsigned RGWBucketPipeSyncStatusManager::get_subsys() const
-{
-  return dout_subsys;
-}
-
-std::ostream& RGWBucketPipeSyncStatusManager::gen_prefix(std::ostream& out) const
-{
-  auto zone = std::string_view{source_zone.value_or(rgw_zone_id("*")).id};
-  return out << "bucket sync zone:" << zone.substr(0, 8)
-    << " bucket:" << dest_bucket << ' ';
-}
-
-string RGWBucketPipeSyncStatusManager::full_status_oid(const rgw_zone_id& source_zone,
-                                                       const rgw_bucket& source_bucket,
-                                                       const rgw_bucket& dest_bucket)
-{
-  if (source_bucket == dest_bucket) {
-    return bucket_full_status_oid_prefix + "." + source_zone.id + ":"
-        + dest_bucket.get_key();
-  } else {
-    return bucket_full_status_oid_prefix + "." + source_zone.id + ":"
-        + dest_bucket.get_key() + ":" + source_bucket.get_key();
-  }
-}
-
-inline std::string generation_token(uint64_t gen) {
-  return (gen == 0) ? "" : (":" + std::to_string(gen));
-}
-
-string RGWBucketPipeSyncStatusManager::inc_status_oid(const rgw_zone_id& source_zone,
-                                                      const rgw_bucket_sync_pair_info& sync_pair,
-                                                      uint64_t gen)
-{
-  if (sync_pair.source_bs.bucket == sync_pair.dest_bucket) {
-    return bucket_status_oid_prefix + "." + source_zone.id + ":" + sync_pair.source_bs.get_key() + 
-      generation_token(gen);
-  } else {
-    return bucket_status_oid_prefix + "." + source_zone.id + ":" + sync_pair.dest_bucket.get_key() + ":" + sync_pair.source_bs.get_key() +
-      generation_token(gen);
-  }
-}
-
-string RGWBucketPipeSyncStatusManager::obj_status_oid(const rgw_bucket_sync_pipe& sync_pipe,
-                                                      const rgw_zone_id& source_zone,
-                                                      const rgw::sal::Object* obj)
-{
-  string prefix = object_status_oid_prefix + "." + source_zone.id + ":" + obj->get_bucket()->get_key().get_key();
-  if (sync_pipe.source_bucket_info.bucket !=
-      sync_pipe.dest_bucket_info.bucket) {
-    prefix += string("/") + sync_pipe.dest_bucket_info.bucket.get_key();
-  }
-  return prefix + ":" + obj->get_name() + ":" + obj->get_instance();
-}
-
-int rgw_read_remote_bilog_info(const DoutPrefixProvider *dpp,
-                               RGWRESTConn* conn,
-                               const rgw_bucket& bucket,
-                               rgw_bucket_index_marker_info& info,
-                               BucketIndexShardsManager& markers,
-                               optional_yield y)
-{
-  const auto instance_key = bucket.get_key();
-  const rgw_http_param_pair params[] = {
-    { "type" , "bucket-index" },
-    { "bucket-instance", instance_key.c_str() },
-    { "info" , nullptr },
-    { nullptr, nullptr }
-  };
-  int r = conn->get_json_resource(dpp, "/admin/log/", params, y, info);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "failed to fetch remote log markers: " << cpp_strerror(r) << dendl;
-    return r;
-  }
-  // parse shard markers
-  r = markers.from_string(info.max_marker, -1);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "failed to decode remote log markers" << dendl;
-    return r;
-  }
-  return 0;
-}
-
-class RGWCollectBucketSyncStatusCR : public RGWShardCollectCR {
-  static constexpr int max_concurrent_shards = 16;
-  rgw::sal::RadosStore* const driver;
-  RGWDataSyncCtx *const sc;
-  RGWDataSyncEnv *const env;
-  const uint64_t gen;
-
-  rgw_bucket_sync_pair_info sync_pair;
-  using Vector = std::vector<rgw_bucket_shard_sync_info>;
-  Vector::iterator i, end;
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to read bucket shard sync status: "
-          << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  RGWCollectBucketSyncStatusCR(rgw::sal::RadosStore* driver, RGWDataSyncCtx *sc,
-                               const rgw_bucket_sync_pair_info& sync_pair,
-                               uint64_t gen,
-                               Vector *status)
-    : RGWShardCollectCR(sc->cct, max_concurrent_shards),
-      driver(driver), sc(sc), env(sc->env), gen(gen), sync_pair(sync_pair),
-      i(status->begin()), end(status->end())
-  {}
-
-  bool spawn_next() override {
-    if (i == end) {
-      return false;
-    }
-    spawn(new RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &*i, nullptr, gen), false);
-    ++i;
-    ++sync_pair.source_bs.shard_id;
-    return true;
-  }
-};
-
-int rgw_read_bucket_full_sync_status(const DoutPrefixProvider *dpp,
-                                     rgw::sal::RadosStore *driver,
-                                     const rgw_sync_bucket_pipe& pipe,
-                                     rgw_bucket_sync_status *status,
-                                     optional_yield y)
-{
-  auto get_oid = RGWBucketPipeSyncStatusManager::full_status_oid;
-  const rgw_raw_obj obj{driver->svc()->zone->get_zone_params().log_pool,
-                        get_oid(*pipe.source.zone, *pipe.source.bucket, *pipe.dest.bucket)};
-
-  auto svc = driver->svc()->sysobj;
-  auto sysobj = svc->get_obj(obj);
-  bufferlist bl;
-  int ret = sysobj.rop().read(dpp, &bl, y);
-  if (ret < 0)
-    return ret;
-
-  try {
-    auto iter = bl.cbegin();
-    using ceph::decode;
-    rgw_bucket_sync_status result;
-    decode(result, iter);
-    *status = result;
-    return 0;
-  } catch (const buffer::error& err) {
-    lderr(svc->ctx()) << "error decoding " << obj << ": " << err.what() << dendl;
-    return -EIO;
-  }
-}
-
-int rgw_read_bucket_inc_sync_status(const DoutPrefixProvider *dpp,
-                                    rgw::sal::RadosStore *driver,
-                                    const rgw_sync_bucket_pipe& pipe,
-                                    uint64_t gen,
-                                    std::vector<rgw_bucket_shard_sync_info> *status)
-{
-  if (!pipe.source.zone ||
-      !pipe.source.bucket ||
-      !pipe.dest.zone ||
-      !pipe.dest.bucket) {
-    return -EINVAL;
-  }
-
-  rgw_bucket_sync_pair_info sync_pair;
-  sync_pair.source_bs.bucket = *pipe.source.bucket;
-  sync_pair.source_bs.shard_id = 0;
-  sync_pair.dest_bucket = *pipe.dest.bucket;
-
-  RGWDataSyncEnv env;
-  RGWSyncModuleInstanceRef module; // null sync module
-  env.init(dpp, driver->ctx(), driver, driver->svc(), driver->svc()->rados->get_async_processor(),
-           nullptr, nullptr, nullptr, module, nullptr);
-
-  RGWDataSyncCtx sc;
-  sc.init(&env, nullptr, *pipe.source.zone);
-
-  RGWCoroutinesManager crs(driver->ctx(), driver->getRados()->get_cr_registry());
-  return crs.run(dpp, new RGWCollectBucketSyncStatusCR(driver, &sc,
-                                                  sync_pair,
-                                                  gen,
-                                                  status));
-}
-
-void rgw_data_sync_info::generate_test_instances(list<rgw_data_sync_info*>& o)
-{
-  auto info = new rgw_data_sync_info;
-  info->state = rgw_data_sync_info::StateBuildingFullSyncMaps;
-  info->num_shards = 8;
-  o.push_back(info);
-  o.push_back(new rgw_data_sync_info);
-}
-
-void rgw_data_sync_marker::generate_test_instances(list<rgw_data_sync_marker*>& o)
-{
-  auto marker = new rgw_data_sync_marker;
-  marker->state = rgw_data_sync_marker::IncrementalSync;
-  marker->marker = "01234";
-  marker->pos = 5;
-  o.push_back(marker);
-  o.push_back(new rgw_data_sync_marker);
-}
-
-void rgw_data_sync_status::generate_test_instances(list<rgw_data_sync_status*>& o)
-{
-  o.push_back(new rgw_data_sync_status);
-}
-
-void rgw_bucket_shard_full_sync_marker::dump(Formatter *f) const
-{
-  encode_json("position", position, f);
-  encode_json("count", count, f);
-}
-
-void rgw_bucket_shard_inc_sync_marker::decode_json(JSONObj *obj)
-{
-  JSONDecoder::decode_json("position", position, obj);
-  JSONDecoder::decode_json("timestamp", timestamp, obj);
-}
-
-void rgw_bucket_shard_inc_sync_marker::dump(Formatter *f) const
-{
-  encode_json("position", position, f);
-  encode_json("timestamp", timestamp, f);
-}
-
-void rgw_bucket_shard_sync_info::decode_json(JSONObj *obj)
-{
-  std::string s;
-  JSONDecoder::decode_json("status", s, obj);
-  if (s == "full-sync") {
-    state = StateFullSync;
-  } else if (s == "incremental-sync") {
-    state = StateIncrementalSync;
-  } else if (s == "stopped") {
-    state = StateStopped;
-  } else {
-    state = StateInit;
-  }
-  JSONDecoder::decode_json("inc_marker", inc_marker, obj);
-}
-
-void rgw_bucket_shard_full_sync_marker::decode_json(JSONObj *obj)
-{
-  JSONDecoder::decode_json("position", position, obj);
-  JSONDecoder::decode_json("count", count, obj);
-}
-
-void rgw_bucket_shard_sync_info::dump(Formatter *f) const
-{
-  const char *s{nullptr};
-  switch ((SyncState)state) {
-    case StateInit:
-    s = "init";
-    break;
-  case StateFullSync:
-    s = "full-sync";
-    break;
-  case StateIncrementalSync:
-    s = "incremental-sync";
-    break;
-  case StateStopped:
-    s = "stopped";
-    break;
-  default:
-    s = "unknown";
-    break;
-  }
-  encode_json("status", s, f);
-  encode_json("inc_marker", inc_marker, f);
-}
-
-void rgw_bucket_full_sync_status::decode_json(JSONObj *obj)
-{
-  JSONDecoder::decode_json("position", position, obj);
-  JSONDecoder::decode_json("count", count, obj);
-}
-
-void rgw_bucket_full_sync_status::dump(Formatter *f) const
-{
-  encode_json("position", position, f);
-  encode_json("count", count, f);
-}
-
-void encode_json(const char *name, BucketSyncState state, Formatter *f)
-{
-  switch (state) {
-  case BucketSyncState::Init:
-    encode_json(name, "init", f);
-    break;
-  case BucketSyncState::Full:
-    encode_json(name, "full-sync", f);
-    break;
-  case BucketSyncState::Incremental:
-    encode_json(name, "incremental-sync", f);
-    break;
-  case BucketSyncState::Stopped:
-    encode_json(name, "stopped", f);
-    break;
-  default:
-    encode_json(name, "unknown", f);
-    break;
-  }
-}
-
-void decode_json_obj(BucketSyncState& state, JSONObj *obj)
-{
-  std::string s;
-  decode_json_obj(s, obj);
-  if (s == "full-sync") {
-    state = BucketSyncState::Full;
-  } else if (s == "incremental-sync") {
-    state = BucketSyncState::Incremental;
-  } else if (s == "stopped") {
-    state = BucketSyncState::Stopped;
-  } else {
-    state = BucketSyncState::Init;
-  }
-}
-
-void rgw_bucket_sync_status::decode_json(JSONObj *obj)
-{
-  JSONDecoder::decode_json("state", state, obj);
-  JSONDecoder::decode_json("full", full, obj);
-  JSONDecoder::decode_json("incremental_gen", incremental_gen, obj);
-}
-
-void rgw_bucket_sync_status::dump(Formatter *f) const
-{
-  encode_json("state", state, f);
-  encode_json("full", full, f);
-  encode_json("incremental_gen", incremental_gen, f);
-}
-
-
-void bilog_status_v2::dump(Formatter *f) const
-{
-  encode_json("sync_status", sync_status, f);
-  encode_json("inc_status", inc_status, f);
-}
-
-void bilog_status_v2::decode_json(JSONObj *obj)
-{
-  JSONDecoder::decode_json("sync_status", sync_status, obj);
-  JSONDecoder::decode_json("inc_status", inc_status, obj);
-}
diff --git a/src/rgw/store/rados/rgw_data_sync.h b/src/rgw/store/rados/rgw_data_sync.h
deleted file mode 100644 (file)
index 6cc714d..0000000
+++ /dev/null
@@ -1,823 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_DATA_SYNC_H
-#define CEPH_RGW_DATA_SYNC_H
-
-#undef FMT_HEADER_ONLY
-#define FMT_HEADER_ONLY 1
-#include <fmt/format.h>
-#include <fmt/ostream.h>
-
-#include "include/encoding.h"
-
-#include "common/ceph_json.h"
-#include "common/likely.h"
-
-#include "rgw_coroutine.h"
-#include "rgw_http_client.h"
-#include "rgw_sal_rados.h"
-
-#include "rgw_datalog.h"
-#include "rgw_sync.h"
-#include "rgw_sync_module.h"
-#include "rgw_sync_trace.h"
-#include "rgw_sync_policy.h"
-
-#include "rgw_bucket_sync.h"
-
-// represents an obligation to sync an entry up a given time
-struct rgw_data_sync_obligation {
-  rgw_bucket_shard bs;
-  std::optional<uint64_t> gen;
-  std::string marker;
-  ceph::real_time timestamp;
-  bool retry = false;
-};
-
-inline std::ostream& operator<<(std::ostream& out, const rgw_data_sync_obligation& o) {
-  out << "key=" << o.bs;
-  if (o.gen) {
-    out << '[' << *o.gen << ']';
-  }
-  if (!o.marker.empty()) {
-    out << " marker=" << o.marker;
-  }
-  if (o.timestamp != ceph::real_time{}) {
-    out << " timestamp=" << o.timestamp;
-  }
-  if (o.retry) {
-    out << " retry";
-  }
-  return out;
-}
-
-class JSONObj;
-struct rgw_sync_bucket_pipe;
-
-struct rgw_bucket_sync_pair_info {
-  RGWBucketSyncFlowManager::pipe_handler handler; /* responsible for sync filters */
-  rgw_bucket_shard source_bs;
-  rgw_bucket dest_bucket;
-};
-
-inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_sync_pair_info& p) {
-  if (p.source_bs.bucket == p.dest_bucket) {
-    return out << p.source_bs;
-  }
-  return out << p.source_bs << "->" << p.dest_bucket;
-}
-
-struct rgw_bucket_sync_pipe {
-  rgw_bucket_sync_pair_info info;
-  RGWBucketInfo source_bucket_info;
-  std::map<std::string, bufferlist> source_bucket_attrs;
-  RGWBucketInfo dest_bucket_info;
-  std::map<std::string, bufferlist> dest_bucket_attrs;
-
-  RGWBucketSyncFlowManager::pipe_rules_ref& get_rules() {
-    return info.handler.rules;
-  }
-};
-
-inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_sync_pipe& p) {
-  return out << p.info;
-}
-
-struct rgw_datalog_info {
-  uint32_t num_shards;
-
-  rgw_datalog_info() : num_shards(0) {}
-
-  void decode_json(JSONObj *obj);
-};
-
-struct rgw_data_sync_info {
-  enum SyncState {
-    StateInit = 0,
-    StateBuildingFullSyncMaps = 1,
-    StateSync = 2,
-  };
-
-  uint16_t state;
-  uint32_t num_shards;
-
-  uint64_t instance_id{0};
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(2, 1, bl);
-    encode(state, bl);
-    encode(num_shards, bl);
-    encode(instance_id, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-     DECODE_START(2, bl);
-     decode(state, bl);
-     decode(num_shards, bl);
-     if (struct_v >= 2) {
-       decode(instance_id, bl);
-     }
-     DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const {
-    std::string s;
-    switch ((SyncState)state) {
-      case StateInit:
-       s = "init";
-       break;
-      case StateBuildingFullSyncMaps:
-       s = "building-full-sync-maps";
-       break;
-      case StateSync:
-       s = "sync";
-       break;
-      default:
-       s = "unknown";
-       break;
-    }
-    encode_json("status", s, f);
-    encode_json("num_shards", num_shards, f);
-    encode_json("instance_id", instance_id, f);
-  }
-  void decode_json(JSONObj *obj) {
-    std::string s;
-    JSONDecoder::decode_json("status", s, obj);
-    if (s == "building-full-sync-maps") {
-      state = StateBuildingFullSyncMaps;
-    } else if (s == "sync") {
-      state = StateSync;
-    } else {
-      state = StateInit;
-    }
-    JSONDecoder::decode_json("num_shards", num_shards, obj);
-    JSONDecoder::decode_json("instance_id", instance_id, obj);
-  }
-  static void generate_test_instances(std::list<rgw_data_sync_info*>& o);
-
-  rgw_data_sync_info() : state((int)StateInit), num_shards(0) {}
-};
-WRITE_CLASS_ENCODER(rgw_data_sync_info)
-
-struct rgw_data_sync_marker {
-  enum SyncState {
-    FullSync = 0,
-    IncrementalSync = 1,
-  };
-  uint16_t state;
-  std::string marker;
-  std::string next_step_marker;
-  uint64_t total_entries;
-  uint64_t pos;
-  real_time timestamp;
-
-  rgw_data_sync_marker() : state(FullSync), total_entries(0), pos(0) {}
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(state, bl);
-    encode(marker, bl);
-    encode(next_step_marker, bl);
-    encode(total_entries, bl);
-    encode(pos, bl);
-    encode(timestamp, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-     DECODE_START(1, bl);
-    decode(state, bl);
-    decode(marker, bl);
-    decode(next_step_marker, bl);
-    decode(total_entries, bl);
-    decode(pos, bl);
-    decode(timestamp, bl);
-     DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const {
-    const char *s{nullptr};
-    switch ((SyncState)state) {
-      case FullSync:
-        s = "full-sync";
-        break;
-      case IncrementalSync:
-        s = "incremental-sync";
-        break;
-      default:
-        s = "unknown";
-        break;
-    }
-    encode_json("status", s, f);
-    encode_json("marker", marker, f);
-    encode_json("next_step_marker", next_step_marker, f);
-    encode_json("total_entries", total_entries, f);
-    encode_json("pos", pos, f);
-    encode_json("timestamp", utime_t(timestamp), f);
-  }
-  void decode_json(JSONObj *obj) {
-    std::string s;
-    JSONDecoder::decode_json("status", s, obj);
-    if (s == "full-sync") {
-      state = FullSync;
-    } else if (s == "incremental-sync") {
-      state = IncrementalSync;
-    }
-    JSONDecoder::decode_json("marker", marker, obj);
-    JSONDecoder::decode_json("next_step_marker", next_step_marker, obj);
-    JSONDecoder::decode_json("total_entries", total_entries, obj);
-    JSONDecoder::decode_json("pos", pos, obj);
-    utime_t t;
-    JSONDecoder::decode_json("timestamp", t, obj);
-    timestamp = t.to_real_time();
-  }
-  static void generate_test_instances(std::list<rgw_data_sync_marker*>& o);
-};
-WRITE_CLASS_ENCODER(rgw_data_sync_marker)
-
-struct rgw_data_sync_status {
-  rgw_data_sync_info sync_info;
-  std::map<uint32_t, rgw_data_sync_marker> sync_markers;
-
-  rgw_data_sync_status() {}
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(sync_info, bl);
-    /* sync markers are encoded separately */
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-     DECODE_START(1, bl);
-    decode(sync_info, bl);
-    /* sync markers are decoded separately */
-     DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const {
-    encode_json("info", sync_info, f);
-    encode_json("markers", sync_markers, f);
-  }
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("info", sync_info, obj);
-    JSONDecoder::decode_json("markers", sync_markers, obj);
-  }
-  static void generate_test_instances(std::list<rgw_data_sync_status*>& o);
-};
-WRITE_CLASS_ENCODER(rgw_data_sync_status)
-
-struct rgw_datalog_entry {
-  std::string key;
-  ceph::real_time timestamp;
-
-  void decode_json(JSONObj *obj);
-};
-
-struct rgw_datalog_shard_data {
-  std::string marker;
-  bool truncated;
-  std::vector<rgw_datalog_entry> entries;
-
-  void decode_json(JSONObj *obj);
-};
-
-class RGWAsyncRadosProcessor;
-class RGWDataSyncControlCR;
-
-struct rgw_bucket_entry_owner {
-  std::string id;
-  std::string display_name;
-
-  rgw_bucket_entry_owner() {}
-  rgw_bucket_entry_owner(const std::string& _id, const std::string& _display_name) : id(_id), display_name(_display_name) {}
-
-  void decode_json(JSONObj *obj);
-};
-
-class RGWSyncErrorLogger;
-class RGWRESTConn;
-class RGWServices;
-
-struct RGWDataSyncEnv {
-  const DoutPrefixProvider *dpp{nullptr};
-  CephContext *cct{nullptr};
-  rgw::sal::RadosStore* driver{nullptr};
-  RGWServices *svc{nullptr};
-  RGWAsyncRadosProcessor *async_rados{nullptr};
-  RGWHTTPManager *http_manager{nullptr};
-  RGWSyncErrorLogger *error_logger{nullptr};
-  RGWSyncTraceManager *sync_tracer{nullptr};
-  RGWSyncModuleInstanceRef sync_module{nullptr};
-  PerfCounters* counters{nullptr};
-
-  RGWDataSyncEnv() {}
-
-  void init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _driver, RGWServices *_svc,
-            RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
-            RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer,
-            RGWSyncModuleInstanceRef& _sync_module,
-            PerfCounters* _counters) {
-     dpp = _dpp;
-    cct = _cct;
-    driver = _driver;
-    svc = _svc;
-    async_rados = _async_rados;
-    http_manager = _http_manager;
-    error_logger = _error_logger;
-    sync_tracer = _sync_tracer;
-    sync_module = _sync_module;
-    counters = _counters;
-  }
-
-  std::string shard_obj_name(int shard_id);
-  std::string status_oid();
-
-  std::ostream* ostr{nullptr}; // For pretty printing progress
-};
-
-// pretty ostream output for `radosgw-admin bucket sync run`
-template<typename ...T>
-void pretty_print(const RGWDataSyncEnv* env, T&& ...t) {
-  if (unlikely(!!env->ostr)) {
-    fmt::print(*env->ostr, std::forward<T>(t)...);
-    env->ostr->flush();
-  }
-}
-
-struct RGWDataSyncCtx {
-  RGWDataSyncEnv *env{nullptr};
-  CephContext *cct{nullptr};
-
-  RGWRESTConn *conn{nullptr};
-  rgw_zone_id source_zone;
-
-  RGWDataSyncCtx() = default;
-
-  RGWDataSyncCtx(RGWDataSyncEnv* env,
-                RGWRESTConn* conn,
-                const rgw_zone_id& source_zone)
-    : env(env), cct(env->cct), conn(conn), source_zone(source_zone) {}
-
-  void init(RGWDataSyncEnv *_env,
-            RGWRESTConn *_conn,
-            const rgw_zone_id& _source_zone) {
-    cct = _env->cct;
-    env = _env;
-    conn = _conn;
-    source_zone = _source_zone;
-  }
-};
-
-class RGWRados;
-
-class RGWRemoteDataLog : public RGWCoroutinesManager {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* driver;
-  CephContext *cct;
-  RGWCoroutinesManagerRegistry *cr_registry;
-  RGWAsyncRadosProcessor *async_rados;
-  RGWHTTPManager http_manager;
-
-  RGWDataSyncEnv sync_env;
-  RGWDataSyncCtx sc;
-
-  ceph::shared_mutex lock = ceph::make_shared_mutex("RGWRemoteDataLog::lock");
-  RGWDataSyncControlCR *data_sync_cr;
-
-  RGWSyncTraceNodeRef tn;
-
-  bool initialized;
-
-public:
-  RGWRemoteDataLog(const DoutPrefixProvider *dpp,
-                   rgw::sal::RadosStore* _store,
-                   RGWAsyncRadosProcessor *async_rados);
-  int init(const rgw_zone_id& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger,
-           RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& module,
-           PerfCounters* _counters);
-  void finish();
-
-  int read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info);
-  int read_source_log_shards_info(const DoutPrefixProvider *dpp, std::map<int, RGWDataChangesLogInfo> *shards_info);
-  int read_source_log_shards_next(const DoutPrefixProvider *dpp, std::map<int, std::string> shard_markers, std::map<int, rgw_datalog_shard_data> *result);
-  int read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status);
-  int read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, std::set<int>& recovering_shards);
-  int read_shard_status(const DoutPrefixProvider *dpp, int shard_id, std::set<std::string>& lagging_buckets,std::set<std::string>& recovering_buckets, rgw_data_sync_marker* sync_marker, const int max_entries);
-  int init_sync_status(const DoutPrefixProvider *dpp, int num_shards);
-  int run_sync(const DoutPrefixProvider *dpp, int num_shards);
-
-  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries);
-};
-
-class RGWDataSyncStatusManager : public DoutPrefixProvider {
-  rgw::sal::RadosStore* driver;
-
-  rgw_zone_id source_zone;
-  RGWRESTConn *conn;
-  RGWSyncErrorLogger *error_logger;
-  RGWSyncModuleInstanceRef sync_module;
-  PerfCounters* counters;
-
-  RGWRemoteDataLog source_log;
-
-  std::string source_status_oid;
-  std::string source_shard_status_oid_prefix;
-
-  std::map<int, rgw_raw_obj> shard_objs;
-
-  int num_shards;
-
-public:
-  RGWDataSyncStatusManager(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
-                           const rgw_zone_id& _source_zone, PerfCounters* counters)
-    : driver(_driver), source_zone(_source_zone), conn(NULL), error_logger(NULL),
-      sync_module(nullptr), counters(counters),
-      source_log(this, driver, async_rados), num_shards(0) {}
-  RGWDataSyncStatusManager(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
-                           const rgw_zone_id& _source_zone, PerfCounters* counters,
-                           const RGWSyncModuleInstanceRef& _sync_module)
-    : driver(_driver), source_zone(_source_zone), conn(NULL), error_logger(NULL),
-      sync_module(_sync_module), counters(counters),
-      source_log(this, driver, async_rados), num_shards(0) {}
-  ~RGWDataSyncStatusManager() {
-    finalize();
-  }
-  int init(const DoutPrefixProvider *dpp);
-  void finalize();
-
-  static std::string shard_obj_name(const rgw_zone_id& source_zone, int shard_id);
-  static std::string sync_status_oid(const rgw_zone_id& source_zone);
-
-  int read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status) {
-    return source_log.read_sync_status(dpp, sync_status);
-  }
-
-  int read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, std::set<int>& recovering_shards) {
-    return source_log.read_recovering_shards(dpp, num_shards, recovering_shards);
-  }
-
-  int read_shard_status(const DoutPrefixProvider *dpp, int shard_id, std::set<std::string>& lagging_buckets, std::set<std::string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries) {
-    return source_log.read_shard_status(dpp, shard_id, lagging_buckets, recovering_buckets,sync_marker, max_entries);
-  }
-  int init_sync_status(const DoutPrefixProvider *dpp) { return source_log.init_sync_status(dpp, num_shards); }
-
-  int read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info) {
-    return source_log.read_log_info(dpp, log_info);
-  }
-  int read_source_log_shards_info(const DoutPrefixProvider *dpp, std::map<int, RGWDataChangesLogInfo> *shards_info) {
-    return source_log.read_source_log_shards_info(dpp, shards_info);
-  }
-  int read_source_log_shards_next(const DoutPrefixProvider *dpp, std::map<int, std::string> shard_markers, std::map<int, rgw_datalog_shard_data> *result) {
-    return source_log.read_source_log_shards_next(dpp, shard_markers, result);
-  }
-
-  int run(const DoutPrefixProvider *dpp) { return source_log.run_sync(dpp, num_shards); }
-
-  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) { return source_log.wakeup(shard_id, entries); }
-
-  void stop() {
-    source_log.finish();
-  }
-
-  // implements DoutPrefixProvider
-  CephContext *get_cct() const override;
-  unsigned get_subsys() const override;
-  std::ostream& gen_prefix(std::ostream& out) const override;
-};
-
-class RGWBucketPipeSyncStatusManager;
-class RGWBucketSyncCR;
-
-struct rgw_bucket_shard_full_sync_marker {
-  rgw_obj_key position;
-  uint64_t count;
-
-  rgw_bucket_shard_full_sync_marker() : count(0) {}
-
-  void encode_attr(std::map<std::string, bufferlist>& attrs);
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(position, bl);
-    encode(count, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-     DECODE_START(1, bl);
-    decode(position, bl);
-    decode(count, bl);
-     DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(rgw_bucket_shard_full_sync_marker)
-
-struct rgw_bucket_shard_inc_sync_marker {
-  std::string position;
-  ceph::real_time timestamp;
-
-  void encode_attr(std::map<std::string, bufferlist>& attrs);
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(2, 1, bl);
-    encode(position, bl);
-    encode(timestamp, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(2, bl);
-    decode(position, bl);
-    if (struct_v >= 2) {
-      decode(timestamp, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(rgw_bucket_shard_inc_sync_marker)
-
-struct rgw_bucket_shard_sync_info {
-  enum SyncState {
-    StateInit = 0,
-    StateFullSync = 1,
-    StateIncrementalSync = 2,
-    StateStopped = 3,
-  };
-
-  uint16_t state;
-  rgw_bucket_shard_inc_sync_marker inc_marker;
-
-  void decode_from_attrs(CephContext *cct, std::map<std::string, bufferlist>& attrs);
-  void encode_all_attrs(std::map<std::string, bufferlist>& attrs);
-  void encode_state_attr(std::map<std::string, bufferlist>& attrs);
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(2, 1, bl);
-    encode(state, bl);
-    encode(inc_marker, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-     DECODE_START(2, bl);
-     decode(state, bl);
-     if (struct_v <= 1) {
-       rgw_bucket_shard_full_sync_marker full_marker;
-       decode(full_marker, bl);
-     }
-     decode(inc_marker, bl);
-     DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-
-  rgw_bucket_shard_sync_info() : state((int)StateInit) {}
-
-};
-WRITE_CLASS_ENCODER(rgw_bucket_shard_sync_info)
-
// Progress of a bucket's full-sync pass: the last object key processed
// and a running count of entries synced so far.
struct rgw_bucket_full_sync_status {
  rgw_obj_key position;  // last key completed in the full-sync listing
  uint64_t count = 0;    // number of entries synced so far

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(position, bl);
    encode(count, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(position, bl);
    decode(count, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
};
WRITE_CLASS_ENCODER(rgw_bucket_full_sync_status)
-
// Overall bucket sync state machine, persisted as a single byte.
enum class BucketSyncState : uint8_t {
  Init = 0,
  Full,
  Incremental,
  Stopped,
};

// Stream a BucketSyncState as its lowercase status name; unknown values
// produce no output, matching the enumerator-only switch.
inline std::ostream& operator<<(std::ostream& out, const BucketSyncState& s) {
  const char* name = "";
  switch (s) {
  case BucketSyncState::Init: name = "init"; break;
  case BucketSyncState::Full: name = "full"; break;
  case BucketSyncState::Incremental: name = "incremental"; break;
  case BucketSyncState::Stopped: name = "stopped"; break;
  }
  return out << name;
}
-
-void encode_json(const char *name, BucketSyncState state, Formatter *f);
-void decode_json_obj(BucketSyncState& state, JSONObj *obj);
-
// Bucket-wide sync status: current state-machine phase, full-sync
// progress, and which shards have finished the current incremental
// generation.
struct rgw_bucket_sync_status {
  BucketSyncState state = BucketSyncState::Init;
  rgw_bucket_full_sync_status full;     // progress of the full-sync pass
  uint64_t incremental_gen = 0;         // active incremental log generation
  std::vector<bool> shards_done_with_gen;  // per-shard completion flags for incremental_gen

  void encode(bufferlist& bl) const {
    ENCODE_START(2, 1, bl);
    encode(state, bl);
    encode(full, bl);
    encode(incremental_gen, bl);
    encode(shards_done_with_gen, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(2, bl);
    decode(state, bl);
    decode(full, bl);
    if (struct_v > 1) {
      // generation tracking was added in v2; v1 decoders leave the defaults
      decode(incremental_gen, bl);
      decode(shards_done_with_gen, bl);
    }
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
};
WRITE_CLASS_ENCODER(rgw_bucket_sync_status)
-
// Aggregate view combining the bucket-wide sync status with the
// per-shard incremental status vector (one entry per shard).
struct bilog_status_v2 {
  rgw_bucket_sync_status sync_status;
  std::vector<rgw_bucket_shard_sync_info> inc_status;

  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
};
-
// JSON-serializable pair describing one bilog generation and the number
// of shards it was written with.
struct store_gen_shards {
  uint64_t gen = 0;
  uint32_t num_shards = 0;

  void dump(Formatter *f) const {
    encode_json("gen", gen, f);
    encode_json("num_shards", num_shards, f);
  }

  void decode_json(JSONObj *obj) {
    JSONDecoder::decode_json("gen", gen, obj);
    JSONDecoder::decode_json("num_shards", num_shards, obj);
  }
};
-
// Bucket index log marker info as returned by a remote zone's bilog
// REST endpoint; decoded from the JSON response only (no encoder).
struct rgw_bucket_index_marker_info {
  std::string bucket_ver;
  std::string master_ver;
  std::string max_marker;
  bool syncstopped{false};            // remote has sync disabled for this bucket
  uint64_t oldest_gen = 0;
  uint64_t latest_gen = 0;
  std::vector<store_gen_shards> generations;  // shard counts per generation

  void decode_json(JSONObj *obj) {
    JSONDecoder::decode_json("bucket_ver", bucket_ver, obj);
    JSONDecoder::decode_json("master_ver", master_ver, obj);
    JSONDecoder::decode_json("max_marker", max_marker, obj);
    JSONDecoder::decode_json("syncstopped", syncstopped, obj);
    JSONDecoder::decode_json("oldest_gen", oldest_gen, obj);
    JSONDecoder::decode_json("latest_gen", latest_gen, obj);
    JSONDecoder::decode_json("generations", generations, obj);
  }
};
-
-
-class BucketIndexShardsManager;
-
-int rgw_read_remote_bilog_info(const DoutPrefixProvider *dpp,
-                               RGWRESTConn* conn,
-                               const rgw_bucket& bucket,
-                               rgw_bucket_index_marker_info& info,
-                               BucketIndexShardsManager& markers,
-                               optional_yield y);
-
// Manages sync status for one destination bucket across all of its
// source pipes. Construct via construct(); the private constructor plus
// do_init() perform the two-phase setup.
class RGWBucketPipeSyncStatusManager : public DoutPrefixProvider {
  rgw::sal::RadosStore* driver;

  RGWDataSyncEnv sync_env;

  // NOTE: these NSDMIs dereference `driver`, so they rely on `driver`
  // being declared (and thus initialized) before them.
  RGWCoroutinesManager cr_mgr{driver->ctx(),
                              driver->getRados()->get_cr_registry()};

  RGWHTTPManager http_manager{driver->ctx(), cr_mgr.get_completion_mgr()};

  // nullopt means "all zones" / "same bucket as destination" respectively
  std::optional<rgw_zone_id> source_zone;
  std::optional<rgw_bucket> source_bucket;

  std::unique_ptr<RGWSyncErrorLogger> error_logger =
    std::make_unique<RGWSyncErrorLogger>(driver, RGW_SYNC_ERROR_LOG_SHARD_PREFIX,
                                        ERROR_LOGGER_SHARDS);
  RGWSyncModuleInstanceRef sync_module;

  rgw_bucket dest_bucket;

  // One sync source: its data-sync context, bucket info, destination
  // bucket, flow handler, and human-readable zone name for logging.
  struct source {
    RGWDataSyncCtx sc;
    RGWBucketInfo info;
    rgw_bucket dest;
    RGWBucketSyncFlowManager::pipe_handler handler;
    std::string zone_name;

    source(RGWDataSyncEnv* env, const rgw_zone_id& zone, RGWRESTConn* conn,
          const RGWBucketInfo& info, const rgw_bucket& dest,
          const RGWBucketSyncFlowManager::pipe_handler& handler,
          const std::string& zone_name)
      : sc(env, conn, zone), info(info), dest(dest), handler(handler),
       zone_name(zone_name) {}
  };
  std::vector<source> sources;

  int do_init(const DoutPrefixProvider *dpp, std::ostream* ostr);
  RGWBucketPipeSyncStatusManager(rgw::sal::RadosStore* driver,
                                std::optional<rgw_zone_id> source_zone,
                                std::optional<rgw_bucket> source_bucket,
                                const rgw_bucket& dest_bucket)
    : driver(driver), source_zone(source_zone), source_bucket(source_bucket),
      dest_bucket(dest_bucket) {}

  int remote_info(const DoutPrefixProvider *dpp, source& s,
                 uint64_t* oldest_gen, uint64_t* latest_gen,
                 uint64_t* num_shards);
public:
  // Factory: builds and initializes a manager, or returns a negative
  // error code on failure.
  static tl::expected<std::unique_ptr<RGWBucketPipeSyncStatusManager>, int>
  construct(const DoutPrefixProvider* dpp, rgw::sal::RadosStore* driver,
           std::optional<rgw_zone_id> source_zone,
           std::optional<rgw_bucket> source_bucket,
           const rgw_bucket& dest_bucket, std::ostream *ostream);
  ~RGWBucketPipeSyncStatusManager() = default;


  // RADOS object names under which the various status records are stored
  static std::string full_status_oid(const rgw_zone_id& source_zone,
                                    const rgw_bucket& source_bucket,
                                    const rgw_bucket& dest_bucket);
  static std::string inc_status_oid(const rgw_zone_id& source_zone,
                                   const rgw_bucket_sync_pair_info& bs,
                                   uint64_t gen);
  // specific source obj sync status, can be used by sync modules
  static std::string obj_status_oid(const rgw_bucket_sync_pipe& sync_pipe,
                                   const rgw_zone_id& source_zone, const rgw::sal::Object* obj); /* specific source obj sync status,
                                                                                      can be used by sync modules */

  // implements DoutPrefixProvider
  CephContext *get_cct() const override;
  unsigned get_subsys() const override;
  std::ostream& gen_prefix(std::ostream& out) const override;

  int init_sync_status(const DoutPrefixProvider *dpp);
  tl::expected<std::map<int, rgw_bucket_shard_sync_info>, int> read_sync_status(
    const DoutPrefixProvider *dpp);
  int run(const DoutPrefixProvider *dpp);
};
-
-/// read the full sync status with respect to a source bucket
-int rgw_read_bucket_full_sync_status(const DoutPrefixProvider *dpp,
-                                     rgw::sal::RadosStore *driver,
-                                     const rgw_sync_bucket_pipe& pipe,
-                                     rgw_bucket_sync_status *status,
-                                     optional_yield y);
-
-/// read the incremental sync status of all bucket shards from the given source zone
-int rgw_read_bucket_inc_sync_status(const DoutPrefixProvider *dpp,
-                                    rgw::sal::RadosStore *driver,
-                                    const rgw_sync_bucket_pipe& pipe,
-                                    uint64_t gen,
-                                    std::vector<rgw_bucket_shard_sync_info> *status);
-
// Default sync module: full object replication between zones, with
// both writes and data export enabled.
class RGWDefaultSyncModule : public RGWSyncModule {
public:
  RGWDefaultSyncModule() {}
  bool supports_writes() override { return true; }
  bool supports_data_export() override { return true; }
  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
};
-
// Archive sync module: accepts writes like the default module but does
// not export data, so the archive zone is a sync sink only.
class RGWArchiveSyncModule : public RGWDefaultSyncModule {
public:
  RGWArchiveSyncModule() {}
  bool supports_writes() override { return true; }
  bool supports_data_export() override { return false; }
  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
};
-
-#endif
diff --git a/src/rgw/store/rados/rgw_datalog.cc b/src/rgw/store/rados/rgw_datalog.cc
deleted file mode 100644 (file)
index 3eeb820..0000000
+++ /dev/null
@@ -1,1065 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include <vector>
-
-#include "common/debug.h"
-#include "common/containers.h"
-#include "common/errno.h"
-#include "common/error_code.h"
-
-#include "common/async/blocked_completion.h"
-#include "common/async/librados_completion.h"
-
-#include "cls/fifo/cls_fifo_types.h"
-#include "cls/log/cls_log_client.h"
-
-#include "cls_fifo_legacy.h"
-#include "rgw_bucket_layout.h"
-#include "rgw_datalog.h"
-#include "rgw_log_backing.h"
-#include "rgw_tools.h"
-
-#define dout_context g_ceph_context
-static constexpr auto dout_subsys = ceph_subsys_rgw;
-
-namespace bs = boost::system;
-namespace lr = librados;
-
-using ceph::containers::tiny_vector;
-
-void rgw_data_change::dump(ceph::Formatter *f) const
-{
-  std::string type;
-  switch (entity_type) {
-    case ENTITY_TYPE_BUCKET:
-      type = "bucket";
-      break;
-    default:
-      type = "unknown";
-  }
-  encode_json("entity_type", type, f);
-  encode_json("key", key, f);
-  utime_t ut(timestamp);
-  encode_json("timestamp", ut, f);
-  encode_json("gen", gen, f);
-}
-
-void rgw_data_change::decode_json(JSONObj *obj) {
-  std::string s;
-  JSONDecoder::decode_json("entity_type", s, obj);
-  if (s == "bucket") {
-    entity_type = ENTITY_TYPE_BUCKET;
-  } else {
-    entity_type = ENTITY_TYPE_UNKNOWN;
-  }
-  JSONDecoder::decode_json("key", key, obj);
-  utime_t ut;
-  JSONDecoder::decode_json("timestamp", ut, obj);
-  timestamp = ut.to_real_time();
-  JSONDecoder::decode_json("gen", gen, obj);
-}
-
-void rgw_data_change_log_entry::dump(Formatter *f) const
-{
-  encode_json("log_id", log_id, f);
-  utime_t ut(log_timestamp);
-  encode_json("log_timestamp", ut, f);
-  encode_json("entry", entry, f);
-}
-
-void rgw_data_change_log_entry::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("log_id", log_id, obj);
-  utime_t ut;
-  JSONDecoder::decode_json("log_timestamp", ut, obj);
-  log_timestamp = ut.to_real_time();
-  JSONDecoder::decode_json("entry", entry, obj);
-}
-
// JSON-dump a data-changes notification entry (bucket-shard key + generation).
void rgw_data_notify_entry::dump(Formatter *f) const
{
  encode_json("key", key, f);
  encode_json("gen", gen, f);
}
-
// Inverse of dump(): parse a notification entry from JSON.
void rgw_data_notify_entry::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("key", key, obj);
  JSONDecoder::decode_json("gen", gen, obj);
}
-
-class RGWDataChangesOmap final : public RGWDataChangesBE {
-  using centries = std::list<cls_log_entry>;
-  std::vector<std::string> oids;
-
-public:
-  RGWDataChangesOmap(lr::IoCtx& ioctx,
-                    RGWDataChangesLog& datalog,
-                    uint64_t gen_id,
-                    int num_shards)
-    : RGWDataChangesBE(ioctx, datalog, gen_id) {
-    oids.reserve(num_shards);
-    for (auto i = 0; i < num_shards; ++i) {
-      oids.push_back(get_oid(i));
-    }
-  }
-  ~RGWDataChangesOmap() override = default;
-
-  void prepare(ceph::real_time ut, const std::string& key,
-              ceph::buffer::list&& entry, entries& out) override {
-    if (!std::holds_alternative<centries>(out)) {
-      ceph_assert(std::visit([](const auto& v) { return std::empty(v); }, out));
-      out = centries();
-    }
-
-    cls_log_entry e;
-    cls_log_add_prepare_entry(e, utime_t(ut), {}, key, entry);
-    std::get<centries>(out).push_back(std::move(e));
-  }
-  int push(const DoutPrefixProvider *dpp, int index, entries&& items) override {
-    lr::ObjectWriteOperation op;
-    cls_log_add(op, std::get<centries>(items), true);
-    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, null_yield);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
-                << ": failed to push to " << oids[index] << cpp_strerror(-r)
-                << dendl;
-    }
-    return r;
-  }
-  int push(const DoutPrefixProvider *dpp, int index, ceph::real_time now,
-          const std::string& key,
-          ceph::buffer::list&& bl) override {
-    lr::ObjectWriteOperation op;
-    cls_log_add(op, utime_t(now), {}, key, bl);
-    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, null_yield);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
-                << ": failed to push to " << oids[index]
-                << cpp_strerror(-r) << dendl;
-    }
-    return r;
-  }
-  int list(const DoutPrefixProvider *dpp, int index, int max_entries,
-          std::vector<rgw_data_change_log_entry>& entries,
-          std::optional<std::string_view> marker,
-          std::string* out_marker, bool* truncated) override {
-    std::list<cls_log_entry> log_entries;
-    lr::ObjectReadOperation op;
-    cls_log_list(op, {}, {}, std::string(marker.value_or("")),
-                max_entries, log_entries, out_marker, truncated);
-    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, nullptr, null_yield);
-    if (r == -ENOENT) {
-      *truncated = false;
-      return 0;
-    }
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
-                << ": failed to list " << oids[index]
-                << cpp_strerror(-r) << dendl;
-      return r;
-    }
-    for (auto iter = log_entries.begin(); iter != log_entries.end(); ++iter) {
-      rgw_data_change_log_entry log_entry;
-      log_entry.log_id = iter->id;
-      auto rt = iter->timestamp.to_real_time();
-      log_entry.log_timestamp = rt;
-      auto liter = iter->data.cbegin();
-      try {
-       decode(log_entry.entry, liter);
-      } catch (ceph::buffer::error& err) {
-       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
-                  << ": failed to decode data changes log entry: "
-                  << err.what() << dendl;
-       return -EIO;
-      }
-      entries.push_back(log_entry);
-    }
-    return 0;
-  }
-  int get_info(const DoutPrefixProvider *dpp, int index, RGWDataChangesLogInfo *info) override {
-    cls_log_header header;
-    lr::ObjectReadOperation op;
-    cls_log_info(op, &header);
-    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, nullptr, null_yield);
-    if (r == -ENOENT) r = 0;
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
-                << ": failed to get info from " << oids[index]
-                << cpp_strerror(-r) << dendl;
-    } else {
-      info->marker = header.max_marker;
-      info->last_update = header.max_time.to_real_time();
-    }
-    return r;
-  }
-  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker) override {
-    lr::ObjectWriteOperation op;
-    cls_log_trim(op, {}, {}, {}, std::string(marker));
-    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, null_yield);
-    if (r == -ENOENT) r = -ENODATA;
-    if (r < 0 && r != -ENODATA) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
-                << ": failed to get info from " << oids[index]
-                << cpp_strerror(-r) << dendl;
-    }
-    return r;
-  }
-  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
-          lr::AioCompletion* c) override {
-    lr::ObjectWriteOperation op;
-    cls_log_trim(op, {}, {}, {}, std::string(marker));
-    auto r = ioctx.aio_operate(oids[index], c, &op, 0);
-    if (r == -ENOENT) r = -ENODATA;
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
-                << ": failed to get info from " << oids[index]
-                << cpp_strerror(-r) << dendl;
-    }
-    return r;
-  }
-  std::string_view max_marker() const override {
-    return "99999999";
-  }
-  int is_empty(const DoutPrefixProvider *dpp) override {
-    for (auto shard = 0u; shard < oids.size(); ++shard) {
-      std::list<cls_log_entry> log_entries;
-      lr::ObjectReadOperation op;
-      std::string out_marker;
-      bool truncated;
-      cls_log_list(op, {}, {}, {}, 1, log_entries, &out_marker, &truncated);
-      auto r = rgw_rados_operate(dpp, ioctx, oids[shard], &op, nullptr, null_yield);
-      if (r == -ENOENT) {
-       continue;
-      }
-      if (r < 0) {
-       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
-                  << ": failed to list " << oids[shard]
-                  << cpp_strerror(-r) << dendl;
-       return r;
-      }
-      if (!log_entries.empty()) {
-       return 0;
-      }
-    }
-    return 1;
-  }
-};
-
// FIFO backend for the data changes log: one lazily-opened FIFO per
// shard instead of a cls_log omap object.
class RGWDataChangesFIFO final : public RGWDataChangesBE {
  using centries = std::vector<ceph::buffer::list>;
  tiny_vector<LazyFIFO> fifos;  // per-shard FIFOs, opened on first use

public:
  RGWDataChangesFIFO(lr::IoCtx& ioctx,
                    RGWDataChangesLog& datalog,
                    uint64_t gen_id, int shards)
    : RGWDataChangesBE(ioctx, datalog, gen_id),
      fifos(shards, [&ioctx, this](std::size_t i, auto emplacer) {
       emplacer.emplace(ioctx, get_oid(i));
      }) {}
  ~RGWDataChangesFIFO() override = default;
  // Stage one entry into the batch container used by push(entries&&).
  // Time and key are unused: the FIFO stores only the raw bufferlist.
  void prepare(ceph::real_time, const std::string&,
              ceph::buffer::list&& entry, entries& out) override {
    if (!std::holds_alternative<centries>(out)) {
      // `out` must either already hold our container type or be empty
      ceph_assert(std::visit([](auto& v) { return std::empty(v); }, out));
      out = centries();
    }
    std::get<centries>(out).push_back(std::move(entry));
  }
  // Write a prepared batch to the shard FIFO.
  int push(const DoutPrefixProvider *dpp, int index, entries&& items) override {
    auto r = fifos[index].push(dpp, std::get<centries>(items), null_yield);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
                << ": unable to push to FIFO: " << get_oid(index)
                << ": " << cpp_strerror(-r) << dendl;
    }
    return r;
  }
  // Write a single entry to the shard FIFO.
  int push(const DoutPrefixProvider *dpp, int index, ceph::real_time,
          const std::string&,
          ceph::buffer::list&& bl) override {
    auto r = fifos[index].push(dpp, std::move(bl), null_yield);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
                << ": unable to push to FIFO: " << get_oid(index)
                << ": " << cpp_strerror(-r) << dendl;
    }
    return r;
  }
  // List up to max_entries from the shard FIFO starting after `marker`,
  // decoding each payload into an rgw_data_change_log_entry.
  int list(const DoutPrefixProvider *dpp, int index, int max_entries,
          std::vector<rgw_data_change_log_entry>& entries,
          std::optional<std::string_view> marker,
          std::string* out_marker, bool* truncated) override {
    std::vector<rgw::cls::fifo::list_entry> log_entries;
    bool more = false;
    auto r = fifos[index].list(dpp, max_entries, marker, &log_entries, &more,
                              null_yield);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
                << ": unable to list FIFO: " << get_oid(index)
                << ": " << cpp_strerror(-r) << dendl;
      return r;
    }
    for (const auto& entry : log_entries) {
      rgw_data_change_log_entry log_entry;
      log_entry.log_id = entry.marker;
      log_entry.log_timestamp = entry.mtime;
      auto liter = entry.data.cbegin();
      try {
       decode(log_entry.entry, liter);
      } catch (const buffer::error& err) {
       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
                  << ": failed to decode data changes log entry: "
                  << err.what() << dendl;
       return -EIO;
      }
      entries.push_back(std::move(log_entry));
    }
    if (truncated)
      *truncated = more;
    if (out_marker && !log_entries.empty()) {
      *out_marker = log_entries.back().marker;
    }
    return 0;
  }
  // Derive the shard's marker/last-update from the FIFO head part; an
  // uninitialized FIFO (negative head part) reports empty info.
  int get_info(const DoutPrefixProvider *dpp, int index, RGWDataChangesLogInfo *info) override {
    auto& fifo = fifos[index];
    auto r = fifo.read_meta(dpp, null_yield);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
                << ": unable to get FIFO metadata: " << get_oid(index)
                << ": " << cpp_strerror(-r) << dendl;
      return r;
    }
    rados::cls::fifo::info m;
    fifo.meta(dpp, m, null_yield);
    auto p = m.head_part_num;
    if (p < 0) {
      info->marker = "";
      info->last_update = ceph::real_clock::zero();
      return 0;
    }
    rgw::cls::fifo::part_info h;
    r = fifo.get_part_info(dpp, p, &h, null_yield);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
                << ": unable to get part info: " << get_oid(index) << "/" << p
                << ": " << cpp_strerror(-r) << dendl;
      return r;
    }
    info->marker = rgw::cls::fifo::marker{p, h.last_ofs}.to_string();
    info->last_update = h.max_time;
    return 0;
  }
  // Synchronously trim the shard FIFO up to `marker` (non-exclusive).
  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker) override {
    auto r = fifos[index].trim(dpp, marker, false, null_yield);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
                << ": unable to trim FIFO: " << get_oid(index)
                << ": " << cpp_strerror(-r) << dendl;
    }
    return r;
  }
  // Asynchronous trim. Trimming to the minimum marker means nothing to
  // trim, so the completion is resolved immediately with -ENODATA.
  // NOTE(review): the async trim's submission result is not captured, so
  // r is always 0 here — confirm this matches LazyFIFO::trim's contract.
  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
          librados::AioCompletion* c) override {
    int r = 0;
    if (marker == rgw::cls::fifo::marker(0, 0).to_string()) {
      rgw_complete_aio_completion(c, -ENODATA);
    } else {
      fifos[index].trim(dpp, marker, false, c, null_yield);
    }
    return r;
  }
  std::string_view max_marker() const override {
    static const std::string mm =
      rgw::cls::fifo::marker::max().to_string();
    return std::string_view(mm);
  }
  // Returns 1 if every shard FIFO is empty, 0 if any entry exists,
  // negative errno on failure.
  int is_empty(const DoutPrefixProvider *dpp) override {
    std::vector<rgw::cls::fifo::list_entry> log_entries;
    bool more = false;
    for (auto shard = 0u; shard < fifos.size(); ++shard) {
      auto r = fifos[shard].list(dpp, 1, {}, &log_entries, &more,
                                null_yield);
      if (r < 0) {
       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
                  << ": unable to list FIFO: " << get_oid(shard)
                  << ": " << cpp_strerror(-r) << dendl;
       return r;
      }
      if (!log_entries.empty()) {
       return 0;
      }
    }
    return 1;
  }
};
-
// Construct with shard count and change-status cache size taken from
// configuration; backends are attached later in start().
RGWDataChangesLog::RGWDataChangesLog(CephContext* cct)
  : cct(cct),
    num_shards(cct->_conf->rgw_data_log_num_shards),
    prefix(get_prefix()),
    changes(cct->_conf->rgw_data_log_changes_size) {}
-
-bs::error_code DataLogBackends::handle_init(entries_t e) noexcept {
-  std::unique_lock l(m);
-
-  for (const auto& [gen_id, gen] : e) {
-    if (gen.pruned) {
-      lderr(datalog.cct)
-       << __PRETTY_FUNCTION__ << ":" << __LINE__
-       << ": ERROR: given empty generation: gen_id=" << gen_id << dendl;
-    }
-    if (count(gen_id) != 0) {
-      lderr(datalog.cct)
-       << __PRETTY_FUNCTION__ << ":" << __LINE__
-       << ": ERROR: generation already exists: gen_id=" << gen_id << dendl;
-    }
-    try {
-      switch (gen.type) {
-      case log_type::omap:
-       emplace(gen_id, new RGWDataChangesOmap(ioctx, datalog, gen_id, shards));
-       break;
-      case log_type::fifo:
-       emplace(gen_id, new RGWDataChangesFIFO(ioctx, datalog, gen_id, shards));
-       break;
-      default:
-       lderr(datalog.cct)
-         << __PRETTY_FUNCTION__ << ":" << __LINE__
-         << ": IMPOSSIBLE: invalid log type: gen_id=" << gen_id
-         << ", type" << gen.type << dendl;
-       return bs::error_code(EFAULT, bs::system_category());
-      }
-    } catch (const bs::system_error& err) {
-      lderr(datalog.cct)
-         << __PRETTY_FUNCTION__ << ":" << __LINE__
-         << ": error setting up backend: gen_id=" << gen_id
-         << ", err=" << err.what() << dendl;
-      return err.code();
-    }
-  }
-  return {};
-}
// New generations are set up exactly like the initial set.
bs::error_code DataLogBackends::handle_new_gens(entries_t e) noexcept {
  return handle_init(std::move(e));
}
// Drop backend objects for generations that have been emptied up to
// new_tail, refusing to remove the head (newest) generation.
// NOTE(review): the first check returns early when the oldest gen id is
// *below* new_tail, which reads inverted relative to the erase below —
// confirm against the logback_generations empty-to contract.
bs::error_code DataLogBackends::handle_empty_to(uint64_t new_tail) noexcept {
  std::unique_lock l(m);
  auto i = cbegin();
  if (i->first < new_tail) {
    return {};
  }
  if (new_tail >= (cend() - 1)->first) {
    lderr(datalog.cct)
      << __PRETTY_FUNCTION__ << ":" << __LINE__
      << ": ERROR: attempt to trim head: new_tail=" << new_tail << dendl;
    return bs::error_code(EFAULT, bs::system_category());
  }
  erase(i, upper_bound(new_tail));
  return {};
}
-
-
-int RGWDataChangesLog::start(const DoutPrefixProvider *dpp, const RGWZone* _zone,
-                            const RGWZoneParams& zoneparams,
-                            librados::Rados* lr)
-{
-  zone = _zone;
-  ceph_assert(zone);
-  auto defbacking = to_log_type(
-    cct->_conf.get_val<std::string>("rgw_default_data_log_backing"));
-  // Should be guaranteed by `set_enum_allowed`
-  ceph_assert(defbacking);
-  auto log_pool = zoneparams.log_pool;
-  auto r = rgw_init_ioctx(dpp, lr, log_pool, ioctx, true, false);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
-              << ": Failed to initialized ioctx, r=" << r
-              << ", pool=" << log_pool << dendl;
-    return -r;
-  }
-
-  auto besr = logback_generations::init<DataLogBackends>(
-    dpp, ioctx, metadata_log_oid(), [this](uint64_t gen_id, int shard) {
-      return get_oid(gen_id, shard);
-    },
-    num_shards, *defbacking, null_yield, *this);
-
-
-  if (!besr) {
-    lderr(cct) << __PRETTY_FUNCTION__
-              << ": Error initializing backends: "
-              << besr.error().message() << dendl;
-    return ceph::from_error_code(besr.error());
-  }
-
-  bes = std::move(*besr);
-  renew_thread = make_named_thread("rgw_dt_lg_renew",
-                                  &RGWDataChangesLog::renew_run, this);
-  return 0;
-}
-
-int RGWDataChangesLog::choose_oid(const rgw_bucket_shard& bs) {
-  const auto& name = bs.bucket.name;
-  auto shard_shift = (bs.shard_id > 0 ? bs.shard_id : 0);
-  auto r = (ceph_str_hash_linux(name.data(), name.size()) +
-           shard_shift) % num_shards;
-  return static_cast<int>(r);
-}
-
// Periodic renew pass: re-publish every bucket shard recorded during the
// current cycle so its data-log entry stays fresh, then push each shard's
// expiration forward by one log window.
int RGWDataChangesLog::renew_entries(const DoutPrefixProvider *dpp)
{
  if (!zone->log_data)
    return 0;

  /* we can't keep the bucket name as part of the cls_log_entry, and we need
   * it later, so we keep two lists under the map */
  bc::flat_map<int, std::pair<std::vector<BucketGen>,
			      RGWDataChangesBE::entries>> m;

  // Swap out the current cycle under the lock so new registrations go
  // into a fresh set while we publish this one.
  std::unique_lock l(lock);
  decltype(cur_cycle) entries;
  entries.swap(cur_cycle);
  l.unlock();

  auto ut = real_clock::now();
  auto be = bes->head();
  // Group the changes by target log shard, preparing the encoded entries
  // and remembering the (shard, gen) pairs for the expiration update.
  for (const auto& [bs, gen] : entries) {
    auto index = choose_oid(bs);

    rgw_data_change change;
    bufferlist bl;
    change.entity_type = ENTITY_TYPE_BUCKET;
    change.key = bs.get_key();
    change.timestamp = ut;
    change.gen = gen;
    encode(change, bl);

    m[index].first.push_back({bs, gen});
    be->prepare(ut, change.key, std::move(bl), m[index].second);
  }

  for (auto& [index, p] : m) {
    auto& [buckets, entries] = p;

    auto now = real_clock::now();

    auto ret = be->push(dpp, index, std::move(entries));
    if (ret < 0) {
      /* we don't really need to have a special handling for failed cases here,
       * as this is just an optimization. */
      ldpp_dout(dpp, -1) << "ERROR: svc.cls->timelog.add() returned " << ret << dendl;
      return ret;
    }

    auto expiration = now;
    expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window);
    for (auto& [bs, gen] : buckets) {
      update_renewed(bs, gen, expiration);
    }
  }

  return 0;
}
-
// Look up (or create) the cached ChangeStatus for a bucket shard and
// generation. Caller must hold `lock`.
auto RGWDataChangesLog::_get_change(const rgw_bucket_shard& bs,
				    uint64_t gen)
  -> ChangeStatusPtr
{
  ceph_assert(ceph_mutex_is_locked(lock));
  ChangeStatusPtr status;
  if (!changes.find({bs, gen}, status)) {
    // not cached yet: insert a fresh status entry
    status = std::make_shared<ChangeStatus>();
    changes.add({bs, gen}, status);
  }
  return status;
}
-
-void RGWDataChangesLog::register_renew(const rgw_bucket_shard& bs,
-                                      const rgw::bucket_log_layout_generation& gen)
-{
-  std::scoped_lock l{lock};
-  cur_cycle.insert({bs, gen.gen});
-}
-
-void RGWDataChangesLog::update_renewed(const rgw_bucket_shard& bs,
-                                      uint64_t gen,
-                                      real_time expiration)
-{
-  std::unique_lock l{lock};
-  auto status = _get_change(bs, gen);
-  l.unlock();
-
-  ldout(cct, 20) << "RGWDataChangesLog::update_renewd() bucket_name="
-                << bs.bucket.name << " shard_id=" << bs.shard_id
-                << " expiration=" << expiration << dendl;
-
-  std::unique_lock sl(status->lock);
-  status->cur_expiration = expiration;
-}
-
-int RGWDataChangesLog::get_log_shard_id(rgw_bucket& bucket, int shard_id) {
-  rgw_bucket_shard bs(bucket, shard_id);
-  return choose_oid(bs);
-}
-
-bool RGWDataChangesLog::filter_bucket(const DoutPrefixProvider *dpp, 
-                                      const rgw_bucket& bucket,
-                                     optional_yield y) const
-{
-  if (!bucket_filter) {
-    return true;
-  }
-
-  return bucket_filter(bucket, y, dpp);
-}
-
-std::string RGWDataChangesLog::get_oid(uint64_t gen_id, int i) const {
-  return (gen_id > 0 ?
-         fmt::format("{}@G{}.{}", prefix, gen_id, i) :
-         fmt::format("{}.{}", prefix, i));
-}
-
// Record a change for a bucket shard in the data log. Writes are
// coalesced: if the shard's entry was published within the current log
// window we only register it for the next renew cycle, and concurrent
// writers for the same shard wait on a shared RefCountedCond while one
// of them performs the actual push.
int RGWDataChangesLog::add_entry(const DoutPrefixProvider *dpp,
				 const RGWBucketInfo& bucket_info,
				 const rgw::bucket_log_layout_generation& gen,
				 int shard_id)
{
  auto& bucket = bucket_info.bucket;

  if (!filter_bucket(dpp, bucket, null_yield)) {
    return 0;
  }

  if (observer) {
    observer->on_bucket_changed(bucket.get_key());
  }

  rgw_bucket_shard bs(bucket, shard_id);

  int index = choose_oid(bs);

  mark_modified(index, bs, gen.gen);

  std::unique_lock l(lock);

  auto status = _get_change(bs, gen.gen);
  l.unlock();

  auto now = real_clock::now();

  std::unique_lock sl(status->lock);

  ldpp_dout(dpp, 20) << "RGWDataChangesLog::add_entry() bucket.name=" << bucket.name
		     << " shard_id=" << shard_id << " now=" << now
		     << " cur_expiration=" << status->cur_expiration << dendl;

  if (now < status->cur_expiration) {
    /* no need to send, recently completed */
    sl.unlock();
    register_renew(bs, gen);
    return 0;
  }

  RefCountedCond* cond;

  if (status->pending) {
    // another writer is already pushing this shard's entry; wait for its
    // result and just register for renewal if it succeeded
    cond = status->cond;

    ceph_assert(cond);

    status->cond->get();
    sl.unlock();

    int ret = cond->wait();
    cond->put();
    if (!ret) {
      register_renew(bs, gen);
    }
    return ret;
  }

  // we are the writer: publish the entry, re-pushing if the window
  // elapsed while the write was in flight
  status->cond = new RefCountedCond;
  status->pending = true;

  ceph::real_time expiration;

  int ret;

  do {
    status->cur_sent = now;

    expiration = now;
    expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window);

    sl.unlock();

    ceph::buffer::list bl;
    rgw_data_change change;
    change.entity_type = ENTITY_TYPE_BUCKET;
    change.key = bs.get_key();
    change.timestamp = now;
    change.gen = gen.gen;
    encode(change, bl);

    ldpp_dout(dpp, 20) << "RGWDataChangesLog::add_entry() sending update with now=" << now << " cur_expiration=" << expiration << dendl;

    auto be = bes->head();
    ret = be->push(dpp, index, now, change.key, std::move(bl));

    now = real_clock::now();

    sl.lock();

  } while (!ret && real_clock::now() > expiration);

  cond = status->cond;

  status->pending = false;
  /* time of when operation started, not completed */
  status->cur_expiration = status->cur_sent;
  status->cur_expiration += make_timespan(cct->_conf->rgw_data_log_window);
  status->cond = nullptr;
  sl.unlock();

  // wake all waiters with the final push result
  cond->done(ret);
  cond->put();

  return ret;
}
-
// List data-log entries for one shard across generations, starting from
// a composite marker of the form "<gen>#<cursor>". Walks generations in
// ascending order until max_entries are gathered or no generation
// remains, prefixing each returned log_id with its generation.
int DataLogBackends::list(const DoutPrefixProvider *dpp, int shard, int max_entries,
			  std::vector<rgw_data_change_log_entry>& entries,
			  std::string_view marker,
			  std::string* out_marker,
			  bool* truncated)
{
  const auto [start_id, start_cursor] = cursorgen(marker);
  auto gen_id = start_id;
  std::string out_cursor;
  while (max_entries > 0) {
    std::vector<rgw_data_change_log_entry> gentries;
    // take the lock only to pick the backend; drop it across the I/O
    std::unique_lock l(m);
    auto i = lower_bound(gen_id);
    if (i == end()) return 0;
    auto be = i->second;
    l.unlock();
    gen_id = be->gen_id;
    // only the starting generation resumes from the marker's cursor;
    // later generations are read from their beginning
    auto r = be->list(dpp, shard, max_entries, gentries,
		      gen_id == start_id ? start_cursor : std::string{},
		      &out_cursor, truncated);
    if (r < 0)
      return r;

    if (out_marker && !out_cursor.empty()) {
      *out_marker = gencursor(gen_id, out_cursor);
    }
    for (auto& g : gentries) {
      g.log_id = gencursor(gen_id, g.log_id);
    }
    // NOTE(review): size() is unsigned, so `s < 0` can only trigger on
    // int overflow; the intent appears to be clamping — confirm.
    if (int s = gentries.size(); s < 0 || s > max_entries)
      max_entries = 0;
    else
      max_entries -= gentries.size();

    std::move(gentries.begin(), gentries.end(),
	      std::back_inserter(entries));
    ++gen_id;
  }
  return 0;
}
-
-int RGWDataChangesLog::list_entries(const DoutPrefixProvider *dpp, int shard, int max_entries,
-                                   std::vector<rgw_data_change_log_entry>& entries,
-                                   std::string_view marker,
-                                   std::string* out_marker, bool* truncated)
-{
-  assert(shard < num_shards);
-  return bes->list(dpp, shard, max_entries, entries, marker, out_marker, truncated);
-}
-
-int RGWDataChangesLog::list_entries(const DoutPrefixProvider *dpp, int max_entries,
-                                   std::vector<rgw_data_change_log_entry>& entries,
-                                   LogMarker& marker, bool *ptruncated)
-{
-  bool truncated;
-  entries.clear();
-  for (; marker.shard < num_shards && int(entries.size()) < max_entries;
-       marker.shard++, marker.marker.clear()) {
-    int ret = list_entries(dpp, marker.shard, max_entries - entries.size(),
-                          entries, marker.marker, NULL, &truncated);
-    if (ret == -ENOENT) {
-      continue;
-    }
-    if (ret < 0) {
-      return ret;
-    }
-    if (!truncated) {
-      *ptruncated = false;
-      return 0;
-    }
-  }
-  *ptruncated = (marker.shard < num_shards);
-  return 0;
-}
-
-int RGWDataChangesLog::get_info(const DoutPrefixProvider *dpp, int shard_id, RGWDataChangesLogInfo *info)
-{
-  assert(shard_id < num_shards);
-  auto be = bes->head();
-  auto r = be->get_info(dpp, shard_id, info);
-  if (!info->marker.empty()) {
-    info->marker = gencursor(be->gen_id, info->marker);
-  }
-  return r;
-}
-
-int DataLogBackends::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker)
-{
-  auto [target_gen, cursor] = cursorgen(marker);
-  std::unique_lock l(m);
-  const auto head_gen = (end() - 1)->second->gen_id;
-  const auto tail_gen = begin()->first;
-  if (target_gen < tail_gen) return 0;
-  auto r = 0;
-  for (auto be = lower_bound(0)->second;
-       be->gen_id <= target_gen && be->gen_id <= head_gen && r >= 0;
-       be = upper_bound(be->gen_id)->second) {
-    l.unlock();
-    auto c = be->gen_id == target_gen ? cursor : be->max_marker();
-    r = be->trim(dpp, shard_id, c);
-    if (r == -ENOENT)
-      r = -ENODATA;
-    if (r == -ENODATA && be->gen_id < target_gen)
-      r = 0;
-    if (be->gen_id == target_gen)
-      break;
-    l.lock();
-  };
-  return r;
-}
-
-int RGWDataChangesLog::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker)
-{
-  assert(shard_id < num_shards);
-  return bes->trim_entries(dpp, shard_id, marker);
-}
-
-class GenTrim : public rgw::cls::fifo::Completion<GenTrim> {
-public:
-  DataLogBackends* const bes;
-  const int shard_id;
-  const uint64_t target_gen;
-  const std::string cursor;
-  const uint64_t head_gen;
-  const uint64_t tail_gen;
-  boost::intrusive_ptr<RGWDataChangesBE> be;
-
-  GenTrim(const DoutPrefixProvider *dpp, DataLogBackends* bes, int shard_id, uint64_t target_gen,
-         std::string cursor, uint64_t head_gen, uint64_t tail_gen,
-         boost::intrusive_ptr<RGWDataChangesBE> be,
-         lr::AioCompletion* super)
-    : Completion(dpp, super), bes(bes), shard_id(shard_id), target_gen(target_gen),
-      cursor(std::move(cursor)), head_gen(head_gen), tail_gen(tail_gen),
-      be(std::move(be)) {}
-
-  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
-    auto gen_id = be->gen_id;
-    be.reset();
-    if (r == -ENOENT)
-      r = -ENODATA;
-    if (r == -ENODATA && gen_id < target_gen)
-      r = 0;
-    if (r < 0) {
-      complete(std::move(p), r);
-      return;
-    }
-
-    {
-      std::unique_lock l(bes->m);
-      auto i = bes->upper_bound(gen_id);
-      if (i == bes->end() || i->first > target_gen || i->first > head_gen) {
-       l.unlock();
-       complete(std::move(p), -ENODATA);
-       return;
-      }
-      be = i->second;
-    }
-    auto c = be->gen_id == target_gen ? cursor : be->max_marker();
-    be->trim(dpp, shard_id, c, call(std::move(p)));
-  }
-};
-
-void DataLogBackends::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
-                                  librados::AioCompletion* c)
-{
-  auto [target_gen, cursor] = cursorgen(marker);
-  std::unique_lock l(m);
-  const auto head_gen = (end() - 1)->second->gen_id;
-  const auto tail_gen = begin()->first;
-  if (target_gen < tail_gen) {
-    l.unlock();
-    rgw_complete_aio_completion(c, -ENODATA);
-    return;
-  }
-  auto be = begin()->second;
-  l.unlock();
-  auto gt = std::make_unique<GenTrim>(dpp, this, shard_id, target_gen,
-                                     std::string(cursor), head_gen, tail_gen,
-                                     be, c);
-
-  auto cc = be->gen_id == target_gen ? cursor : be->max_marker();
-  be->trim(dpp, shard_id, cc,  GenTrim::call(std::move(gt)));
-}
-
-int DataLogBackends::trim_generations(const DoutPrefixProvider *dpp, std::optional<uint64_t>& through) {
-  if (size() != 1) {
-    std::vector<mapped_type> candidates;
-    {
-      std::scoped_lock l(m);
-      auto e = cend() - 1;
-      for (auto i = cbegin(); i < e; ++i) {
-       candidates.push_back(i->second);
-      }
-    }
-
-    std::optional<uint64_t> highest;
-    for (auto& be : candidates) {
-      auto r = be->is_empty(dpp);
-      if (r < 0) {
-       return r;
-      } else if (r == 1) {
-       highest = be->gen_id;
-      } else {
-       break;
-      }
-    }
-
-    through = highest;
-    if (!highest) {
-      return 0;
-    }
-    auto ec = empty_to(dpp, *highest, null_yield);
-    if (ec) {
-      return ceph::from_error_code(ec);
-    }
-  }
-
-  return ceph::from_error_code(remove_empty(dpp, null_yield));
-}
-
-
-int RGWDataChangesLog::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
-                                   librados::AioCompletion* c)
-{
-  assert(shard_id < num_shards);
-  bes->trim_entries(dpp, shard_id, marker, c);
-  return 0;
-}
-
-bool RGWDataChangesLog::going_down() const
-{
-  return down_flag;
-}
-
-RGWDataChangesLog::~RGWDataChangesLog() {
-  down_flag = true;
-  if (renew_thread.joinable()) {
-    renew_stop();
-    renew_thread.join();
-  }
-}
-
-void RGWDataChangesLog::renew_run() noexcept {
-  static constexpr auto runs_per_prune = 150;
-  auto run = 0;
-  for (;;) {
-    const DoutPrefix dp(cct, dout_subsys, "rgw data changes log: ");
-    ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: start" << dendl;
-    int r = renew_entries(&dp);
-    if (r < 0) {
-      ldpp_dout(&dp, 0) << "ERROR: RGWDataChangesLog::renew_entries returned error r=" << r << dendl;
-    }
-
-    if (going_down())
-      break;
-
-    if (run == runs_per_prune) {
-      std::optional<uint64_t> through;
-      ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: pruning old generations" << dendl;
-      trim_generations(&dp, through);
-      if (r < 0) {
-       derr << "RGWDataChangesLog::ChangesRenewThread: failed pruning r="
-            << r << dendl;
-      } else if (through) {
-       ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: pruned generations "
-               << "through " << *through << "." << dendl;
-      } else {
-       ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: nothing to prune."
-               << dendl;
-      }
-      run = 0;
-    } else {
-      ++run;
-    }
-
-    int interval = cct->_conf->rgw_data_log_window * 3 / 4;
-    std::unique_lock locker{renew_lock};
-    renew_cond.wait_for(locker, std::chrono::seconds(interval));
-  }
-}
-
-void RGWDataChangesLog::renew_stop()
-{
-  std::lock_guard l{renew_lock};
-  renew_cond.notify_all();
-}
-
-void RGWDataChangesLog::mark_modified(int shard_id, const rgw_bucket_shard& bs, uint64_t gen)
-{
-  if (!cct->_conf->rgw_data_notify_interval_msec) {
-    return;
-  }
-
-  auto key = bs.get_key();
-  {
-    std::shared_lock rl{modified_lock}; // read lock to check for existence
-    auto shard = modified_shards.find(shard_id);
-    if (shard != modified_shards.end() && shard->second.count({key, gen})) {
-      return;
-    }
-  }
-
-  std::unique_lock wl{modified_lock}; // write lock for insertion
-  modified_shards[shard_id].insert(rgw_data_notify_entry{key, gen});
-}
-
-std::string RGWDataChangesLog::max_marker() const {
-  return gencursor(std::numeric_limits<uint64_t>::max(),
-                  "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
-}
-
-int RGWDataChangesLog::change_format(const DoutPrefixProvider *dpp, log_type type, optional_yield y) {
-  return ceph::from_error_code(bes->new_backing(dpp, type, y));
-}
-
-int RGWDataChangesLog::trim_generations(const DoutPrefixProvider *dpp, std::optional<uint64_t>& through) {
-  return bes->trim_generations(dpp, through);
-}
-
-void RGWDataChangesLogInfo::dump(Formatter *f) const
-{
-  encode_json("marker", marker, f);
-  utime_t ut(last_update);
-  encode_json("last_update", ut, f);
-}
-
-void RGWDataChangesLogInfo::decode_json(JSONObj *obj)
-{
-  JSONDecoder::decode_json("marker", marker, obj);
-  utime_t ut;
-  JSONDecoder::decode_json("last_update", ut, obj);
-  last_update = ut.to_real_time();
-}
-
-
diff --git a/src/rgw/store/rados/rgw_datalog.h b/src/rgw/store/rados/rgw_datalog.h
deleted file mode 100644 (file)
index 0bc4837..0000000
+++ /dev/null
@@ -1,386 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_DATALOG_H
-#define CEPH_RGW_DATALOG_H
-
-#include <cstdint>
-#include <list>
-#include <memory>
-#include <string>
-#include <string_view>
-#include <variant>
-#include <vector>
-
-#include <boost/container/flat_map.hpp>
-#include <boost/container/flat_set.hpp>
-#include <boost/smart_ptr/intrusive_ptr.hpp>
-#include <boost/smart_ptr/intrusive_ref_counter.hpp>
-
-#undef FMT_HEADER_ONLY
-#define FMT_HEADER_ONLY 1
-#include <fmt/format.h>
-
-#include "include/buffer.h"
-#include "include/encoding.h"
-#include "include/function2.hpp"
-
-#include "include/rados/librados.hpp"
-
-#include "common/ceph_context.h"
-#include "common/ceph_json.h"
-#include "common/ceph_time.h"
-#include "common/Formatter.h"
-#include "common/lru_map.h"
-#include "common/RefCountedObj.h"
-
-#include "cls/log/cls_log_types.h"
-
-#include "rgw_basic_types.h"
-#include "rgw_log_backing.h"
-#include "rgw_sync_policy.h"
-#include "rgw_zone.h"
-#include "rgw_trim_bilog.h"
-
-namespace bc = boost::container;
-
-enum DataLogEntityType {
-  ENTITY_TYPE_UNKNOWN = 0,
-  ENTITY_TYPE_BUCKET = 1,
-};
-
-struct rgw_data_change {
-  DataLogEntityType entity_type;
-  std::string key;
-  ceph::real_time timestamp;
-  uint64_t gen = 0;
-
-  void encode(ceph::buffer::list& bl) const {
-    // require decoders to recognize v2 when gen>0
-    const uint8_t compat = (gen == 0) ? 1 : 2;
-    ENCODE_START(2, compat, bl);
-    auto t = std::uint8_t(entity_type);
-    encode(t, bl);
-    encode(key, bl);
-    encode(timestamp, bl);
-    encode(gen, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-     DECODE_START(2, bl);
-     std::uint8_t t;
-     decode(t, bl);
-     entity_type = DataLogEntityType(t);
-     decode(key, bl);
-     decode(timestamp, bl);
-     if (struct_v < 2) {
-       gen = 0;
-     } else {
-       decode(gen, bl);
-     }
-     DECODE_FINISH(bl);
-  }
-
-  void dump(ceph::Formatter* f) const;
-  void decode_json(JSONObj* obj);
-};
-WRITE_CLASS_ENCODER(rgw_data_change)
-
-struct rgw_data_change_log_entry {
-  std::string log_id;
-  ceph::real_time log_timestamp;
-  rgw_data_change entry;
-
-  void encode(ceph::buffer::list& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(log_id, bl);
-    encode(log_timestamp, bl);
-    encode(entry, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(ceph::buffer::list::const_iterator& bl) {
-     DECODE_START(1, bl);
-     decode(log_id, bl);
-     decode(log_timestamp, bl);
-     decode(entry, bl);
-     DECODE_FINISH(bl);
-  }
-
-  void dump(ceph::Formatter* f) const;
-  void decode_json(JSONObj* obj);
-};
-WRITE_CLASS_ENCODER(rgw_data_change_log_entry)
-
-struct RGWDataChangesLogInfo {
-  std::string marker;
-  ceph::real_time last_update;
-
-  void dump(ceph::Formatter* f) const;
-  void decode_json(JSONObj* obj);
-};
-
-struct RGWDataChangesLogMarker {
-  int shard = 0;
-  std::string marker;
-
-  RGWDataChangesLogMarker() = default;
-};
-
-class RGWDataChangesLog;
-
-struct rgw_data_notify_entry {
-  std::string key;
-  uint64_t gen = 0;
-
-  void dump(ceph::Formatter* f) const;
-  void decode_json(JSONObj* obj);
-
-  rgw_data_notify_entry& operator=(const rgw_data_notify_entry&) = default;
-
-  bool operator <(const rgw_data_notify_entry& d) const {
-    if (key < d.key) {
-      return true;
-    }
-    if (d.key < key) {
-      return false;
-    }
-    return gen < d.gen;
-  }
-  friend std::ostream& operator <<(std::ostream& m,
-                                  const rgw_data_notify_entry& e) {
-    return m << "[key: " << e.key << ", gen: " << e.gen << "]";
-  }
-};
-
-class RGWDataChangesBE;
-
-class DataLogBackends final
-  : public logback_generations,
-    private bc::flat_map<uint64_t, boost::intrusive_ptr<RGWDataChangesBE>> {
-  friend class logback_generations;
-  friend class GenTrim;
-
-  std::mutex m;
-  RGWDataChangesLog& datalog;
-
-  DataLogBackends(librados::IoCtx& ioctx,
-                 std::string oid,
-                 fu2::unique_function<std::string(
-                   uint64_t, int) const>&& get_oid,
-                 int shards, RGWDataChangesLog& datalog) noexcept
-    : logback_generations(ioctx, oid, std::move(get_oid),
-                         shards), datalog(datalog) {}
-public:
-
-  boost::intrusive_ptr<RGWDataChangesBE> head() {
-    std::unique_lock l(m);
-    auto i = end();
-    --i;
-    return i->second;
-  }
-  int list(const DoutPrefixProvider *dpp, int shard, int max_entries,
-          std::vector<rgw_data_change_log_entry>& entries,
-          std::string_view marker,
-          std::string* out_marker, bool* truncated);
-  int trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker);
-  void trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
-                   librados::AioCompletion* c);
-  void set_zero(RGWDataChangesBE* be) {
-    emplace(0, be);
-  }
-
-  bs::error_code handle_init(entries_t e) noexcept override;
-  bs::error_code handle_new_gens(entries_t e) noexcept override;
-  bs::error_code handle_empty_to(uint64_t new_tail) noexcept override;
-
-  int trim_generations(const DoutPrefixProvider *dpp, std::optional<uint64_t>& through);
-};
-
-struct BucketGen {
-  rgw_bucket_shard shard;
-  uint64_t gen;
-
-  BucketGen(const rgw_bucket_shard& shard, uint64_t gen)
-    : shard(shard), gen(gen) {}
-
-  BucketGen(rgw_bucket_shard&& shard, uint64_t gen)
-    : shard(std::move(shard)), gen(gen) {}
-
-  BucketGen(const BucketGen&) = default;
-  BucketGen(BucketGen&&) = default;
-  BucketGen& operator =(const BucketGen&) = default;
-  BucketGen& operator =(BucketGen&&) = default;
-
-  ~BucketGen() = default;
-};
-
-inline bool operator ==(const BucketGen& l, const BucketGen& r) {
-  return (l.shard == r.shard) && (l.gen == r.gen);
-}
-
-inline bool operator <(const BucketGen& l, const BucketGen& r) {
-  if (l.shard < r.shard) {
-    return true;
-  } else if (l.shard == r.shard) {
-    return l.gen < r.gen;
-  } else {
-    return false;
-  }
-}
-
-class RGWDataChangesLog {
-  friend DataLogBackends;
-  CephContext *cct;
-  librados::IoCtx ioctx;
-  rgw::BucketChangeObserver *observer = nullptr;
-  const RGWZone* zone;
-  std::unique_ptr<DataLogBackends> bes;
-
-  const int num_shards;
-  std::string get_prefix() {
-    auto prefix = cct->_conf->rgw_data_log_obj_prefix;
-    return prefix.empty() ? prefix : "data_log";
-  }
-  std::string metadata_log_oid() {
-    return get_prefix() + "generations_metadata";
-  }
-  std::string prefix;
-
-  ceph::mutex lock = ceph::make_mutex("RGWDataChangesLog::lock");
-  ceph::shared_mutex modified_lock =
-    ceph::make_shared_mutex("RGWDataChangesLog::modified_lock");
-  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>> modified_shards;
-
-  std::atomic<bool> down_flag = { false };
-
-  struct ChangeStatus {
-    std::shared_ptr<const rgw_sync_policy_info> sync_policy;
-    ceph::real_time cur_expiration;
-    ceph::real_time cur_sent;
-    bool pending = false;
-    RefCountedCond* cond = nullptr;
-    ceph::mutex lock = ceph::make_mutex("RGWDataChangesLog::ChangeStatus");
-  };
-
-  using ChangeStatusPtr = std::shared_ptr<ChangeStatus>;
-
-  lru_map<BucketGen, ChangeStatusPtr> changes;
-
-  bc::flat_set<BucketGen> cur_cycle;
-
-  ChangeStatusPtr _get_change(const rgw_bucket_shard& bs, uint64_t gen);
-  void register_renew(const rgw_bucket_shard& bs,
-                     const rgw::bucket_log_layout_generation& gen);
-  void update_renewed(const rgw_bucket_shard& bs,
-                     uint64_t gen,
-                     ceph::real_time expiration);
-
-  ceph::mutex renew_lock = ceph::make_mutex("ChangesRenewThread::lock");
-  ceph::condition_variable renew_cond;
-  void renew_run() noexcept;
-  void renew_stop();
-  std::thread renew_thread;
-
-  std::function<bool(const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp)> bucket_filter;
-  bool going_down() const;
-  bool filter_bucket(const DoutPrefixProvider *dpp, const rgw_bucket& bucket, optional_yield y) const;
-  int renew_entries(const DoutPrefixProvider *dpp);
-
-public:
-
-  RGWDataChangesLog(CephContext* cct);
-  ~RGWDataChangesLog();
-
-  int start(const DoutPrefixProvider *dpp, const RGWZone* _zone, const RGWZoneParams& zoneparams,
-           librados::Rados* lr);
-  int choose_oid(const rgw_bucket_shard& bs);
-  int add_entry(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
-               const rgw::bucket_log_layout_generation& gen, int shard_id);
-  int get_log_shard_id(rgw_bucket& bucket, int shard_id);
-  int list_entries(const DoutPrefixProvider *dpp, int shard, int max_entries,
-                  std::vector<rgw_data_change_log_entry>& entries,
-                  std::string_view marker,
-                  std::string* out_marker, bool* truncated);
-  int trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker);
-  int trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
-                  librados::AioCompletion* c); // :(
-  int get_info(const DoutPrefixProvider *dpp, int shard_id, RGWDataChangesLogInfo *info);
-
-  using LogMarker = RGWDataChangesLogMarker;
-
-  int list_entries(const DoutPrefixProvider *dpp, int max_entries,
-                  std::vector<rgw_data_change_log_entry>& entries,
-                  LogMarker& marker, bool* ptruncated);
-
-  void mark_modified(int shard_id, const rgw_bucket_shard& bs, uint64_t gen);
-  auto read_clear_modified() {
-    std::unique_lock wl{modified_lock};
-    decltype(modified_shards) modified;
-    modified.swap(modified_shards);
-    modified_shards.clear();
-    return modified;
-  }
-
-  void set_observer(rgw::BucketChangeObserver *observer) {
-    this->observer = observer;
-  }
-
-  void set_bucket_filter(decltype(bucket_filter)&& f) {
-    bucket_filter = std::move(f);
-  }
-  // a marker that compares greater than any other
-  std::string max_marker() const;
-  std::string get_oid(uint64_t gen_id, int shard_id) const;
-
-
-  int change_format(const DoutPrefixProvider *dpp, log_type type, optional_yield y);
-  int trim_generations(const DoutPrefixProvider *dpp, std::optional<uint64_t>& through);
-};
-
-class RGWDataChangesBE : public boost::intrusive_ref_counter<RGWDataChangesBE> {
-protected:
-  librados::IoCtx& ioctx;
-  CephContext* const cct;
-  RGWDataChangesLog& datalog;
-
-  std::string get_oid(int shard_id) {
-    return datalog.get_oid(gen_id, shard_id);
-  }
-public:
-  using entries = std::variant<std::list<cls_log_entry>,
-                              std::vector<ceph::buffer::list>>;
-
-  const uint64_t gen_id;
-
-  RGWDataChangesBE(librados::IoCtx& ioctx,
-                  RGWDataChangesLog& datalog,
-                  uint64_t gen_id)
-    : ioctx(ioctx), cct(static_cast<CephContext*>(ioctx.cct())),
-      datalog(datalog), gen_id(gen_id) {}
-  virtual ~RGWDataChangesBE() = default;
-
-  virtual void prepare(ceph::real_time now,
-                      const std::string& key,
-                      ceph::buffer::list&& entry,
-                      entries& out) = 0;
-  virtual int push(const DoutPrefixProvider *dpp, int index, entries&& items) = 0;
-  virtual int push(const DoutPrefixProvider *dpp, int index, ceph::real_time now,
-                  const std::string& key,
-                  ceph::buffer::list&& bl) = 0;
-  virtual int list(const DoutPrefixProvider *dpp, int shard, int max_entries,
-                  std::vector<rgw_data_change_log_entry>& entries,
-                  std::optional<std::string_view> marker,
-                  std::string* out_marker, bool* truncated) = 0;
-  virtual int get_info(const DoutPrefixProvider *dpp, int index, RGWDataChangesLogInfo *info) = 0;
-  virtual int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker) = 0;
-  virtual int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
-                  librados::AioCompletion* c) = 0;
-  virtual std::string_view max_marker() const = 0;
-  // 1 on empty, 0 on non-empty, negative on error.
-  virtual int is_empty(const DoutPrefixProvider *dpp) = 0;
-};
-
-
-#endif
diff --git a/src/rgw/store/rados/rgw_datalog_notify.cc b/src/rgw/store/rados/rgw_datalog_notify.cc
deleted file mode 100644 (file)
index 12cdc53..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_datalog_notify.h"
-#include "rgw_datalog.h"
-
-// custom encoding for v1 notify API
-struct EntryEncoderV1 {
-  const rgw_data_notify_entry& entry;
-};
-struct SetEncoderV1 {
-  const bc::flat_set<rgw_data_notify_entry>& entries;
-};
-
-// encode rgw_data_notify_entry as string
-void encode_json(const char *name, const EntryEncoderV1& e, Formatter *f)
-{
-  f->dump_string(name, e.entry.key); // encode the key only
-}
-// encode set<rgw_data_notify_entry> as set<string>
-void encode_json(const char *name, const SetEncoderV1& e, Formatter *f)
-{
-  f->open_array_section(name);
-  for (auto& entry : e.entries) {
-    encode_json("obj", EntryEncoderV1{entry}, f);
-  }
-  f->close_section();
-}
-// encode map<int, set<rgw_data_notify_entry>> as map<int, set<string>>
-void encode_json(const char *name, const rgw_data_notify_v1_encoder& e, Formatter *f)
-{
-  f->open_array_section(name);
-  for (auto& [key, val] : e.shards) {
-    f->open_object_section("entry");
-    encode_json("key", key, f);
-    encode_json("val", SetEncoderV1{val}, f);
-    f->close_section();
-  }
-  f->close_section();
-}
-
-struct EntryDecoderV1 {
-  rgw_data_notify_entry& entry;
-};
-struct SetDecoderV1 {
-  bc::flat_set<rgw_data_notify_entry>& entries;
-};
-
-// decode string into rgw_data_notify_entry
-void decode_json_obj(EntryDecoderV1& d, JSONObj *obj)
-{
-  decode_json_obj(d.entry.key, obj);
-  d.entry.gen = 0;
-}
-// decode set<string> into set<rgw_data_notify_entry>
-void decode_json_obj(SetDecoderV1& d, JSONObj *obj)
-{
-  for (JSONObjIter o = obj->find_first(); !o.end(); ++o) {
-    rgw_data_notify_entry val;
-    auto decoder = EntryDecoderV1{val};
-    decode_json_obj(decoder, *o);
-    d.entries.insert(std::move(val));
-  }
-}
-// decode map<int, set<string>> into map<int, set<rgw_data_notify_entry>>
-void decode_json_obj(rgw_data_notify_v1_decoder& d, JSONObj *obj)
-{
-  for (JSONObjIter o = obj->find_first(); !o.end(); ++o) {
-    int shard_id = 0;
-    JSONDecoder::decode_json("key", shard_id, *o);
-    bc::flat_set<rgw_data_notify_entry> val;
-    SetDecoderV1 decoder{val};
-    JSONDecoder::decode_json("val", decoder, *o);
-    d.shards[shard_id] = std::move(val);
-  }
-}
diff --git a/src/rgw/store/rados/rgw_datalog_notify.h b/src/rgw/store/rados/rgw_datalog_notify.h
deleted file mode 100644 (file)
index 4cd1b3c..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-#include <boost/container/flat_map.hpp>
-#include <boost/container/flat_set.hpp>
-
-#include "rgw_datalog.h"
-
-namespace bc = boost::container;
-
-namespace ceph { class Formatter; }
-class JSONObj;
-
-class RGWCoroutine;
-class RGWHTTPManager;
-class RGWRESTConn;
-
-struct rgw_data_notify_entry;
-
-// json encoder and decoder for notify v1 API
-struct rgw_data_notify_v1_encoder {
-  const bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>& shards;
-};
-void encode_json(const char *name, const rgw_data_notify_v1_encoder& e,
-                 ceph::Formatter *f);
-struct rgw_data_notify_v1_decoder {
-  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>& shards;
-};
-void decode_json_obj(rgw_data_notify_v1_decoder& d, JSONObj *obj);
diff --git a/src/rgw/store/rados/rgw_etag_verifier.cc b/src/rgw/store/rados/rgw_etag_verifier.cc
deleted file mode 100644 (file)
index 52f7c79..0000000
+++ /dev/null
@@ -1,191 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_etag_verifier.h"
-#include "rgw_obj_manifest.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-namespace rgw::putobj {
-
-int create_etag_verifier(const DoutPrefixProvider *dpp, 
-                         CephContext* cct, rgw::sal::DataProcessor* filter,
-                         const bufferlist& manifest_bl,
-                         const std::optional<RGWCompressionInfo>& compression,
-                         etag_verifier_ptr& verifier)
-{
-  RGWObjManifest manifest;
-
-  try {
-    auto miter = manifest_bl.cbegin();
-    decode(manifest, miter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
-    return -EIO;
-  }
-
-  RGWObjManifestRule rule;
-  bool found = manifest.get_rule(0, &rule);
-  if (!found) {
-    ldpp_dout(dpp, -1) << "ERROR: manifest->get_rule() could not find rule" << dendl;
-    return -EIO;
-  }
-
-  if (rule.start_part_num == 0) {
-    /* Atomic object */
-    verifier.emplace<ETagVerifier_Atomic>(cct, filter);
-    return 0;
-  }
-
-  uint64_t cur_part_ofs = UINT64_MAX;
-  std::vector<uint64_t> part_ofs;
-
-  /*
-   * We must store the offset of each part to calculate the ETAGs for each
-   * MPU part. These part ETags then become the input for the MPU object
-   * Etag.
-   */
-  for (auto mi = manifest.obj_begin(dpp); mi != manifest.obj_end(dpp); ++mi) {
-    if (cur_part_ofs == mi.get_part_ofs())
-      continue;
-    cur_part_ofs = mi.get_part_ofs();
-    ldpp_dout(dpp, 20) << "MPU Part offset:" << cur_part_ofs << dendl;
-    part_ofs.push_back(cur_part_ofs);
-  }
-
-  if (compression) {
-    // if the source object was compressed, the manifest is storing
-    // compressed part offsets. transform the compressed offsets back to
-    // their original offsets by finding the first block of each part
-    const auto& blocks = compression->blocks;
-    auto block = blocks.begin();
-    for (auto& ofs : part_ofs) {
-      // find the compression_block with new_ofs == ofs
-      constexpr auto less = [] (const compression_block& block, uint64_t ofs) {
-        return block.new_ofs < ofs;
-      };
-      block = std::lower_bound(block, blocks.end(), ofs, less);
-      if (block == blocks.end() || block->new_ofs != ofs) {
-        ldpp_dout(dpp, 4) << "no match for compressed offset " << ofs
-            << ", disabling etag verification" << dendl;
-        return -EIO;
-      }
-      ofs = block->old_ofs;
-      ldpp_dout(dpp, 20) << "MPU Part uncompressed offset:" << ofs << dendl;
-    }
-  }
-
-  verifier.emplace<ETagVerifier_MPU>(cct, std::move(part_ofs), filter);
-  return 0;
-}
-
-int ETagVerifier_Atomic::process(bufferlist&& in, uint64_t logical_offset)
-{
-  bufferlist out;
-  if (in.length() > 0)
-    hash.Update((const unsigned char *)in.c_str(), in.length());
-
-  return Pipe::process(std::move(in), logical_offset);
-}
-
-void ETagVerifier_Atomic::calculate_etag()
-{
-  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
-  char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
-
-  /* Return early if ETag has already been calculated */
-  if (!calculated_etag.empty())
-    return;
-
-  hash.Final(m);
-  buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
-  calculated_etag = calc_md5;
-  ldout(cct, 20) << "Single part object: " << " etag:" << calculated_etag
-          << dendl;
-}
-
-void ETagVerifier_MPU::process_end_of_MPU_part()
-{
-  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
-  char calc_md5_part[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
-  std::string calculated_etag_part;
-
-  hash.Final(m);
-  mpu_etag_hash.Update((const unsigned char *)m, sizeof(m));
-  hash.Restart();
-
-  if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
-    buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5_part);
-    calculated_etag_part = calc_md5_part;
-    ldout(cct, 20) << "Part etag: " << calculated_etag_part << dendl;
-  }
-
-  cur_part_index++;
-  next_part_index++;
-}
-
-int ETagVerifier_MPU::process(bufferlist&& in, uint64_t logical_offset)
-{
-  uint64_t bl_end = in.length() + logical_offset;
-
-  /* Handle the last MPU part */
-  if (size_t(next_part_index) == part_ofs.size()) {
-    hash.Update((const unsigned char *)in.c_str(), in.length());
-    goto done;
-  }
-
-  /* Incoming bufferlist spans two MPU parts. Calculate separate ETags */
-  if (bl_end > part_ofs[next_part_index]) {
-
-    uint64_t part_one_len = part_ofs[next_part_index] - logical_offset;
-    hash.Update((const unsigned char *)in.c_str(), part_one_len);
-    process_end_of_MPU_part();
-
-    hash.Update((const unsigned char *)in.c_str() + part_one_len,
-      bl_end - part_ofs[cur_part_index]);
-    /*
-     * If we've moved to the last part of the MPU, avoid usage of
-     * parts_ofs[next_part_index] as it will lead to our-of-range access.
-     */
-    if (size_t(next_part_index) == part_ofs.size())
-      goto done;
-  } else {
-    hash.Update((const unsigned char *)in.c_str(), in.length());
-  }
-
-  /* Update the MPU Etag if the current part has ended */
-  if (logical_offset + in.length() + 1 == part_ofs[next_part_index])
-    process_end_of_MPU_part();
-
-done:
-  return Pipe::process(std::move(in), logical_offset);
-}
-
-void ETagVerifier_MPU::calculate_etag()
-{
-  const uint32_t parts = part_ofs.size();
-  constexpr auto digits10 = std::numeric_limits<uint32_t>::digits10;
-  constexpr auto extra = 2 + digits10; // add "-%u\0" at the end
-
-  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE], mpu_m[CEPH_CRYPTO_MD5_DIGESTSIZE];
-  char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + extra];
-
-  /* Return early if ETag has already been calculated */
-  if (!calculated_etag.empty())
-    return;
-
-  hash.Final(m);
-  mpu_etag_hash.Update((const unsigned char *)m, sizeof(m));
-
-  /* Refer RGWCompleteMultipart::execute() for ETag calculation for MPU object */
-  mpu_etag_hash.Final(mpu_m);
-  buf_to_hex(mpu_m, CEPH_CRYPTO_MD5_DIGESTSIZE, final_etag_str);
-  snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
-           sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
-           "-%u", parts);
-
-  calculated_etag = final_etag_str;
-  ldout(cct, 20) << "MPU calculated ETag:" << calculated_etag << dendl;
-}
-
-} // namespace rgw::putobj
diff --git a/src/rgw/store/rados/rgw_etag_verifier.h b/src/rgw/store/rados/rgw_etag_verifier.h
deleted file mode 100644 (file)
index 56a679e..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * RGW Etag Verifier is an RGW filter which enables the objects copied using
- * multisite sync to be verified using their ETag from source i.e. the MD5
- * checksum of the object is computed at the destination and is verified to be
- * identical to the ETag stored in the object HEAD at source cluster.
- * 
- * For MPU objects, a different filter named RGWMultipartEtagFilter is applied
- * which re-computes ETag using RGWObjManifest. This computes the ETag using the
- * same algorithm used at the source cluster i.e. MD5 sum of the individual ETag
- * on the MPU parts.
- */
-#ifndef CEPH_RGW_ETAG_VERIFIER_H
-#define CEPH_RGW_ETAG_VERIFIER_H
-
-#include "rgw_putobj.h"
-#include "rgw_op.h"
-#include "common/static_ptr.h"
-
-namespace rgw::putobj {
-
-class ETagVerifier : public rgw::putobj::Pipe
-{
-protected:
-  CephContext* cct;
-  MD5 hash;
-  std::string calculated_etag;
-
-public:
-  ETagVerifier(CephContext* cct_, rgw::sal::DataProcessor *next)
-    : Pipe(next), cct(cct_) {
-      // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
-      hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
-    }
-
-  virtual void calculate_etag() = 0;
-  std::string get_calculated_etag() { return calculated_etag;}
-
-}; /* ETagVerifier */
-
-class ETagVerifier_Atomic : public ETagVerifier
-{
-public:
-  ETagVerifier_Atomic(CephContext* cct_, rgw::sal::DataProcessor *next)
-    : ETagVerifier(cct_, next) {}
-
-  int process(bufferlist&& data, uint64_t logical_offset) override;
-  void calculate_etag() override;
-
-}; /* ETagVerifier_Atomic */
-
-class ETagVerifier_MPU : public ETagVerifier
-{
-  std::vector<uint64_t> part_ofs;
-  uint64_t cur_part_index{0}, next_part_index{1};
-  MD5 mpu_etag_hash;
-  void process_end_of_MPU_part();
-
-public:
-  ETagVerifier_MPU(CephContext* cct,
-                             std::vector<uint64_t> part_ofs,
-                             rgw::sal::DataProcessor *next)
-    : ETagVerifier(cct, next),
-      part_ofs(std::move(part_ofs))
-  {
-    // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
-    hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
-  }
-
-  int process(bufferlist&& data, uint64_t logical_offset) override;
-  void calculate_etag() override;
-
-}; /* ETagVerifier_MPU */
-
-constexpr auto max_etag_verifier_size = std::max(
-    sizeof(ETagVerifier_Atomic),
-    sizeof(ETagVerifier_MPU)
-  );
-using etag_verifier_ptr = ceph::static_ptr<ETagVerifier, max_etag_verifier_size>;
-
-int create_etag_verifier(const DoutPrefixProvider *dpp, 
-                         CephContext* cct, rgw::sal::DataProcessor* next,
-                         const bufferlist& manifest_bl,
-                         const std::optional<RGWCompressionInfo>& compression,
-                         etag_verifier_ptr& verifier);
-
-} // namespace rgw::putobj
-
-#endif /* CEPH_RGW_ETAG_VERIFIER_H */
diff --git a/src/rgw/store/rados/rgw_gc.cc b/src/rgw/store/rados/rgw_gc.cc
deleted file mode 100644 (file)
index bd16bde..0000000
+++ /dev/null
@@ -1,811 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_gc.h"
-
-#include "rgw_tools.h"
-#include "include/scope_guard.h"
-#include "include/rados/librados.hpp"
-#include "cls/rgw/cls_rgw_client.h"
-#include "cls/rgw_gc/cls_rgw_gc_client.h"
-#include "cls/refcount/cls_refcount_client.h"
-#include "cls/version/cls_version_client.h"
-#include "rgw_perf_counters.h"
-#include "cls/lock/cls_lock_client.h"
-#include "include/random.h"
-#include "rgw_gc_log.h"
-
-#include <list> // XXX
-#include <sstream>
-#include "xxhash.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-using namespace librados;
-
-static string gc_oid_prefix = "gc";
-static string gc_index_lock_name = "gc_process";
-
-void RGWGC::initialize(CephContext *_cct, RGWRados *_store) {
-  cct = _cct;
-  store = _store;
-
-  max_objs = min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max());
-
-  obj_names = new string[max_objs];
-
-  for (int i = 0; i < max_objs; i++) {
-    obj_names[i] = gc_oid_prefix;
-    char buf[32];
-    snprintf(buf, 32, ".%d", i);
-    obj_names[i].append(buf);
-
-    auto it = transitioned_objects_cache.begin() + i;
-    transitioned_objects_cache.insert(it, false);
-
-    //version = 0 -> not ready for transition
-    //version = 1 -> marked ready for transition
-    librados::ObjectWriteOperation op;
-    op.create(false);
-    const uint64_t queue_size = cct->_conf->rgw_gc_max_queue_size, num_deferred_entries = cct->_conf->rgw_gc_max_deferred;
-    gc_log_init2(op, queue_size, num_deferred_entries);
-    store->gc_operate(this, obj_names[i], &op);
-  }
-}
-
-void RGWGC::finalize()
-{
-  delete[] obj_names;
-}
-
-int RGWGC::tag_index(const string& tag)
-{
-  return rgw_shards_mod(XXH64(tag.c_str(), tag.size(), seed), max_objs);
-}
-
-std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWGC::send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag)
-{
-  ldpp_dout(this, 20) << "RGWGC::send_split_chain - tag is: " << tag << dendl;
-
-  if (cct->_conf->rgw_max_chunk_size) {
-    cls_rgw_obj_chain broken_chain;
-    ldpp_dout(this, 20) << "RGWGC::send_split_chain - rgw_max_chunk_size is: " << cct->_conf->rgw_max_chunk_size << dendl;
-
-    for (auto it = chain.objs.begin(); it != chain.objs.end(); it++) {
-      ldpp_dout(this, 20) << "RGWGC::send_split_chain - adding obj with name: " << it->key << dendl;
-      broken_chain.objs.emplace_back(*it);
-      cls_rgw_gc_obj_info info;
-      info.tag = tag;
-      info.chain = broken_chain;
-      cls_rgw_gc_set_entry_op op;
-      op.info = info;
-      size_t total_encoded_size = op.estimate_encoded_size();
-      ldpp_dout(this, 20) << "RGWGC::send_split_chain - total_encoded_size is: " << total_encoded_size << dendl;
-
-      if (total_encoded_size > cct->_conf->rgw_max_chunk_size) { //dont add to chain, and send to gc
-        broken_chain.objs.pop_back();
-        --it;
-        ldpp_dout(this, 20) << "RGWGC::send_split_chain - more than, dont add to broken chain and send chain" << dendl;
-        auto ret = send_chain(broken_chain, tag);
-        if (ret < 0) {
-          broken_chain.objs.insert(broken_chain.objs.end(), it, chain.objs.end()); // add all the remainder objs to the list to be deleted inline
-          ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
-          return {ret, {broken_chain}};
-        }
-        broken_chain.objs.clear();
-      }
-    }
-    if (!broken_chain.objs.empty()) { //when the chain is smaller than or equal to rgw_max_chunk_size
-      ldpp_dout(this, 20) << "RGWGC::send_split_chain - sending leftover objects" << dendl;
-      auto ret = send_chain(broken_chain, tag);
-      if (ret < 0) {
-        ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
-        return {ret, {broken_chain}};
-      }
-    }
-  } else {
-    auto ret = send_chain(chain, tag);
-    if (ret < 0) {
-      ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
-      return {ret, {std::move(chain)}};
-    }
-  }
-  return {0, {}};
-}
-
-int RGWGC::send_chain(const cls_rgw_obj_chain& chain, const string& tag)
-{
-  ObjectWriteOperation op;
-  cls_rgw_gc_obj_info info;
-  info.chain = chain;
-  info.tag = tag;
-  gc_log_enqueue2(op, cct->_conf->rgw_gc_obj_min_wait, info);
-
-  int i = tag_index(tag);
-
-  ldpp_dout(this, 20) << "RGWGC::send_chain - on object name: " << obj_names[i] << "tag is: " << tag << dendl;
-
-  auto ret = store->gc_operate(this, obj_names[i], &op);
-  if (ret != -ECANCELED && ret != -EPERM) {
-    return ret;
-  }
-  ObjectWriteOperation set_entry_op;
-  cls_rgw_gc_set_entry(set_entry_op, cct->_conf->rgw_gc_obj_min_wait, info);
-  return store->gc_operate(this, obj_names[i], &set_entry_op);
-}
-
-struct defer_chain_state {
-  librados::AioCompletion* completion = nullptr;
-  // TODO: hold a reference on the state in RGWGC to avoid use-after-free if
-  // RGWGC destructs before this completion fires
-  RGWGC* gc = nullptr;
-  cls_rgw_gc_obj_info info;
-
-  ~defer_chain_state() {
-    if (completion) {
-      completion->release();
-    }
-  }
-};
-
-static void async_defer_callback(librados::completion_t, void* arg)
-{
-  std::unique_ptr<defer_chain_state> state{static_cast<defer_chain_state*>(arg)};
-  if (state->completion->get_return_value() == -ECANCELED) {
-    state->gc->on_defer_canceled(state->info);
-  }
-}
-
-void RGWGC::on_defer_canceled(const cls_rgw_gc_obj_info& info)
-{
-  const std::string& tag = info.tag;
-  const int i = tag_index(tag);
-
-  // ECANCELED from cls_version_check() tells us that we've transitioned
-  transitioned_objects_cache[i] = true;
-
-  ObjectWriteOperation op;
-  cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info);
-  cls_rgw_gc_remove(op, {tag});
-
-  auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
-  store->gc_aio_operate(obj_names[i], c, &op);
-  c->release();
-}
-
-int RGWGC::async_defer_chain(const string& tag, const cls_rgw_obj_chain& chain)
-{
-  const int i = tag_index(tag);
-  cls_rgw_gc_obj_info info;
-  info.chain = chain;
-  info.tag = tag;
-
-  // if we've transitioned this shard object, we can rely on the cls_rgw_gc queue
-  if (transitioned_objects_cache[i]) {
-    ObjectWriteOperation op;
-    cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info);
-
-    // this tag may still be present in omap, so remove it once the cls_rgw_gc
-    // enqueue succeeds
-    cls_rgw_gc_remove(op, {tag});
-
-    auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
-    int ret = store->gc_aio_operate(obj_names[i], c, &op);
-    c->release();
-    return ret;
-  }
-
-  // if we haven't seen the transition yet, write the defer to omap with cls_rgw
-  ObjectWriteOperation op;
-
-  // assert that we haven't initialized cls_rgw_gc queue. this prevents us
-  // from writing new entries to omap after the transition
-  gc_log_defer1(op, cct->_conf->rgw_gc_obj_min_wait, info);
-
-  // prepare a callback to detect the transition via ECANCELED from cls_version_check()
-  auto state = std::make_unique<defer_chain_state>();
-  state->gc = this;
-  state->info.chain = chain;
-  state->info.tag = tag;
-  state->completion = librados::Rados::aio_create_completion(
-      state.get(), async_defer_callback);
-
-  int ret = store->gc_aio_operate(obj_names[i], state->completion, &op);
-  if (ret == 0) {
-    state.release(); // release ownership until async_defer_callback()
-  }
-  return ret;
-}
-
-int RGWGC::remove(int index, const std::vector<string>& tags, AioCompletion **pc)
-{
-  ObjectWriteOperation op;
-  cls_rgw_gc_remove(op, tags);
-
-  auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
-  int ret = store->gc_aio_operate(obj_names[index], c, &op);
-  if (ret < 0) {
-    c->release();
-  } else {
-    *pc = c;
-  }
-  return ret;
-}
-
-int RGWGC::remove(int index, int num_entries)
-{
-  ObjectWriteOperation op;
-  cls_rgw_gc_queue_remove_entries(op, num_entries);
-
-  return store->gc_operate(this, obj_names[index], &op);
-}
-
-int RGWGC::list(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
-{
-  result.clear();
-  string next_marker;
-  bool check_queue = false;
-
-  for (; *index < max_objs && result.size() < max; (*index)++, marker.clear(), check_queue = false) {
-    std::list<cls_rgw_gc_obj_info> entries, queue_entries;
-    int ret = 0;
-
-    //processing_queue is set to true from previous iteration if the queue was under process and probably has more elements in it.
-    if (! transitioned_objects_cache[*index] && ! check_queue && ! processing_queue) {
-      ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, max - result.size(), expired_only, entries, truncated, next_marker);
-      if (ret != -ENOENT && ret < 0) {
-        return ret;
-      }
-      obj_version objv;
-      cls_version_read(store->gc_pool_ctx, obj_names[*index], &objv);
-      if (ret == -ENOENT || entries.size() == 0) {
-        if (objv.ver == 0) {
-          continue;
-        } else {
-          if (! expired_only) {
-            transitioned_objects_cache[*index] = true;
-            marker.clear();
-          } else {
-            std::list<cls_rgw_gc_obj_info> non_expired_entries;
-            ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, 1, false, non_expired_entries, truncated, next_marker);
-            if (non_expired_entries.size() == 0) {
-              transitioned_objects_cache[*index] = true;
-              marker.clear();
-            }
-          }
-        }
-      }
-      if ((objv.ver == 1) && (entries.size() < max - result.size())) {
-        check_queue = true;
-        marker.clear();
-      }
-    }
-    if (transitioned_objects_cache[*index] || check_queue || processing_queue) {
-      processing_queue = false;
-      ret = cls_rgw_gc_queue_list_entries(store->gc_pool_ctx, obj_names[*index], marker, (max - result.size()) - entries.size(), expired_only, queue_entries, truncated, next_marker);
-      if (ret < 0) {
-        return ret;
-      }
-    }
-    if (entries.size() == 0 && queue_entries.size() == 0)
-      continue;
-
-    std::list<cls_rgw_gc_obj_info>::iterator iter;
-    for (iter = entries.begin(); iter != entries.end(); ++iter) {
-      result.push_back(*iter);
-    }
-
-    for (iter = queue_entries.begin(); iter != queue_entries.end(); ++iter) {
-      result.push_back(*iter);
-    }
-
-    marker = next_marker;
-
-    if (*index == max_objs - 1) {
-      if (queue_entries.size() > 0 && *truncated) {
-        processing_queue = true;
-      } else {
-        processing_queue = false;
-      }
-      /* we cut short here, truncated will hold the correct value */
-      return 0;
-    }
-
-    if (result.size() == max) {
-      if (queue_entries.size() > 0 && *truncated) {
-        processing_queue = true;
-      } else {
-        processing_queue = false;
-        *index += 1; //move to next gc object
-      }
-
-      /* close approximation, it might be that the next of the objects don't hold
-       * anything, in this case truncated should have been false, but we can find
-       * that out on the next iteration
-       */
-      *truncated = true;
-      return 0;
-    }
-  }
-  *truncated = false;
-  processing_queue = false;
-
-  return 0;
-}
-
-class RGWGCIOManager {
-  const DoutPrefixProvider* dpp;
-  CephContext *cct;
-  RGWGC *gc;
-
-  struct IO {
-    enum Type {
-      UnknownIO = 0,
-      TailIO = 1,
-      IndexIO = 2,
-    } type{UnknownIO};
-    librados::AioCompletion *c{nullptr};
-    string oid;
-    int index{-1};
-    string tag;
-  };
-
-  deque<IO> ios;
-  vector<std::vector<string> > remove_tags;
-  /* tracks the number of remaining shadow objects for a given tag in order to
-   * only remove the tag once all shadow objects have themselves been removed
-   */
-  vector<map<string, size_t> > tag_io_size;
-
-#define MAX_AIO_DEFAULT 10
-  size_t max_aio{MAX_AIO_DEFAULT};
-
-public:
-  RGWGCIOManager(const DoutPrefixProvider* _dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp),
-                                                                                  cct(_cct),
-                                                                                  gc(_gc) {
-    max_aio = cct->_conf->rgw_gc_max_concurrent_io;
-    remove_tags.resize(min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max()));
-    tag_io_size.resize(min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max()));
-  }
-
-  ~RGWGCIOManager() {
-    for (auto io : ios) {
-      io.c->release();
-    }
-  }
-
-  int schedule_io(IoCtx *ioctx, const string& oid, ObjectWriteOperation *op,
-                 int index, const string& tag) {
-    while (ios.size() > max_aio) {
-      if (gc->going_down()) {
-        return 0;
-      }
-      auto ret = handle_next_completion();
-      //Return error if we are using queue, else ignore it
-      if (gc->transitioned_objects_cache[index] && ret < 0) {
-        return ret;
-      }
-    }
-
-    auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
-    int ret = ioctx->aio_operate(oid, c, op);
-    if (ret < 0) {
-      return ret;
-    }
-    ios.push_back(IO{IO::TailIO, c, oid, index, tag});
-
-    return 0;
-  }
-
-  int handle_next_completion() {
-    ceph_assert(!ios.empty());
-    IO& io = ios.front();
-    io.c->wait_for_complete();
-    int ret = io.c->get_return_value();
-    io.c->release();
-
-    if (ret == -ENOENT) {
-      ret = 0;
-    }
-
-    if (io.type == IO::IndexIO && ! gc->transitioned_objects_cache[io.index]) {
-      if (ret < 0) {
-        ldpp_dout(dpp, 0) << "WARNING: gc cleanup of tags on gc shard index=" <<
-         io.index << " returned error, ret=" << ret << dendl;
-      }
-      goto done;
-    }
-
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "WARNING: gc could not remove oid=" << io.oid <<
-       ", ret=" << ret << dendl;
-      goto done;
-    }
-
-    if (! gc->transitioned_objects_cache[io.index]) {
-      schedule_tag_removal(io.index, io.tag);
-    }
-
-  done:
-    ios.pop_front();
-    return ret;
-  }
-
-  /* This is a request to schedule a tag removal. It will be called once when
-   * there are no shadow objects. But it will also be called for every shadow
-   * object when there are any. Since we do not want the tag to be removed
-   * until all shadow objects have been successfully removed, the scheduling
-   * will not happen until the shadow object count goes down to zero
-   */
-  void schedule_tag_removal(int index, string tag) {
-    auto& ts = tag_io_size[index];
-    auto ts_it = ts.find(tag);
-    if (ts_it != ts.end()) {
-      auto& size = ts_it->second;
-      --size;
-      // wait all shadow obj delete return
-      if (size != 0)
-        return;
-
-      ts.erase(ts_it);
-    }
-
-    auto& rt = remove_tags[index];
-
-    rt.push_back(tag);
-    if (rt.size() >= (size_t)cct->_conf->rgw_gc_max_trim_chunk) {
-      flush_remove_tags(index, rt);
-    }
-  }
-
-  void add_tag_io_size(int index, string tag, size_t size) {
-    auto& ts = tag_io_size[index];
-    ts.emplace(tag, size);
-  }
-
-  int drain_ios() {
-    int ret_val = 0;
-    while (!ios.empty()) {
-      if (gc->going_down()) {
-        return -EAGAIN;
-      }
-      auto ret = handle_next_completion();
-      if (ret < 0) {
-        ret_val = ret;
-      }
-    }
-    return ret_val;
-  }
-
-  void drain() {
-    drain_ios();
-    flush_remove_tags();
-    /* the tags draining might have generated more ios, drain those too */
-    drain_ios();
-  }
-
-  void flush_remove_tags(int index, vector<string>& rt) {
-    IO index_io;
-    index_io.type = IO::IndexIO;
-    index_io.index = index;
-
-    ldpp_dout(dpp, 20) << __func__ <<
-      " removing entries from gc log shard index=" << index << ", size=" <<
-      rt.size() << ", entries=" << rt << dendl;
-
-    auto rt_guard = make_scope_guard(
-      [&]
-       {
-         rt.clear();
-       }
-      );
-
-    int ret = gc->remove(index, rt, &index_io.c);
-    if (ret < 0) {
-      /* we already cleared list of tags, this prevents us from
-       * ballooning in case of a persistent problem
-       */
-      ldpp_dout(dpp, 0) << "WARNING: failed to remove tags on gc shard index=" <<
-       index << " ret=" << ret << dendl;
-      return;
-    }
-    if (perfcounter) {
-      /* log the count of tags retired for rate estimation */
-      perfcounter->inc(l_rgw_gc_retire, rt.size());
-    }
-    ios.push_back(index_io);
-  }
-
-  void flush_remove_tags() {
-    int index = 0;
-    for (auto& rt : remove_tags) {
-      if (! gc->transitioned_objects_cache[index]) {
-        flush_remove_tags(index, rt);
-      }
-      ++index;
-    }
-  }
-
-  int remove_queue_entries(int index, int num_entries) {
-    int ret = gc->remove(index, num_entries);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to remove queue entries on index=" <<
-           index << " ret=" << ret << dendl;
-      return ret;
-    }
-    if (perfcounter) {
-      /* log the count of tags retired for rate estimation */
-      perfcounter->inc(l_rgw_gc_retire, num_entries);
-    }
-    return 0;
-  }
-}; // class RGWGCIOManger
-
-int RGWGC::process(int index, int max_secs, bool expired_only,
-                   RGWGCIOManager& io_manager)
-{
-  ldpp_dout(this, 20) << "RGWGC::process entered with GC index_shard=" <<
-    index << ", max_secs=" << max_secs << ", expired_only=" <<
-    expired_only << dendl;
-
-  rados::cls::lock::Lock l(gc_index_lock_name);
-  utime_t end = ceph_clock_now();
-
-  /* max_secs should be greater than zero. We don't want a zero max_secs
-   * to be translated as no timeout, since we'd then need to break the
-   * lock and that would require a manual intervention. In this case
-   * we can just wait it out. */
-  if (max_secs <= 0)
-    return -EAGAIN;
-
-  end += max_secs;
-  utime_t time(max_secs, 0);
-  l.set_duration(time);
-
-  int ret = l.lock_exclusive(&store->gc_pool_ctx, obj_names[index]);
-  if (ret == -EBUSY) { /* already locked by another gc processor */
-    ldpp_dout(this, 10) << "RGWGC::process failed to acquire lock on " <<
-      obj_names[index] << dendl;
-    return 0;
-  }
-  if (ret < 0)
-    return ret;
-
-  string marker;
-  string next_marker;
-  bool truncated;
-  IoCtx *ctx = new IoCtx;
-  do {
-    int max = 100;
-    std::list<cls_rgw_gc_obj_info> entries;
-
-    int ret = 0;
-
-    if (! transitioned_objects_cache[index]) {
-      ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, max, expired_only, entries, &truncated, next_marker);
-      ldpp_dout(this, 20) <<
-      "RGWGC::process cls_rgw_gc_list returned with returned:" << ret <<
-      ", entries.size=" << entries.size() << ", truncated=" << truncated <<
-      ", next_marker='" << next_marker << "'" << dendl;
-      obj_version objv;
-      cls_version_read(store->gc_pool_ctx, obj_names[index], &objv);
-      if ((objv.ver == 1) && entries.size() == 0) {
-        std::list<cls_rgw_gc_obj_info> non_expired_entries;
-        ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, 1, false, non_expired_entries, &truncated, next_marker);
-        if (non_expired_entries.size() == 0) {
-          transitioned_objects_cache[index] = true;
-          marker.clear();
-          ldpp_dout(this, 20) << "RGWGC::process cls_rgw_gc_list returned NO non expired entries, so setting cache entry to TRUE" << dendl;
-        } else {
-          ret = 0;
-          goto done;
-        }
-      }
-      if ((objv.ver == 0) && (ret == -ENOENT || entries.size() == 0)) {
-        ret = 0;
-        goto done;
-      }
-    }
-
-    if (transitioned_objects_cache[index]) {
-      ret = cls_rgw_gc_queue_list_entries(store->gc_pool_ctx, obj_names[index], marker, max, expired_only, entries, &truncated, next_marker);
-      ldpp_dout(this, 20) <<
-      "RGWGC::process cls_rgw_gc_queue_list_entries returned with return value:" << ret <<
-      ", entries.size=" << entries.size() << ", truncated=" << truncated <<
-      ", next_marker='" << next_marker << "'" << dendl;
-      if (entries.size() == 0) {
-        ret = 0;
-        goto done;
-      }
-    }
-
-    if (ret < 0)
-      goto done;
-
-    marker = next_marker;
-
-    string last_pool;
-    std::list<cls_rgw_gc_obj_info>::iterator iter;
-    for (iter = entries.begin(); iter != entries.end(); ++iter) {
-      cls_rgw_gc_obj_info& info = *iter;
-
-      ldpp_dout(this, 20) << "RGWGC::process iterating over entry tag='" <<
-       info.tag << "', time=" << info.time << ", chain.objs.size()=" <<
-       info.chain.objs.size() << dendl;
-
-      std::list<cls_rgw_obj>::iterator liter;
-      cls_rgw_obj_chain& chain = info.chain;
-
-      utime_t now = ceph_clock_now();
-      if (now >= end) {
-        goto done;
-      }
-      if (! transitioned_objects_cache[index]) {
-        if (chain.objs.empty()) {
-          io_manager.schedule_tag_removal(index, info.tag);
-        } else {
-          io_manager.add_tag_io_size(index, info.tag, chain.objs.size());
-        }
-      }
-      if (! chain.objs.empty()) {
-       for (liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
-         cls_rgw_obj& obj = *liter;
-
-         if (obj.pool != last_pool) {
-           delete ctx;
-           ctx = new IoCtx;
-           ret = rgw_init_ioctx(this, store->get_rados_handle(), obj.pool, *ctx);
-           if (ret < 0) {
-        if (transitioned_objects_cache[index]) {
-          goto done;
-        }
-             last_pool = "";
-             ldpp_dout(this, 0) << "ERROR: failed to create ioctx pool=" <<
-               obj.pool << dendl;
-             continue;
-           }
-           last_pool = obj.pool;
-         }
-
-         ctx->locator_set_key(obj.loc);
-
-         const string& oid = obj.key.name; /* just stored raw oid there */
-
-         ldpp_dout(this, 5) << "RGWGC::process removing " << obj.pool <<
-           ":" << obj.key.name << dendl;
-         ObjectWriteOperation op;
-         cls_refcount_put(op, info.tag, true);
-
-         ret = io_manager.schedule_io(ctx, oid, &op, index, info.tag);
-         if (ret < 0) {
-           ldpp_dout(this, 0) <<
-             "WARNING: failed to schedule deletion for oid=" << oid << dendl;
-      if (transitioned_objects_cache[index]) {
-        //If deleting oid failed for any of them, we will not delete queue entries
-        goto done;
-      }
-         }
-         if (going_down()) {
-           // leave early, even if tag isn't removed, it's ok since it
-           // will be picked up next time around
-           goto done;
-         }
-       } // chains loop
-      } // else -- chains not empty
-    } // entries loop
-    if (transitioned_objects_cache[index] && entries.size() > 0) {
-      ret = io_manager.drain_ios();
-      if (ret < 0) {
-        goto done;
-      }
-      //Remove the entries from the queue
-      ldpp_dout(this, 5) << "RGWGC::process removing entries, marker: " << marker << dendl;
-      ret = io_manager.remove_queue_entries(index, entries.size());
-      if (ret < 0) {
-        ldpp_dout(this, 0) <<
-          "WARNING: failed to remove queue entries" << dendl;
-        goto done;
-      }
-    }
-  } while (truncated);
-
-done:
-  /* we don't drain here, because if we're going down we don't want to
-   * hold the system if backend is unresponsive
-   */
-  l.unlock(&store->gc_pool_ctx, obj_names[index]);
-  delete ctx;
-
-  return 0;
-}
-
-int RGWGC::process(bool expired_only)
-{
-  int max_secs = cct->_conf->rgw_gc_processor_max_time;
-
-  const int start = ceph::util::generate_random_number(0, max_objs - 1);
-
-  RGWGCIOManager io_manager(this, store->ctx(), this);
-
-  for (int i = 0; i < max_objs; i++) {
-    int index = (i + start) % max_objs;
-    int ret = process(index, max_secs, expired_only, io_manager);
-    if (ret < 0)
-      return ret;
-  }
-  if (!going_down()) {
-    io_manager.drain();
-  }
-
-  return 0;
-}
-
-bool RGWGC::going_down()
-{
-  return down_flag;
-}
-
-void RGWGC::start_processor()
-{
-  worker = new GCWorker(this, cct, this);
-  worker->create("rgw_gc");
-}
-
-void RGWGC::stop_processor()
-{
-  down_flag = true;
-  if (worker) {
-    worker->stop();
-    worker->join();
-  }
-  delete worker;
-  worker = NULL;
-}
-
-unsigned RGWGC::get_subsys() const
-{
-  return dout_subsys;
-}
-
-std::ostream& RGWGC::gen_prefix(std::ostream& out) const
-{
-  return out << "garbage collection: ";
-}
-
-void *RGWGC::GCWorker::entry() {
-  do {
-    utime_t start = ceph_clock_now();
-    ldpp_dout(dpp, 2) << "garbage collection: start" << dendl;
-    int r = gc->process(true);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: garbage collection process() returned error r=" << r << dendl;
-    }
-    ldpp_dout(dpp, 2) << "garbage collection: stop" << dendl;
-
-    if (gc->going_down())
-      break;
-
-    utime_t end = ceph_clock_now();
-    end -= start;
-    int secs = cct->_conf->rgw_gc_processor_period;
-
-    if (secs <= end.sec())
-      continue; // next round
-
-    secs -= end.sec();
-
-    std::unique_lock locker{lock};
-    cond.wait_for(locker, std::chrono::seconds(secs));
-  } while (!gc->going_down());
-
-  return NULL;
-}
-
-void RGWGC::GCWorker::stop()
-{
-  std::lock_guard l{lock};
-  cond.notify_all();
-}
diff --git a/src/rgw/store/rados/rgw_gc.h b/src/rgw/store/rados/rgw_gc.h
deleted file mode 100644 (file)
index 196f280..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_GC_H
-#define CEPH_RGW_GC_H
-
-
-#include "include/types.h"
-#include "include/rados/librados.hpp"
-#include "common/ceph_mutex.h"
-#include "common/Cond.h"
-#include "common/Thread.h"
-#include "rgw_common.h"
-#include "rgw_sal.h"
-#include "rgw_rados.h"
-#include "cls/rgw/cls_rgw_types.h"
-
-#include <atomic>
-
-class RGWGCIOManager;
-
-class RGWGC : public DoutPrefixProvider {
-  CephContext *cct;
-  RGWRados *store;
-  int max_objs;
-  std::string *obj_names;
-  std::atomic<bool> down_flag = { false };
-
-  static constexpr uint64_t seed = 8675309;
-
-  int tag_index(const std::string& tag);
-  int send_chain(const cls_rgw_obj_chain& chain, const std::string& tag);
-
-  class GCWorker : public Thread {
-    const DoutPrefixProvider *dpp;
-    CephContext *cct;
-    RGWGC *gc;
-    ceph::mutex lock = ceph::make_mutex("GCWorker");
-    ceph::condition_variable cond;
-
-  public:
-    GCWorker(const DoutPrefixProvider *_dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp), cct(_cct), gc(_gc) {}
-    void *entry() override;
-    void stop();
-  };
-
-  GCWorker *worker;
-public:
-  RGWGC() : cct(NULL), store(NULL), max_objs(0), obj_names(NULL), worker(NULL) {}
-  ~RGWGC() {
-    stop_processor();
-    finalize();
-  }
-  std::vector<bool> transitioned_objects_cache;
-  std::tuple<int, std::optional<cls_rgw_obj_chain>> send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag);
-
-  // asynchronously defer garbage collection on an object that's still being read
-  int async_defer_chain(const std::string& tag, const cls_rgw_obj_chain& info);
-
-  // callback for when async_defer_chain() fails with ECANCELED
-  void on_defer_canceled(const cls_rgw_gc_obj_info& info);
-
-  int remove(int index, const std::vector<std::string>& tags, librados::AioCompletion **pc);
-  int remove(int index, int num_entries);
-
-  void initialize(CephContext *_cct, RGWRados *_store);
-  void finalize();
-
-  int list(int *index, std::string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue);
-  void list_init(int *index) { *index = 0; }
-  int process(int index, int process_max_secs, bool expired_only,
-              RGWGCIOManager& io_manager);
-  int process(bool expired_only);
-
-  bool going_down();
-  void start_processor();
-  void stop_processor();
-
-  CephContext *get_cct() const override { return store->ctx(); }
-  unsigned get_subsys() const;
-
-  std::ostream& gen_prefix(std::ostream& out) const;
-
-};
-
-
-#endif
diff --git a/src/rgw/store/rados/rgw_gc_log.cc b/src/rgw/store/rados/rgw_gc_log.cc
deleted file mode 100644 (file)
index ad819ed..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_gc_log.h"
-
-#include "cls/rgw/cls_rgw_client.h"
-#include "cls/rgw_gc/cls_rgw_gc_client.h"
-#include "cls/version/cls_version_client.h"
-
-
-void gc_log_init2(librados::ObjectWriteOperation& op,
-                  uint64_t max_size, uint64_t max_deferred)
-{
-  obj_version objv; // objv.ver = 0
-  cls_version_check(op, objv, VER_COND_EQ);
-  cls_rgw_gc_queue_init(op, max_size, max_deferred);
-  objv.ver = 1;
-  cls_version_set(op, objv);
-}
-
-void gc_log_enqueue1(librados::ObjectWriteOperation& op,
-                     uint32_t expiration, cls_rgw_gc_obj_info& info)
-{
-  obj_version objv; // objv.ver = 0
-  cls_version_check(op, objv, VER_COND_EQ);
-  cls_rgw_gc_set_entry(op, expiration, info);
-}
-
-void gc_log_enqueue2(librados::ObjectWriteOperation& op,
-                     uint32_t expiration, const cls_rgw_gc_obj_info& info)
-{
-  obj_version objv;
-  objv.ver = 1;
-  cls_version_check(op, objv, VER_COND_EQ);
-  cls_rgw_gc_queue_enqueue(op, expiration, info);
-}
-
-void gc_log_defer1(librados::ObjectWriteOperation& op,
-                   uint32_t expiration, const cls_rgw_gc_obj_info& info)
-{
-  obj_version objv; // objv.ver = 0
-  cls_version_check(op, objv, VER_COND_EQ);
-  cls_rgw_gc_defer_entry(op, expiration, info.tag);
-}
-
-void gc_log_defer2(librados::ObjectWriteOperation& op,
-                   uint32_t expiration, const cls_rgw_gc_obj_info& info)
-{
-  obj_version objv;
-  objv.ver = 1;
-  cls_version_check(op, objv, VER_COND_EQ);
-  cls_rgw_gc_queue_defer_entry(op, expiration, info);
-  // TODO: conditional on whether omap is known to be empty
-  cls_rgw_gc_remove(op, {info.tag});
-}
diff --git a/src/rgw/store/rados/rgw_lc_tier.cc b/src/rgw/store/rados/rgw_lc_tier.cc
deleted file mode 100644 (file)
index 0ad2169..0000000
+++ /dev/null
@@ -1,1336 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include <string.h>
-#include <iostream>
-#include <map>
-
-#include "common/Formatter.h"
-#include <common/errno.h>
-#include "rgw_lc.h"
-#include "rgw_lc_tier.h"
-#include "rgw_string.h"
-#include "rgw_zone.h"
-#include "rgw_common.h"
-#include "rgw_rest.h"
-#include "svc_zone.h"
-
-#include <boost/algorithm/string/split.hpp>
-#include <boost/algorithm/string.hpp>
-#include <boost/algorithm/string/predicate.hpp>
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-struct rgw_lc_multipart_part_info {
-  int part_num{0};
-  uint64_t ofs{0};
-  uint64_t size{0};
-  std::string etag;
-};
-
-struct rgw_lc_obj_properties {
-  ceph::real_time mtime;
-  std::string etag;
-  uint64_t versioned_epoch{0};
-  std::map<std::string, RGWTierACLMapping>& target_acl_mappings;
-  std::string target_storage_class;
-
-  rgw_lc_obj_properties(ceph::real_time _mtime, std::string _etag,
-      uint64_t _versioned_epoch, std::map<std::string,
-      RGWTierACLMapping>& _t_acl_mappings,
-      std::string _t_storage_class) :
-    mtime(_mtime), etag(_etag),
-    versioned_epoch(_versioned_epoch),
-    target_acl_mappings(_t_acl_mappings),
-    target_storage_class(_t_storage_class) {}
-};
-
-struct rgw_lc_multipart_upload_info {
-  std::string upload_id;
-  uint64_t obj_size;
-  ceph::real_time mtime;
-  std::string etag;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(upload_id, bl);
-    encode(obj_size, bl);
-    encode(mtime, bl);
-    encode(etag, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(upload_id, bl);
-    decode(obj_size, bl);
-    decode(mtime, bl);
-    decode(etag, bl);
-    DECODE_FINISH(bl);
-  }
-};
-WRITE_CLASS_ENCODER(rgw_lc_multipart_upload_info)
-
-static inline string get_key_instance(const rgw_obj_key& key)
-{
-  if (!key.instance.empty() &&
-      !key.have_null_instance()) {
-    return "-" + key.instance;
-  }
-  return "";
-}
-
-static inline string get_key_oid(const rgw_obj_key& key)
-{
-  string oid = key.name;
-  if (!key.instance.empty() &&
-      !key.have_null_instance()) {
-    oid += string("-") + key.instance;
-  }
-  return oid;
-}
-
-static inline string obj_to_aws_path(const rgw_obj& obj)
-{
-  string path = obj.bucket.name + "/" + get_key_oid(obj.key);
-  return path;
-}
-
-static int read_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
-    const rgw_raw_obj *status_obj, rgw_lc_multipart_upload_info *status)
-{
-  int ret = 0;
-  rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
-
-  if (!rados) {
-    ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
-    return -1;
-  }
-
-  auto& pool = status_obj->pool;
-  const auto oid = status_obj->oid;
-  auto sysobj = rados->svc()->sysobj;
-  bufferlist bl;
-
-  ret = rgw_get_system_obj(sysobj, pool, oid, bl, nullptr, nullptr,
-      null_yield, dpp);
-
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (bl.length() > 0) {
-    try {
-      auto p = bl.cbegin();
-      status->decode(p);
-    } catch (buffer::error& e) {
-      ldpp_dout(dpp, 10) << "failed to decode status obj: "
-        << e.what() << dendl;
-      return -EIO;
-    }
-  } else {
-    return -EIO;
-  }
-
-  return 0;
-}
-
-static int put_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
-    const rgw_raw_obj *status_obj, rgw_lc_multipart_upload_info *status)
-{
-  int ret = 0;
-  rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
-
-  if (!rados) {
-    ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
-    return -1;
-  }
-
-  auto& pool = status_obj->pool;
-  const auto oid = status_obj->oid;
-  auto sysobj = rados->svc()->sysobj;
-  bufferlist bl;
-  status->encode(bl);
-
-  ret = rgw_put_system_obj(dpp, sysobj, pool, oid, bl, true, nullptr,
-      real_time{}, null_yield);
-
-  return ret;
-}
-
-static int delete_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
-    const rgw_raw_obj *status_obj)
-{
-  int ret = 0;
-  rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
-
-  if (!rados) {
-    ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
-    return -1;
-  }
-
-  auto& pool = status_obj->pool;
-  const auto oid = status_obj->oid;
-  auto sysobj = rados->svc()->sysobj;
-
-  ret = rgw_delete_system_obj(dpp, sysobj, pool, oid, nullptr, null_yield);
-
-  return ret;
-}
-
-static std::set<string> keep_headers = { "CONTENT_TYPE",
-                                         "CONTENT_ENCODING",
-                                         "CONTENT_DISPOSITION",
-                                         "CONTENT_LANGUAGE" };
-
-/*
- * mapping between rgw object attrs and output http fields
- *
- static const struct rgw_http_attr base_rgw_to_http_attrs[] = {
- { RGW_ATTR_CONTENT_LANG,      "Content-Language" },
- { RGW_ATTR_EXPIRES,           "Expires" },
- { RGW_ATTR_CACHE_CONTROL,     "Cache-Control" },
- { RGW_ATTR_CONTENT_DISP,      "Content-Disposition" },
- { RGW_ATTR_CONTENT_ENC,       "Content-Encoding" },
- { RGW_ATTR_USER_MANIFEST,     "X-Object-Manifest" },
- { RGW_ATTR_X_ROBOTS_TAG ,     "X-Robots-Tag" },
- { RGW_ATTR_STORAGE_CLASS ,    "X-Amz-Storage-Class" },
-// RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION header depends on access mode:
-// S3 endpoint: x-amz-website-redirect-location
-// S3Website endpoint: Location
-{ RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION, "x-amz-website-redirect-location" },
-}; */
-
-static void init_headers(map<string, bufferlist>& attrs,
-    map<string, string>& headers)
-{
-  for (auto& kv : attrs) {
-    const char * name = kv.first.c_str();
-    const auto aiter = rgw_to_http_attrs.find(name);
-
-    if (aiter != std::end(rgw_to_http_attrs)) {
-      headers[aiter->second] = rgw_bl_str(kv.second);
-    } else if (strncmp(name, RGW_ATTR_META_PREFIX,
-          sizeof(RGW_ATTR_META_PREFIX)-1) == 0) {
-      name += sizeof(RGW_ATTR_META_PREFIX) - 1;
-      string sname(name);
-      string name_prefix = RGW_ATTR_META_PREFIX;
-      char full_name_buf[name_prefix.size() + sname.size() + 1];
-      snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s",
-          static_cast<int>(name_prefix.length()),
-          name_prefix.data(),
-          static_cast<int>(sname.length()),
-          sname.data());
-      headers[full_name_buf] = rgw_bl_str(kv.second);
-    } else if (strcmp(name,RGW_ATTR_CONTENT_TYPE) == 0) {
-      headers["CONTENT_TYPE"] = rgw_bl_str(kv.second);
-    }
-  }
-}
-
-/* Read object or just head from remote endpoint. For now initializes only headers,
- * but can be extended to fetch etag, mtime etc if needed.
- */
-static int cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head,
-                         std::map<std::string, std::string>& headers) {
-  RGWRESTConn::get_obj_params req_params;
-  RGWBucketInfo b;
-  std::string target_obj_name;
-  int ret = 0;
-  std::unique_ptr<rgw::sal::Bucket> dest_bucket;
-  std::unique_ptr<rgw::sal::Object> dest_obj;
-  rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
-        tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
-        tier_ctx.target_storage_class);
-  std::string etag;
-  RGWRESTStreamRWRequest *in_req;
-
-  b.bucket.name = tier_ctx.target_bucket_name;
-  target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
-                    tier_ctx.obj->get_name();
-  if (!tier_ctx.o.is_current()) {
-    target_obj_name += get_key_instance(tier_ctx.obj->get_key());
-  }
-
-  ret = tier_ctx.driver->get_bucket(nullptr, b, &dest_bucket);
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_bucket - " << tier_ctx.target_bucket_name << " , reterr = " << ret << dendl;
-    return ret;
-  }
-
-  dest_obj = dest_bucket->get_object(rgw_obj_key(target_obj_name));
-  if (!dest_obj) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_object path - " << target_obj_name << dendl;
-    return -1;
-  }
-  /* init input connection */
-  req_params.get_op = !head;
-  req_params.prepend_metadata = true;
-  req_params.rgwx_stat = true;
-  req_params.sync_manifest = true;
-  req_params.skip_decrypt = true;
-
-  ret = tier_ctx.conn.get_obj(tier_ctx.dpp, dest_obj.get(), req_params, true /* send */, &in_req);
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: " << __func__ << "(): conn.get_obj() returned ret=" << ret << dendl;
-    return ret;
-  }
-
-  /* fetch headers */
-  ret = tier_ctx.conn.complete_request(in_req, nullptr, nullptr, nullptr, nullptr, &headers, null_yield);
-  if (ret < 0 && ret != -ENOENT) {
-    ldpp_dout(tier_ctx.dpp, 20) << "ERROR: " << __func__ << "(): conn.complete_request() returned ret=" << ret << dendl;
-    return ret;
-  }
-  return 0;
-}
-
-static bool is_already_tiered(const DoutPrefixProvider *dpp,
-                             std::map<std::string, std::string>& headers,
-                             ceph::real_time& mtime) {
-  char buf[32];
-  map<string, string> attrs = headers;
-
-  for (const auto& a : attrs) {
-    ldpp_dout(dpp, 20) << "GetCrf attr[" << a.first << "] = " << a.second <<dendl;
-  }
-  utime_t ut(mtime);
-  snprintf(buf, sizeof(buf), "%lld.%09lld",
-      (long long)ut.sec(),
-      (long long)ut.nsec());
-
-  string s = attrs["X_AMZ_META_RGWX_SOURCE_MTIME"];
-
-  if (s.empty())
-    s = attrs["x_amz_meta_rgwx_source_mtime"];
-
-  ldpp_dout(dpp, 20) << "is_already_tiered attrs[X_AMZ_META_RGWX_SOURCE_MTIME] = " << s <<dendl;
-  ldpp_dout(dpp, 20) << "is_already_tiered mtime buf = " << buf <<dendl;
-
-  if (!s.empty() && !strcmp(s.c_str(), buf)){
-    return 1;
-  }
-  return 0;
-}
-
-/* Read object locally & also initialize dest rest obj based on read attrs */
-class RGWLCStreamRead
-{
-  CephContext *cct;
-  const DoutPrefixProvider *dpp;
-  std::map<std::string, bufferlist> attrs;
-  uint64_t obj_size;
-  rgw::sal::Object *obj;
-  const real_time &mtime;
-
-  bool multipart{false};
-  uint64_t m_part_size{0};
-  off_t m_part_off{0};
-  off_t m_part_end{0};
-
-  std::unique_ptr<rgw::sal::Object::ReadOp> read_op;
-  off_t ofs{0};
-  off_t end{0};
-  rgw_rest_obj rest_obj;
-
-  int retcode{0};
-
-  public:
-  RGWLCStreamRead(CephContext *_cct, const DoutPrefixProvider *_dpp,
-      rgw::sal::Object *_obj, const real_time &_mtime) :
-    cct(_cct), dpp(_dpp), obj(_obj), mtime(_mtime),
-    read_op(obj->get_read_op()) {}
-
-  ~RGWLCStreamRead() {};
-  int set_range(off_t _ofs, off_t _end);
-  int get_range(off_t &_ofs, off_t &_end);
-  rgw_rest_obj& get_rest_obj();
-  void set_multipart(uint64_t part_size, off_t part_off, off_t part_end);
-  int init();
-  int init_rest_obj();
-  int read(off_t ofs, off_t end, RGWGetDataCB *out_cb);
-};
-
-/* Send PUT op to remote endpoint */
-class RGWLCCloudStreamPut
-{
-  const DoutPrefixProvider *dpp;
-  rgw_lc_obj_properties obj_properties;
-  RGWRESTConn& conn;
-  rgw::sal::Object *dest_obj;
-  std::string etag;
-  RGWRESTStreamS3PutObj *out_req{nullptr};
-
-  struct multipart_info {
-    bool is_multipart{false};
-    std::string upload_id;
-    int part_num{0};
-    uint64_t part_size;
-  } multipart;
-
-  int retcode;
-
-  public:
-  RGWLCCloudStreamPut(const DoutPrefixProvider *_dpp,
-      const rgw_lc_obj_properties&  _obj_properties,
-      RGWRESTConn& _conn,
-      rgw::sal::Object *_dest_obj) :
-    dpp(_dpp), obj_properties(_obj_properties), conn(_conn), dest_obj(_dest_obj) {
-    }
-  int init();
-  static bool keep_attr(const std::string& h);
-  static void init_send_attrs(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj,
-      const rgw_lc_obj_properties& obj_properties,
-      std::map<std::string, std::string>& attrs);
-  void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj);
-  void handle_headers(const std::map<std::string, std::string>& headers);
-  bool get_etag(std::string *petag);
-  void set_multipart(const std::string& upload_id, int part_num, uint64_t part_size);
-  int send();
-  RGWGetDataCB *get_cb();
-  int complete_request();
-};
-
-int RGWLCStreamRead::set_range(off_t _ofs, off_t _end) {
-  ofs = _ofs;
-  end = _end;
-
-  return 0;
-}
-
-int RGWLCStreamRead::get_range(off_t &_ofs, off_t &_end) {
-  _ofs = ofs;
-  _end = end;
-
-  return 0;
-}
-
-rgw_rest_obj& RGWLCStreamRead::get_rest_obj() {
-  return rest_obj;
-}
-
-void RGWLCStreamRead::set_multipart(uint64_t part_size, off_t part_off, off_t part_end) {
-  multipart = true;
-  m_part_size = part_size;
-  m_part_off = part_off;
-  m_part_end = part_end;
-}
-
-int RGWLCStreamRead::init() {
-  optional_yield y = null_yield;
-  real_time read_mtime;
-
-  read_op->params.lastmod = &read_mtime;
-
-  int ret = read_op->prepare(y, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: fail to prepare read_op, ret = " << ret << dendl;
-    return ret;
-  }
-
-  if (read_mtime != mtime) {
-    /* raced */
-    return -ECANCELED;
-  }
-
-  attrs = obj->get_attrs();
-  obj_size = obj->get_obj_size();
-
-  ret = init_rest_obj();
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: fail to initialize rest_obj, ret = " << ret << dendl;
-    return ret;
-  }
-
-  if (!multipart) {
-    set_range(0, obj_size - 1);
-  } else {
-    set_range(m_part_off, m_part_end);
-  }
-  return 0;
-}
-
-int RGWLCStreamRead::init_rest_obj() {
-  /* Initialize rgw_rest_obj. 
-   * Reference: do_decode_rest_obj
-   * Check how to copy headers content */ 
-  rest_obj.init(obj->get_key());
-
-  if (!multipart) {
-    rest_obj.content_len = obj_size;
-  } else {
-    rest_obj.content_len = m_part_size;
-  }
-
-  /* For mulitpart attrs are sent as part of InitMultipartCR itself */
-  if (multipart) {
-    return 0;
-  }
-
-  /*
-   * XXX: verify if its right way to copy attrs into rest obj
-   */
-  init_headers(attrs, rest_obj.attrs);
-
-  rest_obj.acls.set_ctx(cct);
-  const auto aiter = attrs.find(RGW_ATTR_ACL);
-  if (aiter != attrs.end()) {
-    bufferlist& bl = aiter->second;
-    auto bliter = bl.cbegin();
-    try {
-      rest_obj.acls.decode(bliter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to decode policy off attrs" << dendl;
-      return -EIO;
-    }
-  } else {
-    ldpp_dout(dpp, 0) << "WARNING: acl attrs not provided" << dendl;
-  }
-  return 0;
-}
-
-int RGWLCStreamRead::read(off_t ofs, off_t end, RGWGetDataCB *out_cb) {
-  int ret = read_op->iterate(dpp, ofs, end, out_cb, null_yield);
-  return ret;
-}
-
-int RGWLCCloudStreamPut::init() {
-  /* init output connection */
-  if (multipart.is_multipart) {
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%d", multipart.part_num);
-    rgw_http_param_pair params[] = { { "uploadId", multipart.upload_id.c_str() },
-                                     { "partNumber", buf },
-                                     { nullptr, nullptr } };
-    conn.put_obj_send_init(dest_obj, params, &out_req);
-  } else {
-    conn.put_obj_send_init(dest_obj, nullptr, &out_req);
-  }
-
-  return 0;
-}
-
-bool RGWLCCloudStreamPut::keep_attr(const string& h) {
-  return (keep_headers.find(h) != keep_headers.end() ||
-      boost::algorithm::starts_with(h, "X_AMZ_"));
-}
-
-void RGWLCCloudStreamPut::init_send_attrs(const DoutPrefixProvider *dpp,
-    const rgw_rest_obj& rest_obj,
-    const rgw_lc_obj_properties& obj_properties,
-    std::map<string, string>& attrs) {
-
-  map<string, RGWTierACLMapping>& acl_mappings(obj_properties.target_acl_mappings);
-  const std::string& target_storage_class = obj_properties.target_storage_class;
-
-  attrs.clear();
-
-  for (auto& hi : rest_obj.attrs) {
-    if (keep_attr(hi.first)) {
-      attrs.insert(hi);
-    }
-  }
-
-  const auto acl = rest_obj.acls.get_acl();
-
-  map<int, vector<string> > access_map;
-
-  if (!acl_mappings.empty()) {
-    for (auto& grant : acl.get_grant_map()) {
-      auto& orig_grantee = grant.first;
-      auto& perm = grant.second;
-
-      string grantee;
-
-      const auto& am = acl_mappings;
-
-      const auto iter = am.find(orig_grantee);
-      if (iter == am.end()) {
-        ldpp_dout(dpp, 20) << "acl_mappings: Could not find " << orig_grantee << " .. ignoring" << dendl;
-        continue;
-      }
-
-      grantee = iter->second.dest_id;
-
-      string type;
-
-      switch (iter->second.type) {
-        case ACL_TYPE_CANON_USER:
-          type = "id";
-          break;
-        case ACL_TYPE_EMAIL_USER:
-          type = "emailAddress";
-          break;
-        case ACL_TYPE_GROUP:
-          type = "uri";
-          break;
-        default:
-          continue;
-      }
-
-      string tv = type + "=" + grantee;
-
-      int flags = perm.get_permission().get_permissions();
-      if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) {
-        access_map[flags].push_back(tv);
-        continue;
-      }
-
-      for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) {
-        if (flags & i) {
-          access_map[i].push_back(tv);
-        }
-      }
-    }
-  }
-
-  for (const auto& aiter : access_map) {
-    int grant_type = aiter.first;
-
-    string header_str("x-amz-grant-");
-
-    switch (grant_type) {
-      case RGW_PERM_READ:
-        header_str.append("read");
-        break;
-      case RGW_PERM_WRITE:
-        header_str.append("write");
-        break;
-      case RGW_PERM_READ_ACP:
-        header_str.append("read-acp");
-        break;
-      case RGW_PERM_WRITE_ACP:
-        header_str.append("write-acp");
-        break;
-      case RGW_PERM_FULL_CONTROL:
-        header_str.append("full-control");
-        break;
-    }
-
-    string s;
-
-    for (const auto& viter : aiter.second) {
-      if (!s.empty()) {
-        s.append(", ");
-      }
-      s.append(viter);
-    }
-
-    ldpp_dout(dpp, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl;
-
-    attrs[header_str] = s;
-  }
-
-  /* Copy target storage class */
-  if (!target_storage_class.empty()) {
-    attrs["x-amz-storage-class"] = target_storage_class;
-  } else {
-    attrs["x-amz-storage-class"] = "STANDARD";
-  }
-
-  /* New attribute to specify its transitioned from RGW */
-  attrs["x-amz-meta-rgwx-source"] = "rgw";
-
-  char buf[32];
-  snprintf(buf, sizeof(buf), "%llu", (long long)obj_properties.versioned_epoch);
-  attrs["x-amz-meta-rgwx-versioned-epoch"] = buf;
-
-  utime_t ut(obj_properties.mtime);
-  snprintf(buf, sizeof(buf), "%lld.%09lld",
-      (long long)ut.sec(),
-      (long long)ut.nsec());
-
-  attrs["x-amz-meta-rgwx-source-mtime"] = buf;
-  attrs["x-amz-meta-rgwx-source-etag"] = obj_properties.etag;
-  attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name;
-  if (!rest_obj.key.instance.empty()) {
-    attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance;
-  }
-  for (const auto& a : attrs) {
-    ldpp_dout(dpp, 30) << "init_send_attrs attr[" << a.first << "] = " << a.second <<dendl;
-  }
-}
-
-void RGWLCCloudStreamPut::send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) {
-  auto r = static_cast<RGWRESTStreamS3PutObj *>(out_req);
-
-  std::map<std::string, std::string> new_attrs;
-  if (!multipart.is_multipart) {
-    init_send_attrs(dpp, rest_obj, obj_properties, new_attrs);
-  }
-
-  r->set_send_length(rest_obj.content_len);
-
-  RGWAccessControlPolicy policy;
-
-  r->send_ready(dpp, conn.get_key(), new_attrs, policy);
-}
-
-void RGWLCCloudStreamPut::handle_headers(const map<string, string>& headers) {
-  for (const auto& h : headers) {
-    if (h.first == "ETAG") {
-      etag = h.second;
-    }
-  }
-}
-
-bool RGWLCCloudStreamPut::get_etag(string *petag) {
-  if (etag.empty()) {
-    return false;
-  }
-  *petag = etag;
-  return true;
-}
-
-void RGWLCCloudStreamPut::set_multipart(const string& upload_id, int part_num, uint64_t part_size) {
-  multipart.is_multipart = true;
-  multipart.upload_id = upload_id;
-  multipart.part_num = part_num;
-  multipart.part_size = part_size;
-}
-
-int RGWLCCloudStreamPut::send() {
-  int ret = RGWHTTP::send(out_req);
-  return ret;
-}
-
-RGWGetDataCB *RGWLCCloudStreamPut::get_cb() {
-  return out_req->get_out_cb();
-}
-
-int RGWLCCloudStreamPut::complete_request() {
-  int ret = conn.complete_request(out_req, etag, &obj_properties.mtime, null_yield);
-  return ret;
-}
-
-/* Read local copy and write to Cloud endpoint */
-static int cloud_tier_transfer_object(const DoutPrefixProvider* dpp,
-                            RGWLCStreamRead* readf, RGWLCCloudStreamPut* writef) {
-  std::string url;
-  bufferlist bl;
-  bool sent_attrs{false};
-  int ret{0};
-  off_t ofs;
-  off_t end;
-
-  ret = readf->init();
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: fail to initialize in_crf, ret = " << ret << dendl;
-    return ret;
-  }
-  readf->get_range(ofs, end);
-  rgw_rest_obj& rest_obj = readf->get_rest_obj();
-  if (!sent_attrs) {
-    ret = writef->init();
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: fail to initialize out_crf, ret = " << ret << dendl;
-      return ret;
-    }
-
-    writef->send_ready(dpp, rest_obj);
-    ret = writef->send();
-    if (ret < 0) {
-      return ret;
-    }
-    sent_attrs = true;
-  }
-
-  ret = readf->read(ofs, end, writef->get_cb());
-
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: fail to read from in_crf, ret = " << ret << dendl;
-    return ret;
-  }
-
-  ret = writef->complete_request();
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: fail to complete request, ret = " << ret << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-static int cloud_tier_plain_transfer(RGWLCCloudTierCtx& tier_ctx) {
-  int ret;
-  std::unique_ptr<rgw::sal::Bucket> dest_bucket;
-  std::unique_ptr<rgw::sal::Object> dest_obj;
-
-  rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
-                        tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
-                        tier_ctx.target_storage_class);
-  RGWBucketInfo b;
-  std::string target_obj_name;
-
-  b.bucket.name = tier_ctx.target_bucket_name;
-  target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
-    tier_ctx.obj->get_name();
-  if (!tier_ctx.o.is_current()) {
-    target_obj_name += get_key_instance(tier_ctx.obj->get_key());
-  }
-
-  ret = tier_ctx.driver->get_bucket(nullptr, b, &dest_bucket);
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_bucket - " << tier_ctx.target_bucket_name << " , ret = " << ret << dendl;
-    return ret;
-  }
-
-  dest_obj = dest_bucket->get_object(rgw_obj_key(target_obj_name));
-  if (!dest_obj) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_object path - " << target_obj_name << dendl;
-    return -1;
-  }
-
-  tier_ctx.obj->set_atomic();
-
-  /* Prepare Read from source */
-  /* TODO: Define readf, writef as stack variables. For some reason,
-   * when used as stack variables (esp., readf), the transition seems to
-   * be taking lot of time eventually erroring out at times.
-   */
-  std::shared_ptr<RGWLCStreamRead> readf;
-  readf.reset(new RGWLCStreamRead(tier_ctx.cct, tier_ctx.dpp,
-        tier_ctx.obj, tier_ctx.o.meta.mtime));
-
-  std::shared_ptr<RGWLCCloudStreamPut> writef;
-  writef.reset(new RGWLCCloudStreamPut(tier_ctx.dpp, obj_properties, tier_ctx.conn,
-               dest_obj.get()));
-
-  /* actual Read & Write */
-  ret = cloud_tier_transfer_object(tier_ctx.dpp, readf.get(), writef.get());
-
-  return ret;
-}
-
-static int cloud_tier_send_multipart_part(RGWLCCloudTierCtx& tier_ctx,
-                                const std::string& upload_id,
-                                const rgw_lc_multipart_part_info& part_info,
-                                std::string *petag) {
-  int ret;
-  std::unique_ptr<rgw::sal::Bucket> dest_bucket;
-  std::unique_ptr<rgw::sal::Object> dest_obj;
-
-  rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
-                        tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
-                        tier_ctx.target_storage_class);
-  RGWBucketInfo b;
-  std::string target_obj_name;
-  off_t end;
-
-  b.bucket.name = tier_ctx.target_bucket_name;
-  target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
-    tier_ctx.obj->get_name();
-  if (!tier_ctx.o.is_current()) {
-    target_obj_name += get_key_instance(tier_ctx.obj->get_key());
-  }
-
-  ret = tier_ctx.driver->get_bucket(nullptr, b, &dest_bucket);
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_bucket - " << tier_ctx.target_bucket_name << " , ret = " << ret << dendl;
-    return ret;
-  }
-
-  dest_obj = dest_bucket->get_object(rgw_obj_key(target_obj_name));
-  if (!dest_obj) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize dest_object path - " << target_obj_name << dendl;
-    return -1;
-  }
-
-  tier_ctx.obj->set_atomic();
-
-  /* TODO: Define readf, writef as stack variables. For some reason,
-   * when used as stack variables (esp., readf), the transition seems to
-   * be taking lot of time eventually erroring out at times. */
-  std::shared_ptr<RGWLCStreamRead> readf;
-  readf.reset(new RGWLCStreamRead(tier_ctx.cct, tier_ctx.dpp,
-        tier_ctx.obj, tier_ctx.o.meta.mtime));
-
-  std::shared_ptr<RGWLCCloudStreamPut> writef;
-  writef.reset(new RGWLCCloudStreamPut(tier_ctx.dpp, obj_properties, tier_ctx.conn,
-               dest_obj.get()));
-
-  /* Prepare Read from source */
-  end = part_info.ofs + part_info.size - 1;
-  readf->set_multipart(part_info.size, part_info.ofs, end);
-
-  /* Prepare write */
-  writef->set_multipart(upload_id, part_info.part_num, part_info.size);
-
-  /* actual Read & Write */
-  ret = cloud_tier_transfer_object(tier_ctx.dpp, readf.get(), writef.get());
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (!(writef->get_etag(petag))) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to get etag from PUT request" << dendl;
-    return -EIO;
-  }
-
-  return 0;
-}
-
-static int cloud_tier_abort_multipart(const DoutPrefixProvider *dpp,
-      RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
-      const std::string& upload_id) {
-  int ret;
-  bufferlist out_bl;
-  bufferlist bl;
-  rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
-
-  string resource = obj_to_aws_path(dest_obj);
-  ret = dest_conn.send_resource(dpp, "DELETE", resource, params, nullptr,
-      out_bl, &bl, nullptr, null_yield);
-
-
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (ret=" << ret << ")" << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-static int cloud_tier_init_multipart(const DoutPrefixProvider *dpp,
-      RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
-      uint64_t obj_size, std::map<std::string, std::string>& attrs,
-      std::string& upload_id) {
-  bufferlist out_bl;
-  bufferlist bl;
-
-  struct InitMultipartResult {
-    std::string bucket;
-    std::string key;
-    std::string upload_id;
-
-    void decode_xml(XMLObj *obj) {
-      RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
-      RGWXMLDecoder::decode_xml("Key", key, obj);
-      RGWXMLDecoder::decode_xml("UploadId", upload_id, obj);
-    }
-  } result;
-
-  int ret;
-  rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} };
-
-  string resource = obj_to_aws_path(dest_obj);
-
-  ret = dest_conn.send_resource(dpp, "POST", resource, params, &attrs,
-      out_bl, &bl, nullptr, null_yield);
-
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl;
-    return ret;
-  }
-  /*
-   * If one of the following fails we cannot abort upload, as we cannot
-   * extract the upload id. If one of these fail it's very likely that that's
-   * the least of our problem.
-   */
-  RGWXMLDecoder::XMLParser parser;
-  if (!parser.init()) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
-    return -EIO;
-  }
-
-  if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
-    string str(out_bl.c_str(), out_bl.length());
-    ldpp_dout(dpp, 5) << "ERROR: failed to parse xml initmultipart: " << str << dendl;
-    return -EIO;
-  }
-
-  try {
-    RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true);
-  } catch (RGWXMLDecoder::err& err) {
-    string str(out_bl.c_str(), out_bl.length());
-    ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
-    return -EIO;
-  }
-
-  ldpp_dout(dpp, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl;
-
-  upload_id = result.upload_id;
-
-  return 0;
-}
-
-static int cloud_tier_complete_multipart(const DoutPrefixProvider *dpp,
-      RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
-      std::string& upload_id,
-      const std::map<int, rgw_lc_multipart_part_info>& parts) {
-  rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
-
-  stringstream ss;
-  XMLFormatter formatter;
-  int ret;
-
-  bufferlist bl, out_bl;
-  string resource = obj_to_aws_path(dest_obj);
-
-  struct CompleteMultipartReq {
-    std::map<int, rgw_lc_multipart_part_info> parts;
-
-    explicit CompleteMultipartReq(const std::map<int, rgw_lc_multipart_part_info>& _parts) : parts(_parts) {}
-
-    void dump_xml(Formatter *f) const {
-      for (const auto& p : parts) {
-        f->open_object_section("Part");
-        encode_xml("PartNumber", p.first, f);
-        encode_xml("ETag", p.second.etag, f);
-        f->close_section();
-      };
-    }
-  } req_enc(parts);
-
-  struct CompleteMultipartResult {
-    std::string location;
-    std::string bucket;
-    std::string key;
-    std::string etag;
-
-    void decode_xml(XMLObj *obj) {
-      RGWXMLDecoder::decode_xml("Location", bucket, obj);
-      RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
-      RGWXMLDecoder::decode_xml("Key", key, obj);
-      RGWXMLDecoder::decode_xml("ETag", etag, obj);
-    }
-  } result;
-
-  encode_xml("CompleteMultipartUpload", req_enc, &formatter);
-
-  formatter.flush(ss);
-  bl.append(ss.str());
-
-  ret = dest_conn.send_resource(dpp, "POST", resource, params, nullptr,
-      out_bl, &bl, nullptr, null_yield);
-
-
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload for dest object=" << dest_obj << dendl;
-    return ret;
-  }
-  /*
-   * If one of the following fails we cannot abort upload, as we cannot
-   * extract the upload id. If one of these fail it's very likely that that's
-   * the least of our problem.
-   */
-  RGWXMLDecoder::XMLParser parser;
-  if (!parser.init()) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
-    return -EIO;
-  }
-
-  if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
-    string str(out_bl.c_str(), out_bl.length());
-    ldpp_dout(dpp, 5) << "ERROR: failed to parse xml Completemultipart: " << str << dendl;
-    return -EIO;
-  }
-
-  try {
-    RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true);
-  } catch (RGWXMLDecoder::err& err) {
-    string str(out_bl.c_str(), out_bl.length());
-    ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
-    return -EIO;
-  }
-
-  ldpp_dout(dpp, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl;
-
-  return ret;
-}
-
-static int cloud_tier_abort_multipart_upload(RGWLCCloudTierCtx& tier_ctx,
-      const rgw_obj& dest_obj, const rgw_raw_obj& status_obj,
-      const std::string& upload_id) {
-  int ret;
-
-  ret = cloud_tier_abort_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, upload_id);
-
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " ret=" << ret << dendl;
-    /* ignore error, best effort */
-  }
-  /* remove status obj */
-  ret = delete_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj);
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to remove sync status obj obj=" << status_obj << " ret=" << ret << dendl;
-    // ignore error, best effort 
-  }
-  return 0;
-}
-
-static int cloud_tier_multipart_transfer(RGWLCCloudTierCtx& tier_ctx) {
-  rgw_obj src_obj;
-  rgw_obj dest_obj;
-
-  uint64_t obj_size;
-  std::string src_etag;
-  rgw_rest_obj rest_obj;
-
-  rgw_lc_multipart_upload_info status;
-
-  std::map<std::string, std::string> new_attrs;
-
-  rgw_raw_obj status_obj;
-
-  RGWBucketInfo b;
-  std::string target_obj_name;
-  rgw_bucket target_bucket;
-
-  int ret;
-
-  rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
-        tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
-        tier_ctx.target_storage_class);
-
-  uint32_t part_size{0};
-  uint32_t num_parts{0};
-
-  int cur_part{0};
-  uint64_t cur_ofs{0};
-  std::map<int, rgw_lc_multipart_part_info> parts;
-
-  obj_size = tier_ctx.o.meta.size;
-
-  target_bucket.name = tier_ctx.target_bucket_name;
-
-  target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
-    tier_ctx.obj->get_name();
-  if (!tier_ctx.o.is_current()) {
-    target_obj_name += get_key_instance(tier_ctx.obj->get_key());
-  }
-  dest_obj.init(target_bucket, target_obj_name);
-
-  rgw_pool pool = static_cast<rgw::sal::RadosStore*>(tier_ctx.driver)->svc()->zone->get_zone_params().log_pool;
-  status_obj = rgw_raw_obj(pool, "lc_multipart_" + tier_ctx.obj->get_oid());
-
-  ret = read_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj, &status);
-
-  if (ret < 0 && ret != -ENOENT) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to read sync status of object " << src_obj << " ret=" << ret << dendl;
-    return ret;
-  }
-
-  if (ret >= 0) {
-    // check here that mtime and size did not change 
-    if (status.mtime != obj_properties.mtime || status.obj_size != obj_size ||
-        status.etag != obj_properties.etag) {
-      cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
-      ret = -ENOENT;
-    }
-  }
-
-  if (ret == -ENOENT) { 
-    RGWLCStreamRead readf(tier_ctx.cct, tier_ctx.dpp, tier_ctx.obj, tier_ctx.o.meta.mtime);
-
-    readf.init();
-
-    rest_obj = readf.get_rest_obj();
-
-    RGWLCCloudStreamPut::init_send_attrs(tier_ctx.dpp, rest_obj, obj_properties, new_attrs);
-
-    ret = cloud_tier_init_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, obj_size, new_attrs, status.upload_id);
-    if (ret < 0) {
-      return ret;
-    }
-
-    status.obj_size = obj_size;
-    status.mtime = obj_properties.mtime;
-    status.etag = obj_properties.etag;
-
-    ret = put_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj, &status);
-
-    if (ret < 0) {
-      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to driver multipart upload state, ret=" << ret << dendl;
-      // continue with upload anyway 
-    }
-
-#define MULTIPART_MAX_PARTS 10000
-    uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS;
-    uint64_t min_conf_size = tier_ctx.multipart_min_part_size;
-
-    if (min_conf_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
-      min_conf_size = MULTIPART_MIN_POSSIBLE_PART_SIZE;
-    }
-
-    part_size = std::max(min_conf_size, min_part_size);
-    num_parts = (obj_size + part_size - 1) / part_size;
-    cur_part = 1;
-    cur_ofs = 0;
-  }
-
-  for (; (uint32_t)cur_part <= num_parts; ++cur_part) {
-    ldpp_dout(tier_ctx.dpp, 20) << "cur_part = "<< cur_part << ", info.ofs = " << cur_ofs << ", info.size = " << part_size << ", obj size = " << obj_size<< ", num_parts:" << num_parts << dendl;
-    rgw_lc_multipart_part_info& cur_part_info = parts[cur_part];
-    cur_part_info.part_num = cur_part;
-    cur_part_info.ofs = cur_ofs;
-    cur_part_info.size = std::min((uint64_t)part_size, obj_size - cur_ofs);
-
-    cur_ofs += cur_part_info.size;
-
-    ret = cloud_tier_send_multipart_part(tier_ctx,
-            status.upload_id,
-            cur_part_info,
-            &cur_part_info.etag);
-
-    if (ret < 0) {
-      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to send multipart part of obj=" << tier_ctx.obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << cur_part << " (error: " << cpp_strerror(-ret) << ")" << dendl;
-      cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
-      return ret;
-    }
-
-  }
-
-  ret = cloud_tier_complete_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, status.upload_id, parts);
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to complete multipart upload of obj=" << tier_ctx.obj << " (error: " << cpp_strerror(-ret) << ")" << dendl;
-    cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
-    return ret;
-  }
-
-  /* remove status obj */
-  ret = delete_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj);
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to abort multipart upload obj=" << tier_ctx.obj << " upload_id=" << status.upload_id << " part number " << cur_part << " (" << cpp_strerror(-ret) << ")" << dendl;
-    // ignore error, best effort 
-  }
-  return 0;
-}
-
-/* Check if object has already been transitioned */
-static int cloud_tier_check_object(RGWLCCloudTierCtx& tier_ctx, bool& already_tiered) {
-  int ret;
-  std::map<std::string, std::string> headers;
-
-  /* Fetch Head object */
-  ret = cloud_tier_get_object(tier_ctx, true, headers);
-
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to fetch HEAD from cloud for obj=" << tier_ctx.obj << " , ret = " << ret << dendl;
-    return ret;
-  }
-
-  already_tiered = is_already_tiered(tier_ctx.dpp, headers, tier_ctx.o.meta.mtime);
-
-  if (already_tiered) {
-    ldpp_dout(tier_ctx.dpp, 20) << "is_already_tiered true" << dendl;
-  } else {
-    ldpp_dout(tier_ctx.dpp, 20) << "is_already_tiered false..going with out_crf writing" << dendl;
-  }
-
-  return ret;
-}
-
-static int cloud_tier_create_bucket(RGWLCCloudTierCtx& tier_ctx) {
-  bufferlist out_bl;
-  int ret = 0;
-  pair<string, string> key(tier_ctx.storage_class, tier_ctx.target_bucket_name);
-  struct CreateBucketResult {
-    std::string code;
-
-    void decode_xml(XMLObj *obj) {
-      RGWXMLDecoder::decode_xml("Code", code, obj);
-    }
-  } result;
-
-  ldpp_dout(tier_ctx.dpp, 30) << "Cloud_tier_ctx: creating bucket:" << tier_ctx.target_bucket_name << dendl;
-  bufferlist bl;
-  string resource = tier_ctx.target_bucket_name;
-
-  ret = tier_ctx.conn.send_resource(tier_ctx.dpp, "PUT", resource, nullptr, nullptr,
-                                    out_bl, &bl, nullptr, null_yield);
-
-  if (ret < 0 ) {
-    ldpp_dout(tier_ctx.dpp, 0) << "create target bucket : " << tier_ctx.target_bucket_name << " returned ret:" << ret << dendl;
-  }
-  if (out_bl.length() > 0) {
-    RGWXMLDecoder::XMLParser parser;
-    if (!parser.init()) {
-      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize xml parser for parsing create_bucket response from server" << dendl;
-      return -EIO;
-    }
-
-    if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
-      string str(out_bl.c_str(), out_bl.length());
-      ldpp_dout(tier_ctx.dpp, 5) << "ERROR: failed to parse xml createbucket: " << str << dendl;
-      return -EIO;
-    }
-
-    try {
-      RGWXMLDecoder::decode_xml("Error", result, &parser, true);
-    } catch (RGWXMLDecoder::err& err) {
-      string str(out_bl.c_str(), out_bl.length());
-      ldpp_dout(tier_ctx.dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
-      return -EIO;
-    }
-
-    if (result.code != "BucketAlreadyOwnedByYou" && result.code != "BucketAlreadyExists") {
-      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: Creating target bucket failed with error: " << result.code << dendl;
-      return -EIO;
-    }
-  }
-
-  return 0;
-}
-
-int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets) {
-  int ret = 0;
-
-  // check if target_path is already created
-  std::set<std::string>::iterator it;
-
-  it = cloud_targets.find(tier_ctx.target_bucket_name);
-  tier_ctx.target_bucket_created = (it != cloud_targets.end());
-
-  /* If run first time attempt to create the target bucket */
-  if (!tier_ctx.target_bucket_created) {
-    ret = cloud_tier_create_bucket(tier_ctx);
-
-    if (ret < 0) {
-      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to create target bucket on the cloud endpoint ret=" << ret << dendl;
-      return ret;
-    }
-    tier_ctx.target_bucket_created = true;
-    cloud_targets.insert(tier_ctx.target_bucket_name);
-  }
-
-  /* Since multiple zones may try to transition the same object to the cloud,
-   * verify if the object is already transitioned. And since its just a best
-   * effort, do not bail out in case of any errors.
-   */
-  bool already_tiered = false;
-  ret = cloud_tier_check_object(tier_ctx, already_tiered);
-
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to check object on the cloud endpoint ret=" << ret << dendl;
-  }
-
-  if (already_tiered) {
-    ldpp_dout(tier_ctx.dpp, 20) << "Object (" << tier_ctx.o.key << ") is already tiered" << dendl;
-    return 0;
-  }
-
-  uint64_t size = tier_ctx.o.meta.size;
-  uint64_t multipart_sync_threshold = tier_ctx.multipart_sync_threshold;
-
-  if (multipart_sync_threshold < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
-    multipart_sync_threshold = MULTIPART_MIN_POSSIBLE_PART_SIZE;
-  }
-
-  if (size < multipart_sync_threshold) {
-    ret = cloud_tier_plain_transfer(tier_ctx);
-  } else {
-    tier_ctx.is_multipart_upload = true;
-    ret = cloud_tier_multipart_transfer(tier_ctx);
-  } 
-
-  if (ret < 0) {
-    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to transition object ret=" << ret << dendl;
-  }
-
-  return ret;
-}
diff --git a/src/rgw/store/rados/rgw_lc_tier.h b/src/rgw/store/rados/rgw_lc_tier.h
deleted file mode 100644 (file)
index 1b21f26..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_LC_TIER_H
-#define CEPH_RGW_LC_TIER_H
-
-#include "rgw_lc.h"
-#include "rgw_rest_conn.h"
-#include "rgw_rados.h"
-#include "rgw_zone.h"
-#include "rgw_sal_rados.h"
-#include "rgw_cr_rest.h"
-
-#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
-#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024)
-
-struct RGWLCCloudTierCtx {
-  CephContext *cct;
-  const DoutPrefixProvider *dpp;
-
-  /* Source */
-  rgw_bucket_dir_entry& o;
-  rgw::sal::Driver *driver;
-  RGWBucketInfo& bucket_info;
-  std::string storage_class;
-
-  rgw::sal::Object *obj;
-
-  /* Remote */
-  RGWRESTConn& conn;
-  std::string target_bucket_name;
-  std::string target_storage_class;
-
-  std::map<std::string, RGWTierACLMapping> acl_mappings;
-  uint64_t multipart_min_part_size;
-  uint64_t multipart_sync_threshold;
-
-  bool is_multipart_upload{false};
-  bool target_bucket_created{true};
-
-  RGWLCCloudTierCtx(CephContext* _cct, const DoutPrefixProvider *_dpp,
-      rgw_bucket_dir_entry& _o, rgw::sal::Driver *_driver,
-      RGWBucketInfo &_binfo, rgw::sal::Object *_obj,
-      RGWRESTConn& _conn, std::string& _bucket,
-      std::string& _storage_class) :
-    cct(_cct), dpp(_dpp), o(_o), driver(_driver), bucket_info(_binfo),
-    obj(_obj), conn(_conn), target_bucket_name(_bucket),
-    target_storage_class(_storage_class) {}
-};
-
-/* Transition object to cloud endpoint */
-int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets);
-
-#endif
diff --git a/src/rgw/store/rados/rgw_log_backing.cc b/src/rgw/store/rados/rgw_log_backing.cc
deleted file mode 100644 (file)
index 7c9dafe..0000000
+++ /dev/null
@@ -1,708 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "cls/log/cls_log_client.h"
-#include "cls/version/cls_version_client.h"
-
-#include "rgw_log_backing.h"
-#include "rgw_tools.h"
-#include "cls_fifo_legacy.h"
-
-using namespace std::chrono_literals;
-namespace cb = ceph::buffer;
-
-static constexpr auto dout_subsys = ceph_subsys_rgw;
-
-enum class shard_check { dne, omap, fifo, corrupt };
-inline std::ostream& operator <<(std::ostream& m, const shard_check& t) {
-  switch (t) {
-  case shard_check::dne:
-    return m << "shard_check::dne";
-  case shard_check::omap:
-    return m << "shard_check::omap";
-  case shard_check::fifo:
-    return m << "shard_check::fifo";
-  case shard_check::corrupt:
-    return m << "shard_check::corrupt";
-  }
-
-  return m << "shard_check::UNKNOWN=" << static_cast<uint32_t>(t);
-}
-
-namespace {
-/// Return the shard type, and a bool to see whether it has entries.
-shard_check
-probe_shard(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
-           bool& fifo_unsupported, optional_yield y)
-{
-  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << " probing oid=" << oid
-                    << dendl;
-  if (!fifo_unsupported) {
-    std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
-    auto r = rgw::cls::fifo::FIFO::open(dpp, ioctx, oid,
-                                       &fifo, y,
-                                       std::nullopt, true);
-    switch (r) {
-    case 0:
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << ": oid=" << oid << " is FIFO"
-                        << dendl;
-      return shard_check::fifo;
-
-    case -ENODATA:
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << ": oid=" << oid << " is empty and therefore OMAP"
-                        << dendl;
-      return shard_check::omap;
-
-    case -ENOENT:
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << ": oid=" << oid << " does not exist"
-                        << dendl;
-      return shard_check::dne;
-
-    case -EPERM:
-      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << ": FIFO is unsupported, marking."
-                        << dendl;
-      fifo_unsupported = true;
-      return shard_check::omap;
-
-    default:
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                        << ": error probing: r=" << r
-                        << ", oid=" << oid << dendl;
-      return shard_check::corrupt;
-    }
-  } else {
-    // Since FIFO is unsupported, OMAP is the only alternative
-    return shard_check::omap;
-  }
-}
-
-tl::expected<log_type, bs::error_code>
-handle_dne(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx,
-          log_type def,
-          std::string oid,
-          bool fifo_unsupported,
-          optional_yield y)
-{
-  if (def == log_type::fifo) {
-    if (fifo_unsupported) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " WARNING: FIFO set as default but not supported by OSD. "
-                << "Falling back to OMAP." << dendl;
-      return log_type::omap;
-    }
-    std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
-    auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid,
-                                         &fifo, y,
-                                         std::nullopt);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " error creating FIFO: r=" << r
-                << ", oid=" << oid << dendl;
-      return tl::unexpected(bs::error_code(-r, bs::system_category()));
-    }
-  }
-  return def;
-}
-}
-
-tl::expected<log_type, bs::error_code>
-log_backing_type(const DoutPrefixProvider *dpp, 
-                 librados::IoCtx& ioctx,
-                log_type def,
-                int shards,
-                const fu2::unique_function<std::string(int) const>& get_oid,
-                optional_yield y)
-{
-  auto check = shard_check::dne;
-  bool fifo_unsupported = false;
-  for (int i = 0; i < shards; ++i) {
-    auto c = probe_shard(dpp, ioctx, get_oid(i), fifo_unsupported, y);
-    if (c == shard_check::corrupt)
-      return tl::unexpected(bs::error_code(EIO, bs::system_category()));
-    if (c == shard_check::dne) continue;
-    if (check == shard_check::dne) {
-      check = c;
-      continue;
-    }
-
-    if (check != c) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << " clashing types: check=" << check
-                << ", c=" << c << dendl;
-      return tl::unexpected(bs::error_code(EIO, bs::system_category()));
-    }
-  }
-  if (check == shard_check::corrupt) {
-    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << " should be unreachable!" << dendl;
-    return tl::unexpected(bs::error_code(EIO, bs::system_category()));
-  }
-
-  if (check == shard_check::dne)
-    return handle_dne(dpp, ioctx,
-                     def,
-                     get_oid(0),
-                     fifo_unsupported,
-                     y);
-
-  return (check == shard_check::fifo ? log_type::fifo : log_type::omap);
-}
-
-bs::error_code log_remove(const DoutPrefixProvider *dpp, 
-                          librados::IoCtx& ioctx,
-                         int shards,
-                         const fu2::unique_function<std::string(int) const>& get_oid,
-                         bool leave_zero,
-                         optional_yield y)
-{
-  bs::error_code ec;
-  for (int i = 0; i < shards; ++i) {
-    auto oid = get_oid(i);
-    rados::cls::fifo::info info;
-    uint32_t part_header_size = 0, part_entry_overhead = 0;
-
-    auto r = rgw::cls::fifo::get_meta(dpp, ioctx, oid, std::nullopt, &info,
-                                     &part_header_size, &part_entry_overhead,
-                                     0, y, true);
-    if (r == -ENOENT) continue;
-    if (r == 0 && info.head_part_num > -1) {
-      for (auto j = info.tail_part_num; j <= info.head_part_num; ++j) {
-       librados::ObjectWriteOperation op;
-       op.remove();
-       auto part_oid = info.part_oid(j);
-       auto subr = rgw_rados_operate(dpp, ioctx, part_oid, &op, null_yield);
-       if (subr < 0 && subr != -ENOENT) {
-         if (!ec)
-           ec = bs::error_code(-subr, bs::system_category());
-         ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << ": failed removing FIFO part: part_oid=" << part_oid
-                    << ", subr=" << subr << dendl;
-       }
-      }
-    }
-    if (r < 0 && r != -ENODATA) {
-      if (!ec)
-       ec = bs::error_code(-r, bs::system_category());
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": failed checking FIFO part: oid=" << oid
-                << ", r=" << r << dendl;
-    }
-    librados::ObjectWriteOperation op;
-    if (i == 0 && leave_zero) {
-      // Leave shard 0 in existence, but remove contents and
-      // omap. cls_lock stores things in the xattrs. And sync needs to
-      // rendezvous with locks on generation 0 shard 0.
-      op.omap_set_header({});
-      op.omap_clear();
-      op.truncate(0);
-    } else {
-      op.remove();
-    }
-    r = rgw_rados_operate(dpp, ioctx, oid, &op, null_yield);
-    if (r < 0 && r != -ENOENT) {
-      if (!ec)
-       ec = bs::error_code(-r, bs::system_category());
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": failed removing shard: oid=" << oid
-                << ", r=" << r << dendl;
-    }
-  }
-  return ec;
-}
-
-logback_generations::~logback_generations() {
-  if (watchcookie > 0) {
-    auto cct = static_cast<CephContext*>(ioctx.cct());
-    auto r = ioctx.unwatch2(watchcookie);
-    if (r < 0) {
-      lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": failed unwatching oid=" << oid
-                << ", r=" << r << dendl;
-    }
-  }
-}
-
-bs::error_code logback_generations::setup(const DoutPrefixProvider *dpp,
-                                          log_type def,
-                                         optional_yield y) noexcept
-{
-  try {
-    // First, read.
-    auto cct = static_cast<CephContext*>(ioctx.cct());
-    auto res = read(dpp, y);
-    if (!res && res.error() != bs::errc::no_such_file_or_directory) {
-      return res.error();
-    }
-    if (res) {
-      std::unique_lock lock(m);
-      std::tie(entries_, version) = std::move(*res);
-    } else {
-      // Are we the first? Then create generation 0 and the generations
-      // metadata.
-      librados::ObjectWriteOperation op;
-      auto type = log_backing_type(dpp, ioctx, def, shards,
-                                  [this](int shard) {
-                                    return this->get_oid(0, shard);
-                                  }, y);
-      if (!type)
-       return type.error();
-
-      logback_generation l;
-      l.type = *type;
-
-      std::unique_lock lock(m);
-      version.ver = 1;
-      static constexpr auto TAG_LEN = 24;
-      version.tag.clear();
-      append_rand_alpha(cct, version.tag, version.tag, TAG_LEN);
-      op.create(true);
-      cls_version_set(op, version);
-      cb::list bl;
-      entries_.emplace(0, std::move(l));
-      encode(entries_, bl);
-      lock.unlock();
-
-      op.write_full(bl);
-      auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
-      if (r < 0 && r != -EEXIST) {
-       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << ": failed writing oid=" << oid
-                  << ", r=" << r << dendl;
-       bs::system_error(-r, bs::system_category());
-      }
-      // Did someone race us? Then re-read.
-      if (r != 0) {
-       res = read(dpp, y);
-       if (!res)
-         return res.error();
-       if (res->first.empty())
-         return bs::error_code(EIO, bs::system_category());
-       auto l = res->first.begin()->second;
-       // In the unlikely event that someone raced us, created
-       // generation zero, incremented, then erased generation zero,
-       // don't leave generation zero lying around.
-       if (l.gen_id != 0) {
-         auto ec = log_remove(dpp, ioctx, shards,
-                              [this](int shard) {
-                                return this->get_oid(0, shard);
-                              }, true, y);
-         if (ec) return ec;
-       }
-       std::unique_lock lock(m);
-       std::tie(entries_, version) = std::move(*res);
-      }
-    }
-    // Pass all non-empty generations to the handler
-    std::unique_lock lock(m);
-    auto i = lowest_nomempty(entries_);
-    entries_t e;
-    std::copy(i, entries_.cend(),
-             std::inserter(e, e.end()));
-    m.unlock();
-    auto ec = watch();
-    if (ec) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": failed to re-establish watch, unsafe to continue: oid="
-                << oid << ", ec=" << ec.message() << dendl;
-    }
-    return handle_init(std::move(e));
-  } catch (const std::bad_alloc&) {
-    return bs::error_code(ENOMEM, bs::system_category());
-  }
-}
-
-bs::error_code logback_generations::update(const DoutPrefixProvider *dpp, optional_yield y) noexcept
-{
-  try {
-    auto res = read(dpp, y);
-    if (!res) {
-      return res.error();
-    }
-
-    std::unique_lock l(m);
-    auto& [es, v] = *res;
-    if (v == version) {
-      // Nothing to do!
-      return {};
-    }
-
-    // Check consistency and prepare update
-    if (es.empty()) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": INCONSISTENCY! Read empty update." << dendl;
-      return bs::error_code(EFAULT, bs::system_category());
-    }
-    auto cur_lowest = lowest_nomempty(entries_);
-    // Straight up can't happen
-    assert(cur_lowest != entries_.cend());
-    auto new_lowest = lowest_nomempty(es);
-    if (new_lowest == es.cend()) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": INCONSISTENCY! Read update with no active head." << dendl;
-      return bs::error_code(EFAULT, bs::system_category());
-    }
-    if (new_lowest->first < cur_lowest->first) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": INCONSISTENCY! Tail moved wrong way." << dendl;
-      return bs::error_code(EFAULT, bs::system_category());
-    }
-
-    std::optional<uint64_t> highest_empty;
-    if (new_lowest->first > cur_lowest->first && new_lowest != es.begin()) {
-      --new_lowest;
-      highest_empty = new_lowest->first;
-    }
-
-    entries_t new_entries;
-
-    if ((es.end() - 1)->first < (entries_.end() - 1)->first) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": INCONSISTENCY! Head moved wrong way." << dendl;
-      return bs::error_code(EFAULT, bs::system_category());
-    }
-
-    if ((es.end() - 1)->first > (entries_.end() - 1)->first) {
-      auto ei = es.lower_bound((entries_.end() - 1)->first + 1);
-      std::copy(ei, es.end(), std::inserter(new_entries, new_entries.end()));
-    }
-
-    // Everything checks out!
-
-    version = v;
-    entries_ = es;
-    l.unlock();
-
-    if (highest_empty) {
-      auto ec = handle_empty_to(*highest_empty);
-      if (ec) return ec;
-    }
-
-    if (!new_entries.empty()) {
-      auto ec = handle_new_gens(std::move(new_entries));
-      if (ec) return ec;
-    }
-  } catch (const std::bad_alloc&) {
-    return bs::error_code(ENOMEM, bs::system_category());
-  }
-  return {};
-}
-
-auto logback_generations::read(const DoutPrefixProvider *dpp, optional_yield y) noexcept ->
-  tl::expected<std::pair<entries_t, obj_version>, bs::error_code>
-{
-  try {
-    librados::ObjectReadOperation op;
-    std::unique_lock l(m);
-    cls_version_check(op, version, VER_COND_GE);
-    l.unlock();
-    obj_version v2;
-    cls_version_read(op, &v2);
-    cb::list bl;
-    op.read(0, 0, &bl, nullptr);
-    auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
-    if (r < 0) {
-      if (r == -ENOENT) {
-       ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                     << ": oid=" << oid
-                     << " not found" << dendl;
-      } else {
-       ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                  << ": failed reading oid=" << oid
-                  << ", r=" << r << dendl;
-      }
-      return tl::unexpected(bs::error_code(-r, bs::system_category()));
-    }
-    auto bi = bl.cbegin();
-    entries_t e;
-    try {
-      decode(e, bi);
-    } catch (const cb::error& err) {
-      return tl::unexpected(err.code());
-    }
-    return std::pair{ std::move(e), std::move(v2) };
-  } catch (const std::bad_alloc&) {
-    return tl::unexpected(bs::error_code(ENOMEM, bs::system_category()));
-  }
-}
-
-bs::error_code logback_generations::write(const DoutPrefixProvider *dpp, entries_t&& e,
-                                         std::unique_lock<std::mutex>&& l_,
-                                         optional_yield y) noexcept
-{
-  auto l = std::move(l_);
-  ceph_assert(l.mutex() == &m &&
-             l.owns_lock());
-  try {
-    librados::ObjectWriteOperation op;
-    cls_version_check(op, version, VER_COND_GE);
-    cb::list bl;
-    encode(e, bl);
-    op.write_full(bl);
-    cls_version_inc(op);
-    auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
-    if (r == 0) {
-      entries_ = std::move(e);
-      version.inc();
-      return {};
-    }
-    l.unlock();
-    if (r < 0 && r != -ECANCELED) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": failed reading oid=" << oid
-                << ", r=" << r << dendl;
-      return { -r, bs::system_category() };
-    }
-    if (r == -ECANCELED) {
-      auto ec = update(dpp, y);
-      if (ec) {
-       return ec;
-      } else {
-       return { ECANCELED, bs::system_category() };
-      }
-    }
-  } catch (const std::bad_alloc&) {
-    return { ENOMEM, bs::system_category() };
-  }
-  return {};
-}
-
-
-bs::error_code logback_generations::watch() noexcept {
-  try {
-    auto cct = static_cast<CephContext*>(ioctx.cct());
-    auto r = ioctx.watch2(oid, &watchcookie, this);
-    if (r < 0) {
-      lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": failed to set watch oid=" << oid
-                << ", r=" << r << dendl;
-      return { -r, bs::system_category() };
-    }
-  } catch (const std::bad_alloc&) {
-    return bs::error_code(ENOMEM, bs::system_category());
-  }
-  return {};
-}
-
-bs::error_code logback_generations::new_backing(const DoutPrefixProvider *dpp, 
-                                                log_type type,
-                                               optional_yield y) noexcept {
-  static constexpr auto max_tries = 10;
-  try {
-    auto ec = update(dpp, y);
-    if (ec) return ec;
-    auto tries = 0;
-    entries_t new_entries;
-    do {
-      std::unique_lock l(m);
-      auto last = entries_.end() - 1;
-      if (last->second.type == type) {
-       // Nothing to be done
-       return {};
-      }
-      auto newgenid = last->first + 1;
-      logback_generation newgen;
-      newgen.gen_id = newgenid;
-      newgen.type = type;
-      new_entries.emplace(newgenid, newgen);
-      auto es = entries_;
-      es.emplace(newgenid, std::move(newgen));
-      ec = write(dpp, std::move(es), std::move(l), y);
-      ++tries;
-    } while (ec == bs::errc::operation_canceled &&
-            tries < max_tries);
-    if (tries >= max_tries) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": exhausted retry attempts." << dendl;
-      return ec;
-    }
-
-    if (ec) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": write failed with ec=" << ec.message() << dendl;
-      return ec;
-    }
-
-    cb::list bl, rbl;
-
-    auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": notify failed with r=" << r << dendl;
-      return { -r, bs::system_category() };
-    }
-    ec = handle_new_gens(new_entries);
-  } catch (const std::bad_alloc&) {
-    return bs::error_code(ENOMEM, bs::system_category());
-  }
-  return {};
-}
-
-bs::error_code logback_generations::empty_to(const DoutPrefixProvider *dpp, 
-                                             uint64_t gen_id,
-                                            optional_yield y) noexcept {
-  static constexpr auto max_tries = 10;
-  try {
-    auto ec = update(dpp, y);
-    if (ec) return ec;
-    auto tries = 0;
-    uint64_t newtail = 0;
-    do {
-      std::unique_lock l(m);
-      {
-       auto last = entries_.end() - 1;
-       if (gen_id >= last->first) {
-         ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                    << ": Attempt to trim beyond the possible." << dendl;
-         return bs::error_code(EINVAL, bs::system_category());
-       }
-      }
-      auto es = entries_;
-      auto ei = es.upper_bound(gen_id);
-      if (ei == es.begin()) {
-       // Nothing to be done.
-       return {};
-      }
-      for (auto i = es.begin(); i < ei; ++i) {
-       newtail = i->first;
-       i->second.pruned = ceph::real_clock::now();
-      }
-      ec = write(dpp, std::move(es), std::move(l), y);
-      ++tries;
-    } while (ec == bs::errc::operation_canceled &&
-            tries < max_tries);
-    if (tries >= max_tries) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": exhausted retry attempts." << dendl;
-      return ec;
-    }
-
-    if (ec) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": write failed with ec=" << ec.message() << dendl;
-      return ec;
-    }
-
-    cb::list bl, rbl;
-
-    auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": notify failed with r=" << r << dendl;
-      return { -r, bs::system_category() };
-    }
-    ec = handle_empty_to(newtail);
-  } catch (const std::bad_alloc&) {
-    return bs::error_code(ENOMEM, bs::system_category());
-  }
-  return {};
-}
-
-bs::error_code logback_generations::remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept {
-  static constexpr auto max_tries = 10;
-  try {
-    auto ec = update(dpp, y);
-    if (ec) return ec;
-    auto tries = 0;
-    entries_t new_entries;
-    std::unique_lock l(m);
-    ceph_assert(!entries_.empty());
-    {
-      auto i = lowest_nomempty(entries_);
-      if (i == entries_.begin()) {
-       return {};
-      }
-    }
-    entries_t es;
-    auto now = ceph::real_clock::now();
-    l.unlock();
-    do {
-      std::copy_if(entries_.cbegin(), entries_.cend(),
-                  std::inserter(es, es.end()),
-                  [now](const auto& e) {
-                    if (!e.second.pruned)
-                      return false;
-
-                    auto pruned = *e.second.pruned;
-                    return (now - pruned) >= 1h;
-                  });
-      auto es2 = entries_;
-      for (const auto& [gen_id, e] : es) {
-       ceph_assert(e.pruned);
-       auto ec = log_remove(dpp, ioctx, shards,
-                            [this, gen_id = gen_id](int shard) {
-                              return this->get_oid(gen_id, shard);
-                            }, (gen_id == 0), y);
-       if (ec) {
-         ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                            << ": Error pruning: gen_id=" << gen_id
-                            << " ec=" << ec.message() << dendl;
-       }
-       if (auto i = es2.find(gen_id); i != es2.end()) {
-         es2.erase(i);
-       }
-      }
-      l.lock();
-      es.clear();
-      ec = write(dpp, std::move(es2), std::move(l), y);
-      ++tries;
-    } while (ec == bs::errc::operation_canceled &&
-            tries < max_tries);
-    if (tries >= max_tries) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": exhausted retry attempts." << dendl;
-      return ec;
-    }
-
-    if (ec) {
-      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                << ": write failed with ec=" << ec.message() << dendl;
-      return ec;
-    }
-  } catch (const std::bad_alloc&) {
-    return bs::error_code(ENOMEM, bs::system_category());
-  }
-  return {};
-}
-
-void logback_generations::handle_notify(uint64_t notify_id,
-                                       uint64_t cookie,
-                                       uint64_t notifier_id,
-                                       bufferlist& bl)
-{
-  auto cct = static_cast<CephContext*>(ioctx.cct());
-  const DoutPrefix dp(cct, dout_subsys, "logback generations handle_notify: ");
-  if (notifier_id != my_id) {
-    auto ec = update(&dp, null_yield);
-    if (ec) {
-      lderr(cct)
-       << __PRETTY_FUNCTION__ << ":" << __LINE__
-       << ": update failed, no one to report to and no safe way to continue."
-       << dendl;
-      abort();
-    }
-  }
-  cb::list rbl;
-  ioctx.notify_ack(oid, notify_id, watchcookie, rbl);
-}
-
-void logback_generations::handle_error(uint64_t cookie, int err) {
-  auto cct = static_cast<CephContext*>(ioctx.cct());
-  auto r = ioctx.unwatch2(watchcookie);
-  if (r < 0) {
-    lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << ": failed to set unwatch oid=" << oid
-              << ", r=" << r << dendl;
-  }
-
-  auto ec = watch();
-  if (ec) {
-    lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
-              << ": failed to re-establish watch, unsafe to continue: oid="
-              << oid << ", ec=" << ec.message() << dendl;
-  }
-}
diff --git a/src/rgw/store/rados/rgw_log_backing.h b/src/rgw/store/rados/rgw_log_backing.h
deleted file mode 100644 (file)
index 3fa67d7..0000000
+++ /dev/null
@@ -1,399 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_LOGBACKING_H
-#define CEPH_RGW_LOGBACKING_H
-
-#include <optional>
-#include <iostream>
-#include <string>
-#include <string_view>
-
-#include <strings.h>
-
-#include <boost/container/flat_map.hpp>
-#include <boost/system/error_code.hpp>
-
-#undef FMT_HEADER_ONLY
-#define FMT_HEADER_ONLY 1
-#include <fmt/format.h>
-
-#include "include/rados/librados.hpp"
-#include "include/encoding.h"
-#include "include/expected.hpp"
-#include "include/function2.hpp"
-
-#include "cls/version/cls_version_types.h"
-
-#include "common/async/yield_context.h"
-#include "common/Formatter.h"
-#include "common/strtol.h"
-
-namespace bc = boost::container;
-namespace bs = boost::system;
-
-#include "cls_fifo_legacy.h"
-
-/// Type of log backing, stored in the mark used in the quick check,
-/// and passed to checking functions.
-enum class log_type {
-  omap = 0,
-  fifo = 1
-};
-
-inline void encode(const log_type& type, ceph::buffer::list& bl) {
-  auto t = static_cast<uint8_t>(type);
-  encode(t, bl);
-}
-
-inline void decode(log_type& type, bufferlist::const_iterator& bl) {
-  uint8_t t;
-  decode(t, bl);
-  type = static_cast<log_type>(t);
-}
-
-inline std::optional<log_type> to_log_type(std::string_view s) {
-  if (strncasecmp(s.data(), "omap", s.length()) == 0) {
-    return log_type::omap;
-  } else if (strncasecmp(s.data(), "fifo", s.length()) == 0) {
-    return log_type::fifo;
-  } else {
-    return std::nullopt;
-  }
-}
-inline std::ostream& operator <<(std::ostream& m, const log_type& t) {
-  switch (t) {
-  case log_type::omap:
-    return m << "log_type::omap";
-  case log_type::fifo:
-    return m << "log_type::fifo";
-  }
-
-  return m << "log_type::UNKNOWN=" << static_cast<uint32_t>(t);
-}
-
-/// Look over the shards in a log and determine the type.
-tl::expected<log_type, bs::error_code>
-log_backing_type(const DoutPrefixProvider *dpp, 
-                 librados::IoCtx& ioctx,
-                log_type def,
-                int shards, //< Total number of shards
-                /// A function taking a shard number and
-                /// returning an oid.
-                const fu2::unique_function<std::string(int) const>& get_oid,
-                optional_yield y);
-
-/// Remove all log shards and associated parts of fifos.
-bs::error_code log_remove(librados::IoCtx& ioctx,
-                         int shards, //< Total number of shards
-                         /// A function taking a shard number and
-                         /// returning an oid.
-                         const fu2::unique_function<std::string(int) const>& get_oid,
-                         bool leave_zero,
-                         optional_yield y);
-
-
-struct logback_generation {
-  uint64_t gen_id = 0;
-  log_type type;
-  std::optional<ceph::real_time> pruned;
-
-  void encode(ceph::buffer::list& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(gen_id, bl);
-    encode(type, bl);
-    encode(pruned, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(gen_id, bl);
-    decode(type, bl);
-    decode(pruned, bl);
-    DECODE_FINISH(bl);
-  }
-};
-WRITE_CLASS_ENCODER(logback_generation)
-inline std::ostream& operator <<(std::ostream& m, const logback_generation& g) {
-  return m << "[" << g.gen_id << "," << g.type << ","
-          << (g.pruned ? "PRUNED" : "NOT PRUNED") << "]";
-}
-
-class logback_generations : public librados::WatchCtx2 {
-public:
-  using entries_t = bc::flat_map<uint64_t, logback_generation>;
-
-protected:
-  librados::IoCtx& ioctx;
-  logback_generations(librados::IoCtx& ioctx,
-                     std::string oid,
-                     fu2::unique_function<std::string(
-                       uint64_t, int) const>&& get_oid,
-                     int shards) noexcept
-    : ioctx(ioctx), oid(oid), get_oid(std::move(get_oid)),
-      shards(shards) {}
-
-    uint64_t my_id = ioctx.get_instance_id();
-
-private:
-  const std::string oid;
-  const fu2::unique_function<std::string(uint64_t, int) const> get_oid;
-
-protected:
-  const int shards;
-
-private:
-
-  uint64_t watchcookie = 0;
-
-  obj_version version;
-  std::mutex m;
-  entries_t entries_;
-
-  tl::expected<std::pair<entries_t, obj_version>, bs::error_code>
-  read(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
-  bs::error_code write(const DoutPrefixProvider *dpp, entries_t&& e, std::unique_lock<std::mutex>&& l_,
-                      optional_yield y) noexcept;
-  bs::error_code setup(const DoutPrefixProvider *dpp, log_type def, optional_yield y) noexcept;
-
-  bs::error_code watch() noexcept;
-
-  auto lowest_nomempty(const entries_t& es) {
-    return std::find_if(es.begin(), es.end(),
-                       [](const auto& e) {
-                         return !e.second.pruned;
-                       });
-  }
-
-public:
-
-  /// For the use of watch/notify.
-
-  void handle_notify(uint64_t notify_id,
-                    uint64_t cookie,
-                    uint64_t notifier_id,
-                    bufferlist& bl) override final;
-
-  void handle_error(uint64_t cookie, int err) override final;
-
-  /// Public interface
-
-  virtual ~logback_generations();
-
-  template<typename T, typename... Args>
-  static tl::expected<std::unique_ptr<T>, bs::error_code>
-  init(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx_, std::string oid_,
-       fu2::unique_function<std::string(uint64_t, int) const>&& get_oid_,
-       int shards_, log_type def, optional_yield y,
-       Args&& ...args) noexcept {
-    try {
-      T* lgp = new T(ioctx_, std::move(oid_),
-                    std::move(get_oid_),
-                    shards_, std::forward<Args>(args)...);
-      std::unique_ptr<T> lg(lgp);
-      lgp = nullptr;
-      auto ec = lg->setup(dpp, def, y);
-      if (ec)
-       return tl::unexpected(ec);
-      // Obnoxiousness for C++ Compiler in Bionic Beaver
-      return tl::expected<std::unique_ptr<T>, bs::error_code>(std::move(lg));
-    } catch (const std::bad_alloc&) {
-      return tl::unexpected(bs::error_code(ENOMEM, bs::system_category()));
-    }
-  }
-
-  bs::error_code update(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
-
-  entries_t entries() const {
-    return entries_;
-  }
-
-  bs::error_code new_backing(const DoutPrefixProvider *dpp, log_type type, optional_yield y) noexcept;
-
-  bs::error_code empty_to(const DoutPrefixProvider *dpp, uint64_t gen_id, optional_yield y) noexcept;
-
-  bs::error_code remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
-
-  // Callbacks, to be defined by descendant.
-
-  /// Handle initialization on startup
-  ///
-  /// @param e All non-empty generations
-  virtual bs::error_code handle_init(entries_t e) noexcept = 0;
-
-  /// Handle new generations.
-  ///
-  /// @param e Map of generations added since last update
-  virtual bs::error_code handle_new_gens(entries_t e) noexcept = 0;
-
-  /// Handle generations being marked empty
-  ///
-  /// @param new_tail Lowest non-empty generation
-  virtual bs::error_code handle_empty_to(uint64_t new_tail) noexcept = 0;
-};
-
-inline std::string gencursor(uint64_t gen_id, std::string_view cursor) {
-  return (gen_id > 0 ?
-         fmt::format("G{:0>20}@{}", gen_id, cursor) :
-         std::string(cursor));
-}
-
-inline std::pair<uint64_t, std::string_view>
-cursorgen(std::string_view cursor_) {
-  if (cursor_.empty()) {
-    return { 0, "" };
-  }
-  std::string_view cursor = cursor_;
-  if (cursor[0] != 'G') {
-    return { 0, cursor };
-  }
-  cursor.remove_prefix(1);
-  auto gen_id = ceph::consume<uint64_t>(cursor);
-  if (!gen_id || cursor[0] != '@') {
-    return { 0, cursor_ };
-  }
-  cursor.remove_prefix(1);
-  return { *gen_id, cursor };
-}
-
-class LazyFIFO {
-  librados::IoCtx& ioctx;
-  std::string oid;
-  std::mutex m;
-  std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
-
-  int lazy_init(const DoutPrefixProvider *dpp, optional_yield y) {
-    std::unique_lock l(m);
-    if (fifo) return 0;
-    auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid, &fifo, y);
-    if (r) {
-      fifo.reset();
-    }
-    return r;
-  }
-
-public:
-
-  LazyFIFO(librados::IoCtx& ioctx, std::string oid)
-    : ioctx(ioctx), oid(std::move(oid)) {}
-
-  int read_meta(const DoutPrefixProvider *dpp, optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    return fifo->read_meta(dpp, y);
-  }
-
-  int meta(const DoutPrefixProvider *dpp, rados::cls::fifo::info& info, optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    info = fifo->meta();
-    return 0;
-  }
-
-  int get_part_layout_info(const DoutPrefixProvider *dpp, 
-                           std::uint32_t& part_header_size,
-                          std::uint32_t& part_entry_overhead,
-                          optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    std::tie(part_header_size, part_entry_overhead)
-      = fifo->get_part_layout_info();
-    return 0;
-  }
-
-  int push(const DoutPrefixProvider *dpp, 
-           const ceph::buffer::list& bl,
-          optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    return fifo->push(dpp, bl, y);
-  }
-
-  int push(const DoutPrefixProvider *dpp, 
-           ceph::buffer::list& bl,
-          librados::AioCompletion* c,
-          optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    fifo->push(dpp, bl, c);
-    return 0;
-  }
-
-  int push(const DoutPrefixProvider *dpp, 
-           const std::vector<ceph::buffer::list>& data_bufs,
-          optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    return fifo->push(dpp, data_bufs, y);
-  }
-
-  int push(const DoutPrefixProvider *dpp, 
-            const std::vector<ceph::buffer::list>& data_bufs,
-           librados::AioCompletion* c,
-           optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    fifo->push(dpp, data_bufs, c);
-    return 0;
-  }
-
-  int list(const DoutPrefixProvider *dpp, 
-           int max_entries, std::optional<std::string_view> markstr,
-          std::vector<rgw::cls::fifo::list_entry>* out,
-          bool* more, optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    return fifo->list(dpp, max_entries, markstr, out, more, y);
-  }
-
-  int list(const DoutPrefixProvider *dpp, int max_entries, std::optional<std::string_view> markstr,
-          std::vector<rgw::cls::fifo::list_entry>* out, bool* more,
-          librados::AioCompletion* c, optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    fifo->list(dpp, max_entries, markstr, out, more, c);
-    return 0;
-  }
-
-  int trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    return fifo->trim(dpp, markstr, exclusive, y);
-  }
-
-  int trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, librados::AioCompletion* c,
-          optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    fifo->trim(dpp, markstr, exclusive, c);
-    return 0;
-  }
-
-  int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, rados::cls::fifo::part_header* header,
-                   optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    return fifo->get_part_info(dpp, part_num, header, y);
-  }
-
-  int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, rados::cls::fifo::part_header* header,
-                   librados::AioCompletion* c, optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    fifo->get_part_info(part_num, header, c);
-    return 0;
-  }
-
-  int get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function<
-                     void(int r, rados::cls::fifo::part_header&&)>&& f,
-                   librados::AioCompletion* c,
-                   optional_yield y) {
-    auto r = lazy_init(dpp, y);
-    if (r < 0) return r;
-    fifo->get_head_info(dpp, std::move(f), c);
-    return 0;
-  }
-};
-
-#endif
diff --git a/src/rgw/store/rados/rgw_metadata.cc b/src/rgw/store/rados/rgw_metadata.cc
deleted file mode 100644 (file)
index e3e4931..0000000
+++ /dev/null
@@ -1,233 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_metadata.h"
-
-#include "rgw_zone.h"
-#include "rgw_mdlog.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_cls.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-const std::string RGWMetadataLogHistory::oid = "meta.history";
-
-struct obj_version;
-
-void rgw_shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
-{
-  uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
-  char buf[16];
-  if (shard_id) {
-    *shard_id = val % max_shards;
-  }
-  snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
-  name = prefix + buf;
-}
-
-void rgw_shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
-{
-  uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
-  val ^= ceph_str_hash_linux(section.c_str(), section.size());
-  char buf[16];
-  snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
-  name = prefix + buf;
-}
-
-void rgw_shard_name(const string& prefix, unsigned shard_id, string& name)
-{
-  char buf[16];
-  snprintf(buf, sizeof(buf), "%u", shard_id);
-  name = prefix + buf;
-}
-
-int RGWMetadataLog::add_entry(const DoutPrefixProvider *dpp, const string& hash_key, const string& section, const string& key, bufferlist& bl) {
-  if (!svc.zone->need_to_log_metadata())
-    return 0;
-
-  string oid;
-  int shard_id;
-
-  rgw_shard_name(prefix, cct->_conf->rgw_md_log_max_shards, hash_key, oid, &shard_id);
-  mark_modified(shard_id);
-  real_time now = real_clock::now();
-  return svc.cls->timelog.add(dpp, oid, now, section, key, bl, null_yield);
-}
-
-int RGWMetadataLog::get_shard_id(const string& hash_key, int *shard_id)
-{
-  string oid;
-
-  rgw_shard_name(prefix, cct->_conf->rgw_md_log_max_shards, hash_key, oid, shard_id);
-  return 0;
-}
-
-int RGWMetadataLog::store_entries_in_shard(const DoutPrefixProvider *dpp, list<cls_log_entry>& entries, int shard_id, librados::AioCompletion *completion)
-{
-  string oid;
-
-  mark_modified(shard_id);
-  rgw_shard_name(prefix, shard_id, oid);
-  return svc.cls->timelog.add(dpp, oid, entries, completion, false, null_yield);
-}
-
-void RGWMetadataLog::init_list_entries(int shard_id, const real_time& from_time, const real_time& end_time, 
-                                       const string& marker, void **handle)
-{
-  LogListCtx *ctx = new LogListCtx();
-
-  ctx->cur_shard = shard_id;
-  ctx->from_time = from_time;
-  ctx->end_time  = end_time;
-  ctx->marker    = marker;
-
-  get_shard_oid(ctx->cur_shard, ctx->cur_oid);
-
-  *handle = (void *)ctx;
-}
-
-void RGWMetadataLog::complete_list_entries(void *handle) {
-  LogListCtx *ctx = static_cast<LogListCtx *>(handle);
-  delete ctx;
-}
-
-int RGWMetadataLog::list_entries(const DoutPrefixProvider *dpp, void *handle,
-                                int max_entries,
-                                list<cls_log_entry>& entries,
-                                string *last_marker,
-                                bool *truncated) {
-  LogListCtx *ctx = static_cast<LogListCtx *>(handle);
-
-  if (!max_entries) {
-    *truncated = false;
-    return 0;
-  }
-
-  std::string next_marker;
-  int ret = svc.cls->timelog.list(dpp, ctx->cur_oid, ctx->from_time, ctx->end_time,
-                                  max_entries, entries, ctx->marker,
-                                  &next_marker, truncated, null_yield);
-  if ((ret < 0) && (ret != -ENOENT))
-    return ret;
-
-  ctx->marker = std::move(next_marker);
-  if (last_marker) {
-    *last_marker = ctx->marker;
-  }
-
-  if (ret == -ENOENT)
-    *truncated = false;
-
-  return 0;
-}
-
-int RGWMetadataLog::get_info(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfo *info)
-{
-  string oid;
-  get_shard_oid(shard_id, oid);
-
-  cls_log_header header;
-
-  int ret = svc.cls->timelog.info(dpp, oid, &header, null_yield);
-  if ((ret < 0) && (ret != -ENOENT))
-    return ret;
-
-  info->marker = header.max_marker;
-  info->last_update = header.max_time.to_real_time();
-
-  return 0;
-}
-
-static void _mdlog_info_completion(librados::completion_t cb, void *arg)
-{
-  auto infoc = static_cast<RGWMetadataLogInfoCompletion *>(arg);
-  infoc->finish(cb);
-  infoc->put(); // drop the ref from get_info_async()
-}
-
-RGWMetadataLogInfoCompletion::RGWMetadataLogInfoCompletion(info_callback_t cb)
-  : completion(librados::Rados::aio_create_completion((void *)this,
-                                                      _mdlog_info_completion)),
-    callback(cb)
-{
-}
-
-RGWMetadataLogInfoCompletion::~RGWMetadataLogInfoCompletion()
-{
-  completion->release();
-}
-
-int RGWMetadataLog::get_info_async(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfoCompletion *completion)
-{
-  string oid;
-  get_shard_oid(shard_id, oid);
-
-  completion->get(); // hold a ref until the completion fires
-
-  return svc.cls->timelog.info_async(dpp, completion->get_io_obj(), oid,
-                                     &completion->get_header(),
-                                     completion->get_completion());
-}
-
-int RGWMetadataLog::trim(const DoutPrefixProvider *dpp, int shard_id, const real_time& from_time, const real_time& end_time,
-                         const string& start_marker, const string& end_marker)
-{
-  string oid;
-  get_shard_oid(shard_id, oid);
-
-  return svc.cls->timelog.trim(dpp, oid, from_time, end_time, start_marker,
-                               end_marker, nullptr, null_yield);
-}
-  
-int RGWMetadataLog::lock_exclusive(const DoutPrefixProvider *dpp, int shard_id, timespan duration, string& zone_id, string& owner_id) {
-  string oid;
-  get_shard_oid(shard_id, oid);
-
-  return svc.cls->lock.lock_exclusive(dpp, svc.zone->get_zone_params().log_pool, oid, duration, zone_id, owner_id);
-}
-
-int RGWMetadataLog::unlock(const DoutPrefixProvider *dpp, int shard_id, string& zone_id, string& owner_id) {
-  string oid;
-  get_shard_oid(shard_id, oid);
-
-  return svc.cls->lock.unlock(dpp, svc.zone->get_zone_params().log_pool, oid, zone_id, owner_id);
-}
-
-void RGWMetadataLog::mark_modified(int shard_id)
-{
-  lock.get_read();
-  if (modified_shards.find(shard_id) != modified_shards.end()) {
-    lock.unlock();
-    return;
-  }
-  lock.unlock();
-
-  std::unique_lock wl{lock};
-  modified_shards.insert(shard_id);
-}
-
-void RGWMetadataLog::read_clear_modified(set<int> &modified)
-{
-  std::unique_lock wl{lock};
-  modified.swap(modified_shards);
-  modified_shards.clear();
-}
-
-void RGWMetadataLogInfo::dump(Formatter *f) const
-{
-  encode_json("marker", marker, f);
-  utime_t ut(last_update);
-  encode_json("last_update", ut, f);
-}
-
-void RGWMetadataLogInfo::decode_json(JSONObj *obj)
-{
-  JSONDecoder::decode_json("marker", marker, obj);
-  utime_t ut;
-  JSONDecoder::decode_json("last_update", ut, obj);
-  last_update = ut.to_real_time();
-}
-
diff --git a/src/rgw/store/rados/rgw_metadata.h b/src/rgw/store/rados/rgw_metadata.h
deleted file mode 100644 (file)
index 7228370..0000000
+++ /dev/null
@@ -1,300 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_METADATA_H
-#define CEPH_RGW_METADATA_H
-
-#include <string>
-#include <utility>
-#include <boost/optional.hpp>
-
-#include "include/types.h"
-#include "rgw_common.h"
-#include "rgw_period_history.h"
-#include "rgw_mdlog_types.h"
-#include "cls/version/cls_version_types.h"
-#include "cls/log/cls_log_types.h"
-#include "common/RefCountedObj.h"
-#include "common/ceph_time.h"
-#include "services/svc_meta_be.h"
-#include "rgw_sal_fwd.h"
-
-
-class RGWCoroutine;
-class JSONObj;
-struct RGWObjVersionTracker;
-
-struct obj_version;
-
-
-class RGWMetadataObject {
-protected:
-  obj_version objv;
-  ceph::real_time mtime;
-  std::map<std::string, bufferlist> *pattrs{nullptr};
-  
-public:
-  RGWMetadataObject() {}
-  RGWMetadataObject(const obj_version& v,
-                   real_time m) : objv(v), mtime(m) {}
-  virtual ~RGWMetadataObject() {}
-  obj_version& get_version();
-  real_time& get_mtime() { return mtime; }
-  void set_pattrs(std::map<std::string, bufferlist> *_pattrs) {
-    pattrs = _pattrs;
-  }
-  std::map<std::string, bufferlist> *get_pattrs() {
-    return pattrs;
-  }
-
-  virtual void dump(Formatter *f) const {}
-};
-
-class RGWMetadataManager;
-
-class RGWMetadataHandler {
-  friend class RGWMetadataManager;
-
-protected:
-  CephContext *cct;
-
-public:
-  RGWMetadataHandler() {}
-  virtual ~RGWMetadataHandler();
-  virtual std::string get_type() = 0;
-
-  void base_init(CephContext *_cct) {
-    cct = _cct;
-  }
-
-  virtual RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) = 0;
-
-  virtual int get(std::string& entry, RGWMetadataObject **obj, optional_yield, const DoutPrefixProvider *dpp) = 0;
-  virtual int put(std::string& entry,
-                  RGWMetadataObject *obj,
-                  RGWObjVersionTracker& objv_tracker,
-                  optional_yield, 
-                  const DoutPrefixProvider *dpp,
-                  RGWMDLogSyncType type,
-                  bool from_remote_zone) = 0;
-  virtual int remove(std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp) = 0;
-
-  virtual int mutate(const std::string& entry,
-                    const ceph::real_time& mtime,
-                    RGWObjVersionTracker *objv_tracker,
-                     optional_yield y,
-                     const DoutPrefixProvider *dpp,
-                    RGWMDLogStatus op_type,
-                    std::function<int()> f) = 0;
-
-  virtual int list_keys_init(const DoutPrefixProvider *dpp, const std::string& marker, void **phandle) = 0;
-  virtual int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated) = 0;
-  virtual void list_keys_complete(void *handle) = 0;
-
-  virtual std::string get_marker(void *handle) = 0;
-
-  virtual int get_shard_id(const std::string& entry, int *shard_id) {
-    *shard_id = 0;
-    return 0;
-  }
-  virtual int attach(RGWMetadataManager *manager);
-};
-
-class RGWMetadataHandler_GenericMetaBE : public RGWMetadataHandler {
-  friend class RGWSI_MetaBackend;
-  friend class RGWMetadataManager;
-  friend class Put;
-
-public:
-  class Put;
-
-protected:
-  RGWSI_MetaBackend_Handler *be_handler;
-
-  virtual int do_get(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) = 0;
-  virtual int do_put(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWMetadataObject *obj,
-                     RGWObjVersionTracker& objv_tracker, optional_yield y,
-                     const DoutPrefixProvider *dpp, RGWMDLogSyncType type, 
-                     bool from_remote_zone) = 0;
-  virtual int do_put_operate(Put *put_op, const DoutPrefixProvider *dpp);
-  virtual int do_remove(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) = 0;
-
-public:
-  RGWMetadataHandler_GenericMetaBE() {}
-
-  void base_init(CephContext *_cct,
-            RGWSI_MetaBackend_Handler *_be_handler) {
-    RGWMetadataHandler::base_init(_cct);
-    be_handler = _be_handler;
-  }
-
-  RGWSI_MetaBackend_Handler *get_be_handler() {
-    return be_handler;
-  }
-
-  class Put {
-  protected:
-    RGWMetadataHandler_GenericMetaBE *handler;
-    RGWSI_MetaBackend_Handler::Op *op;
-    std::string& entry;
-    RGWMetadataObject *obj;
-    RGWObjVersionTracker& objv_tracker;
-    RGWMDLogSyncType apply_type;
-    optional_yield y;
-    bool from_remote_zone{false};
-
-    int get(RGWMetadataObject **obj, const DoutPrefixProvider *dpp) {
-      return handler->do_get(op, entry, obj, y, dpp);
-    }
-  public:
-    Put(RGWMetadataHandler_GenericMetaBE *_handler, RGWSI_MetaBackend_Handler::Op *_op,
-        std::string& _entry, RGWMetadataObject *_obj,
-        RGWObjVersionTracker& _objv_tracker, optional_yield _y,
-        RGWMDLogSyncType _type, bool from_remote_zone);
-
-    virtual ~Put() {}
-
-    virtual int put_pre(const DoutPrefixProvider *dpp) {
-      return 0;
-    }
-    virtual int put(const DoutPrefixProvider *dpp) {
-      return 0;
-    }
-    virtual int put_post(const DoutPrefixProvider *dpp) {
-      return 0;
-    }
-    virtual int finalize() {
-      return 0;
-    }
-  };
-
-  int get(std::string& entry, RGWMetadataObject **obj, optional_yield, const DoutPrefixProvider *dpp) override;
-  int put(std::string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp, RGWMDLogSyncType type, bool from_remote_zone) override;
-  int remove(std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp) override;
-
-  int mutate(const std::string& entry,
-            const ceph::real_time& mtime,
-            RGWObjVersionTracker *objv_tracker,
-             optional_yield y,
-             const DoutPrefixProvider *dpp,
-            RGWMDLogStatus op_type,
-            std::function<int()> f) override;
-
-  int get_shard_id(const std::string& entry, int *shard_id) override;
-
-  int list_keys_init(const DoutPrefixProvider *dpp, const std::string& marker, void **phandle) override;
-  int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated) override;
-  void list_keys_complete(void *handle) override;
-
-  std::string get_marker(void *handle) override;
-
-  /**
-   * Compare an incoming versus on-disk tag/version+mtime combo against
-   * the sync mode to see if the new one should replace the on-disk one.
-   *
-   * @return true if the update should proceed, false otherwise.
-   */
-  static bool check_versions(bool exists,
-                             const obj_version& ondisk, const real_time& ondisk_time,
-                             const obj_version& incoming, const real_time& incoming_time,
-                             RGWMDLogSyncType sync_mode) {
-    switch (sync_mode) {
-    case APPLY_UPDATES:
-      if ((ondisk.tag != incoming.tag) ||
-         (ondisk.ver >= incoming.ver))
-       return false;
-      break;
-    case APPLY_NEWER:
-      if (ondisk_time >= incoming_time)
-       return false;
-      break;
-    case APPLY_EXCLUSIVE:
-      if (exists)
-        return false;
-      break;
-    case APPLY_ALWAYS: //deliberate fall-thru -- we always apply!
-    default: break;
-    }
-    return true;
-  }
-};
-
-class RGWMetadataTopHandler;
-
-class RGWMetadataManager {
-  friend class RGWMetadataHandler;
-
-  CephContext *cct;
-  RGWSI_Meta *meta_svc;
-  std::map<std::string, RGWMetadataHandler *> handlers;
-  std::unique_ptr<RGWMetadataTopHandler> md_top_handler;
-
-  int find_handler(const std::string& metadata_key, RGWMetadataHandler **handler, std::string& entry);
-  int register_handler(RGWMetadataHandler *handler);
-
-public:
-  RGWMetadataManager(RGWSI_Meta *_meta_svc);
-  ~RGWMetadataManager();
-
-  RGWMetadataHandler *get_handler(const std::string& type);
-
-  int get(std::string& metadata_key, Formatter *f, optional_yield y, const DoutPrefixProvider *dpp);
-  int put(std::string& metadata_key, bufferlist& bl, optional_yield y,
-          const DoutPrefixProvider *dpp,
-          RGWMDLogSyncType sync_mode,
-          bool from_remote_zone,
-          obj_version *existing_version = NULL);
-  int remove(std::string& metadata_key, optional_yield y, const DoutPrefixProvider *dpp);
-
-  int mutate(const std::string& metadata_key,
-            const ceph::real_time& mtime,
-            RGWObjVersionTracker *objv_tracker,
-             optional_yield y,
-             const DoutPrefixProvider *dpp,
-            RGWMDLogStatus op_type,
-            std::function<int()> f);
-
-  int list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, void **phandle);
-  int list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void **phandle);
-  int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated);
-  void list_keys_complete(void *handle);
-
-  std::string get_marker(void *handle);
-
-  void dump_log_entry(cls_log_entry& entry, Formatter *f);
-
-  void get_sections(std::list<std::string>& sections);
-
-  void parse_metadata_key(const std::string& metadata_key, std::string& type, std::string& entry);
-
-  int get_shard_id(const std::string& section, const std::string& key, int *shard_id);
-};
-
-class RGWMetadataHandlerPut_SObj : public RGWMetadataHandler_GenericMetaBE::Put
-{
-protected:
-  std::unique_ptr<RGWMetadataObject> oo;
-  RGWMetadataObject *old_obj{nullptr};
-  bool exists{false};
-
-public:
-  RGWMetadataHandlerPut_SObj(RGWMetadataHandler_GenericMetaBE *handler, RGWSI_MetaBackend_Handler::Op *op,
-                             std::string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
-                            optional_yield y,
-                             RGWMDLogSyncType type, bool from_remote_zone);
-  ~RGWMetadataHandlerPut_SObj();
-
-  int put_pre(const DoutPrefixProvider *dpp) override;
-  int put(const DoutPrefixProvider *dpp) override;
-  virtual int put_check(const DoutPrefixProvider *dpp) {
-    return 0;
-  }
-  virtual int put_checked(const DoutPrefixProvider *dpp);
-  virtual void encode_obj(bufferlist *bl) {}
-};
-
-void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& key, std::string& name, int *shard_id);
-void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name);
-void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name);
-
-#endif
diff --git a/src/rgw/store/rados/rgw_notify.cc b/src/rgw/store/rados/rgw_notify.cc
deleted file mode 100644 (file)
index 253a3bc..0000000
+++ /dev/null
@@ -1,1009 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-
-#include "rgw_notify.h"
-#include "cls/2pc_queue/cls_2pc_queue_client.h"
-#include "cls/lock/cls_lock_client.h"
-#include <memory>
-#include <boost/algorithm/hex.hpp>
-#include <boost/context/protected_fixedsize_stack.hpp>
-#include <spawn/spawn.hpp>
-#include "rgw_sal_rados.h"
-#include "rgw_pubsub.h"
-#include "rgw_pubsub_push.h"
-#include "rgw_perf_counters.h"
-#include "common/dout.h"
-#include <chrono>
-
-#define dout_subsys ceph_subsys_rgw
-
-namespace rgw::notify {
-
-struct event_entry_t {
-  rgw_pubsub_s3_event event;
-  std::string push_endpoint;
-  std::string push_endpoint_args;
-  std::string arn_topic;
-  
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(event, bl);
-    encode(push_endpoint, bl);
-    encode(push_endpoint_args, bl);
-    encode(arn_topic, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(event, bl);
-    decode(push_endpoint, bl);
-    decode(push_endpoint_args, bl);
-    decode(arn_topic, bl);
-    DECODE_FINISH(bl);
-  }
-};
-WRITE_CLASS_ENCODER(event_entry_t)
-
-using queues_t = std::set<std::string>;
-
-// use mmap/mprotect to allocate 128k coroutine stacks
-auto make_stack_allocator() {
-  return boost::context::protected_fixedsize_stack{128*1024};
-}
-
-class Manager : public DoutPrefixProvider {
-  const size_t max_queue_size;
-  const uint32_t queues_update_period_ms;
-  const uint32_t queues_update_retry_ms;
-  const uint32_t queue_idle_sleep_us;
-  const utime_t failover_time;
-  CephContext* const cct;
-  librados::IoCtx& rados_ioctx;
-  static constexpr auto COOKIE_LEN = 16;
-  const std::string lock_cookie;
-  boost::asio::io_context io_context;
-  boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work_guard;
-  const uint32_t worker_count;
-  std::vector<std::thread> workers;
-  const uint32_t stale_reservations_period_s;
-  const uint32_t reservations_cleanup_period_s;
-  const std::string Q_LIST_OBJECT_NAME = "queues_list_object";
-
-  CephContext *get_cct() const override { return cct; }
-  unsigned get_subsys() const override { return dout_subsys; }
-  std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw notify: "; }
-
-  // read the list of queues from the queue list object
-  int read_queue_list(queues_t& queues, optional_yield y) {
-    constexpr auto max_chunk = 1024U;
-    std::string start_after;
-    bool more = true;
-    int rval;
-    while (more) {
-      librados::ObjectReadOperation op;
-      queues_t queues_chunk;
-      op.omap_get_keys2(start_after, max_chunk, &queues_chunk, &more, &rval);
-      const auto ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, nullptr, y);
-      if (ret == -ENOENT) {
-        // queue list object was not created - nothing to do
-        return 0;
-      }
-      if (ret < 0) {
-        // TODO: do we need to check on rval as well as ret?
-        ldpp_dout(this, 1) << "ERROR: failed to read queue list. error: " << ret << dendl;
-        return ret;
-      }
-      queues.merge(queues_chunk);
-    }
-    return 0;
-  }
-
-  // set m1 to be the minimum between m1 and m2
-  static int set_min_marker(std::string& m1, const std::string m2) {
-    cls_queue_marker mr1;
-    cls_queue_marker mr2;
-    if (mr1.from_str(m1.c_str()) < 0 || mr2.from_str(m2.c_str()) < 0) {
-      return -EINVAL;
-    }
-    if (mr2.gen <= mr1.gen && mr2.offset < mr1.offset) {
-      m1 = m2;
-    }
-    return 0;
-  }
-
-  using Clock = ceph::coarse_mono_clock;
-  using Executor = boost::asio::io_context::executor_type;
-  using Timer = boost::asio::basic_waitable_timer<Clock,
-        boost::asio::wait_traits<Clock>, Executor>;
-
-  class tokens_waiter {
-    const std::chrono::hours infinite_duration;
-    size_t pending_tokens;
-    Timer timer;
-    struct token {
-      tokens_waiter& waiter;
-      token(tokens_waiter& _waiter) : waiter(_waiter) {
-        ++waiter.pending_tokens;
-      }
-      
-      ~token() {
-        --waiter.pending_tokens;
-        if (waiter.pending_tokens == 0) {
-          waiter.timer.cancel();
-        }   
-      }   
-    };
-  
-  public:
-
-    tokens_waiter(boost::asio::io_context& io_context) :
-      infinite_duration(1000),
-      pending_tokens(0),
-      timer(io_context) {}  
-    void async_wait(yield_context yield) {
-      if (pending_tokens == 0) {
-        return;
-      }
-      timer.expires_from_now(infinite_duration);
-      boost::system::error_code ec; 
-      timer.async_wait(yield[ec]);
-      ceph_assert(ec == boost::system::errc::operation_canceled);
-    }   
-    token make_token() {    
-      return token(*this);
-    }   
-  };
-
-  // processing of a specific entry
-  // return whether processing was successfull (true) or not (false)
-  bool process_entry(const cls_queue_entry& entry, yield_context yield) {
-    event_entry_t event_entry;
-    auto iter = entry.data.cbegin();
-    try {
-      decode(event_entry, iter);
-    } catch (buffer::error& err) {
-      ldpp_dout(this, 5) << "WARNING: failed to decode entry. error: " << err.what() << dendl;
-      return false;
-    }
-    try {
-      // TODO move endpoint creation to queue level
-      const auto push_endpoint = RGWPubSubEndpoint::create(event_entry.push_endpoint, event_entry.arn_topic,
-          RGWHTTPArgs(event_entry.push_endpoint_args, this), 
-          cct);
-      ldpp_dout(this, 20) << "INFO: push endpoint created: " << event_entry.push_endpoint <<
-        " for entry: " << entry.marker << dendl;
-      const auto ret = push_endpoint->send_to_completion_async(cct, event_entry.event, optional_yield(io_context, yield));
-      if (ret < 0) {
-        ldpp_dout(this, 5) << "WARNING: push entry: " << entry.marker << " to endpoint: " << event_entry.push_endpoint 
-          << " failed. error: " << ret << " (will retry)" << dendl;
-        return false;
-      } else {
-        ldpp_dout(this, 20) << "INFO: push entry: " << entry.marker << " to endpoint: " << event_entry.push_endpoint 
-          << " ok" <<  dendl;
-        if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok);
-        return true;
-      }
-    } catch (const RGWPubSubEndpoint::configuration_error& e) {
-      ldpp_dout(this, 5) << "WARNING: failed to create push endpoint: " 
-          << event_entry.push_endpoint << " for entry: " << entry.marker << ". error: " << e.what() << " (will retry) " << dendl;
-      return false;
-    }
-  }
-
-  // clean stale reservation from queue
-  void cleanup_queue(const std::string& queue_name, yield_context yield) {
-    while (true) {
-      ldpp_dout(this, 20) << "INFO: trying to perform stale reservation cleanup for queue: " << queue_name << dendl;
-      const auto now = ceph::coarse_real_time::clock::now();
-      const auto stale_time = now - std::chrono::seconds(stale_reservations_period_s);
-      librados::ObjectWriteOperation op;
-      op.assert_exists();
-      rados::cls::lock::assert_locked(&op, queue_name+"_lock", 
-        ClsLockType::EXCLUSIVE,
-        lock_cookie, 
-        "" /*no tag*/);
-      cls_2pc_queue_expire_reservations(op, stale_time);
-      // check ownership and do reservation cleanup in one batch
-      auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield));
-      if (ret == -ENOENT) {
-        // queue was deleted
-        ldpp_dout(this, 5) << "INFO: queue: " 
-          << queue_name << ". was removed. cleanup will stop" << dendl;
-        return;
-      }
-      if (ret == -EBUSY) {
-        ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
-        return;
-      }
-      if (ret < 0) {
-        ldpp_dout(this, 5) << "WARNING: failed to cleanup stale reservation from queue and/or lock queue: " << queue_name
-          << ". error: " << ret << dendl;
-      }
-      Timer timer(io_context);
-      timer.expires_from_now(std::chrono::seconds(reservations_cleanup_period_s));
-      boost::system::error_code ec;
-           timer.async_wait(yield[ec]);
-    }
-  }
-
-  // processing of a specific queue
-  void process_queue(const std::string& queue_name, yield_context yield) {
-    constexpr auto max_elements = 1024;
-    auto is_idle = false;
-    const std::string start_marker;
-
-    // start a the cleanup coroutine for the queue
-    spawn::spawn(io_context, [this, queue_name](yield_context yield) {
-            cleanup_queue(queue_name, yield);
-            }, make_stack_allocator());
-    
-    while (true) {
-      // if queue was empty the last time, sleep for idle timeout
-      if (is_idle) {
-        Timer timer(io_context);
-        timer.expires_from_now(std::chrono::microseconds(queue_idle_sleep_us));
-        boost::system::error_code ec;
-             timer.async_wait(yield[ec]);
-      }
-
-      // get list of entries in the queue
-      is_idle = true;
-      bool truncated = false;
-      std::string end_marker;
-      std::vector<cls_queue_entry> entries;
-      auto total_entries = 0U;
-      {
-        librados::ObjectReadOperation op;
-        op.assert_exists();
-        bufferlist obl;
-        int rval;
-        rados::cls::lock::assert_locked(&op, queue_name+"_lock", 
-          ClsLockType::EXCLUSIVE,
-          lock_cookie, 
-          "" /*no tag*/);
-        cls_2pc_queue_list_entries(op, start_marker, max_elements, &obl, &rval);
-        // check ownership and list entries in one batch
-        auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, nullptr, optional_yield(io_context, yield));
-        if (ret == -ENOENT) {
-          // queue was deleted
-          ldpp_dout(this, 5) << "INFO: queue: " 
-            << queue_name << ". was removed. processing will stop" << dendl;
-          return;
-        }
-        if (ret == -EBUSY) {
-          ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
-          return;
-        }
-        if (ret < 0) {
-          ldpp_dout(this, 5) << "WARNING: failed to get list of entries in queue and/or lock queue: " 
-            << queue_name << ". error: " << ret << " (will retry)" << dendl;
-          continue;
-        }
-        ret = cls_2pc_queue_list_entries_result(obl, entries, &truncated, end_marker);
-        if (ret < 0) {
-          ldpp_dout(this, 5) << "WARNING: failed to parse list of entries in queue: " 
-            << queue_name << ". error: " << ret << " (will retry)" << dendl;
-          continue;
-        }
-      }
-      total_entries = entries.size();
-      if (total_entries == 0) {
-        // nothing in the queue
-        continue;
-      }
-      // log when queue is not idle
-      ldpp_dout(this, 20) << "INFO: found: " << total_entries << " entries in: " << queue_name <<
-        ". end marker is: " << end_marker << dendl;
-      
-      is_idle = false;
-      auto has_error = false;
-      auto remove_entries = false;
-      auto entry_idx = 1U;
-      tokens_waiter waiter(io_context);
-      for (auto& entry : entries) {
-        if (has_error) {
-          // bail out on first error
-          break;
-        }
-        // TODO pass entry pointer instead of by-value
-        spawn::spawn(yield, [this, &queue_name, entry_idx, total_entries, &end_marker, &remove_entries, &has_error, &waiter, entry](yield_context yield) {
-            const auto token = waiter.make_token();
-            if (process_entry(entry, yield)) {
-              ldpp_dout(this, 20) << "INFO: processing of entry: " << 
-                entry.marker << " (" << entry_idx << "/" << total_entries << ") from: " << queue_name << " ok" << dendl;
-              remove_entries = true;
-            }  else {
-              if (set_min_marker(end_marker, entry.marker) < 0) {
-                ldpp_dout(this, 1) << "ERROR: cannot determin minimum between malformed markers: " << end_marker << ", " << entry.marker << dendl;
-              } else {
-                ldpp_dout(this, 20) << "INFO: new end marker for removal: " << end_marker << " from: " << queue_name << dendl;
-              }
-              has_error = true;
-              ldpp_dout(this, 20) << "INFO: processing of entry: " << 
-                entry.marker << " (" << entry_idx << "/" << total_entries << ") from: " << queue_name << " failed" << dendl;
-            } 
-        }, make_stack_allocator());
-        ++entry_idx;
-      }
-
-      // wait for all pending work to finish
-      waiter.async_wait(yield);
-
-      // delete all published entries from queue
-      if (remove_entries) {
-        librados::ObjectWriteOperation op;
-        op.assert_exists();
-        rados::cls::lock::assert_locked(&op, queue_name+"_lock", 
-          ClsLockType::EXCLUSIVE,
-          lock_cookie, 
-          "" /*no tag*/);
-        cls_2pc_queue_remove_entries(op, end_marker); 
-        // check ownership and deleted entries in one batch
-        const auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield)); 
-        if (ret == -ENOENT) {
-          // queue was deleted
-          ldpp_dout(this, 5) << "INFO: queue: " 
-            << queue_name << ". was removed. processing will stop" << dendl;
-          return;
-        }
-        if (ret == -EBUSY) {
-          ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
-          return;
-        }
-        if (ret < 0) {
-          ldpp_dout(this, 1) << "ERROR: failed to remove entries and/or lock queue up to: " << end_marker <<  " from queue: " 
-            << queue_name << ". error: " << ret << dendl;
-        } else {
-          ldpp_dout(this, 20) << "INFO: removed entries up to: " << end_marker <<  " from queue: " 
-          << queue_name << dendl;
-        }
-      }
-    }
-  }
-
-  // lits of owned queues
-  using owned_queues_t = std::unordered_set<std::string>;
-
-  // process all queues
-  // find which of the queues is owned by this daemon and process it
-  void process_queues(yield_context yield) {
-    auto has_error = false;
-    owned_queues_t owned_queues;
-
-    // add randomness to the duration between queue checking
-    // to make sure that different daemons are not synced
-    std::random_device seed;
-    std::mt19937 rnd_gen(seed());
-    const auto min_jitter = 100; // ms
-    const auto max_jitter = 500; // ms
-    std::uniform_int_distribution<> duration_jitter(min_jitter, max_jitter);
-
-    std::vector<std::string> queue_gc;
-    std::mutex queue_gc_lock;
-    while (true) {
-      Timer timer(io_context);
-      const auto duration = (has_error ? 
-        std::chrono::milliseconds(queues_update_retry_ms) : std::chrono::milliseconds(queues_update_period_ms)) + 
-        std::chrono::milliseconds(duration_jitter(rnd_gen));
-      timer.expires_from_now(duration);
-      const auto tp = ceph::coarse_real_time::clock::to_time_t(ceph::coarse_real_time::clock::now() + duration);
-      ldpp_dout(this, 20) << "INFO: next queues processing will happen at: " << std::ctime(&tp)  << dendl;
-      boost::system::error_code ec;
-      timer.async_wait(yield[ec]);
-
-      queues_t queues;
-      auto ret = read_queue_list(queues, optional_yield(io_context, yield));
-      if (ret < 0) {
-        has_error = true;
-        continue;
-      }
-
-      for (const auto& queue_name : queues) {
-        // try to lock the queue to check if it is owned by this rgw
-        // or if ownershif needs to be taken
-        librados::ObjectWriteOperation op;
-        op.assert_exists();
-        rados::cls::lock::lock(&op, queue_name+"_lock", 
-              ClsLockType::EXCLUSIVE,
-              lock_cookie, 
-              "" /*no tag*/,
-              "" /*no description*/,
-              failover_time,
-              LOCK_FLAG_MAY_RENEW);
-
-        ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield));
-        if (ret == -EBUSY) {
-          // lock is already taken by another RGW
-          ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " owned (locked) by another daemon" << dendl;
-          // if queue was owned by this RGW, processing should be stopped, queue would be deleted from list afterwards
-          continue;
-        }
-        if (ret == -ENOENT) {
-          // queue is deleted - processing will stop the next time we try to read from the queue
-          ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " should not be locked - already deleted" << dendl;
-          continue;
-        }
-        if (ret < 0) {
-          // failed to lock for another reason, continue to process other queues
-          ldpp_dout(this, 1) << "ERROR: failed to lock queue: " << queue_name << ". error: " << ret << dendl;
-          has_error = true;
-          continue;
-        }
-        // add queue to list of owned queues
-        if (owned_queues.insert(queue_name).second) {
-          ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " now owned (locked) by this daemon" << dendl;
-          // start processing this queue
-          spawn::spawn(io_context, [this, &queue_gc, &queue_gc_lock, queue_name](yield_context yield) {
-            process_queue(queue_name, yield);
-            // if queue processing ended, it measn that the queue was removed or not owned anymore
-            // mark it for deletion
-            std::lock_guard lock_guard(queue_gc_lock);
-            queue_gc.push_back(queue_name);
-            ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " marked for removal" << dendl;
-          }, make_stack_allocator());
-        } else {
-          ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " ownership (lock) renewed" << dendl;
-        }
-      }
-      // erase all queue that were deleted
-      {
-        std::lock_guard lock_guard(queue_gc_lock);
-        std::for_each(queue_gc.begin(), queue_gc.end(), [this, &owned_queues](const std::string& queue_name) {
-          owned_queues.erase(queue_name);
-          ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " removed" << dendl;
-        });
-        queue_gc.clear();
-      }
-    }
-  }
-
-public:
-
-  ~Manager() {
-    work_guard.reset();
-    io_context.stop();
-    std::for_each(workers.begin(), workers.end(), [] (auto& worker) { worker.join(); });
-  }
-
-  // ctor: start all threads
-  Manager(CephContext* _cct, uint32_t _max_queue_size, uint32_t _queues_update_period_ms, 
-          uint32_t _queues_update_retry_ms, uint32_t _queue_idle_sleep_us, u_int32_t failover_time_ms, 
-          uint32_t _stale_reservations_period_s, uint32_t _reservations_cleanup_period_s,
-          uint32_t _worker_count, rgw::sal::RadosStore* store) :
-    max_queue_size(_max_queue_size),
-    queues_update_period_ms(_queues_update_period_ms),
-    queues_update_retry_ms(_queues_update_retry_ms),
-    queue_idle_sleep_us(_queue_idle_sleep_us),
-    failover_time(std::chrono::milliseconds(failover_time_ms)),
-    cct(_cct),
-    rados_ioctx(store->getRados()->get_notif_pool_ctx()),
-    lock_cookie(gen_rand_alphanumeric(cct, COOKIE_LEN)),
-    work_guard(boost::asio::make_work_guard(io_context)),
-    worker_count(_worker_count),
-    stale_reservations_period_s(_stale_reservations_period_s),
-    reservations_cleanup_period_s(_reservations_cleanup_period_s)
-    {
-      spawn::spawn(io_context, [this] (yield_context yield) {
-            process_queues(yield);
-          }, make_stack_allocator());
-
-      // start the worker threads to do the actual queue processing
-      const std::string WORKER_THREAD_NAME = "notif-worker";
-      for (auto worker_id = 0U; worker_id < worker_count; ++worker_id) {
-        workers.emplace_back([this]() {
-          try {
-            io_context.run(); 
-          } catch (const std::exception& err) {
-            ldpp_dout(this, 10) << "Notification worker failed with error: " << err.what() << dendl;
-            throw(err);
-          }
-        });
-        const auto rc = ceph_pthread_setname(workers.back().native_handle(), 
-          (WORKER_THREAD_NAME+std::to_string(worker_id)).c_str());
-        ceph_assert(rc == 0);
-      }
-      ldpp_dout(this, 10) << "Started notification manager with: " << worker_count << " workers" << dendl;
-    }
-
-  int add_persistent_topic(const std::string& topic_name, optional_yield y) {
-    if (topic_name == Q_LIST_OBJECT_NAME) {
-      ldpp_dout(this, 1) << "ERROR: topic name cannot be: " << Q_LIST_OBJECT_NAME << " (conflict with queue list object name)" << dendl;
-      return -EINVAL;
-    }
-    librados::ObjectWriteOperation op;
-    op.create(true);
-    cls_2pc_queue_init(op, topic_name, max_queue_size);
-    auto ret = rgw_rados_operate(this, rados_ioctx, topic_name, &op, y);
-    if (ret == -EEXIST) {
-      // queue already exists - nothing to do
-      ldpp_dout(this, 20) << "INFO: queue for topic: " << topic_name << " already exists. nothing to do" << dendl;
-      return 0;
-    }
-    if (ret < 0) {
-      // failed to create queue
-      ldpp_dout(this, 1) << "ERROR: failed to create queue for topic: " << topic_name << ". error: " << ret << dendl;
-      return ret;
-    }
-   
-    bufferlist empty_bl;
-    std::map<std::string, bufferlist> new_topic{{topic_name, empty_bl}};
-    op.omap_set(new_topic);
-    ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, y);
-    if (ret < 0) {
-      ldpp_dout(this, 1) << "ERROR: failed to add queue: " << topic_name << " to queue list. error: " << ret << dendl;
-      return ret;
-    } 
-    ldpp_dout(this, 20) << "INFO: queue: " << topic_name << " added to queue list"  << dendl;
-    return 0;
-  }
-  
-  int remove_persistent_topic(const std::string& topic_name, optional_yield y) {
-    librados::ObjectWriteOperation op;
-    op.remove();
-    auto ret = rgw_rados_operate(this, rados_ioctx, topic_name, &op, y);
-    if (ret == -ENOENT) {
-      // queue already removed - nothing to do
-      ldpp_dout(this, 20) << "INFO: queue for topic: " << topic_name << " already removed. nothing to do" << dendl;
-      return 0;
-    }
-    if (ret < 0) {
-      // failed to remove queue
-      ldpp_dout(this, 1) << "ERROR: failed to remove queue for topic: " << topic_name << ". error: " << ret << dendl;
-      return ret;
-    }
-  
-    std::set<std::string> topic_to_remove{{topic_name}};
-    op.omap_rm_keys(topic_to_remove);
-    ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, y);
-    if (ret < 0) {
-      ldpp_dout(this, 1) << "ERROR: failed to remove queue: " << topic_name << " from queue list. error: " << ret << dendl;
-      return ret;
-    } 
-    ldpp_dout(this, 20) << "INFO: queue: " << topic_name << " removed from queue list"  << dendl;
-    return 0;
-  }
-};
-
-// singleton manager
-// note that the manager itself is not a singleton, and multiple instances may co-exist
-// TODO make the pointer atomic in allocation and deallocation to avoid race conditions
-static Manager* s_manager = nullptr;
-
-constexpr size_t MAX_QUEUE_SIZE = 128*1000*1000; // 128MB
-constexpr uint32_t Q_LIST_UPDATE_MSEC = 1000*30;     // check queue list every 30seconds
-constexpr uint32_t Q_LIST_RETRY_MSEC = 1000;         // retry every second if queue list update failed
-constexpr uint32_t IDLE_TIMEOUT_USEC = 100*1000;     // idle sleep 100ms
-constexpr uint32_t FAILOVER_TIME_MSEC = 3*Q_LIST_UPDATE_MSEC; // FAILOVER TIME 3x renew time
-constexpr uint32_t WORKER_COUNT = 1;                 // 1 worker thread
-constexpr uint32_t STALE_RESERVATIONS_PERIOD_S = 120;   // cleanup reservations that are more than 2 minutes old
-constexpr uint32_t RESERVATIONS_CLEANUP_PERIOD_S = 30; // reservation cleanup every 30 seconds
-
-bool init(CephContext* cct, rgw::sal::RadosStore* store, const DoutPrefixProvider *dpp) {
-  if (s_manager) {
-    return false;
-  }
-  // TODO: take conf from CephContext
-  s_manager = new Manager(cct, MAX_QUEUE_SIZE, 
-      Q_LIST_UPDATE_MSEC, Q_LIST_RETRY_MSEC, 
-      IDLE_TIMEOUT_USEC, FAILOVER_TIME_MSEC, 
-      STALE_RESERVATIONS_PERIOD_S, RESERVATIONS_CLEANUP_PERIOD_S,
-      WORKER_COUNT,
-      store);
-  return true;
-}
-
-void shutdown() {
-  delete s_manager;
-  s_manager = nullptr;
-}
-
-int add_persistent_topic(const std::string& topic_name, optional_yield y) {
-  if (!s_manager) {
-    return -EAGAIN;
-  }
-  return s_manager->add_persistent_topic(topic_name, y);
-}
-
-int remove_persistent_topic(const std::string& topic_name, optional_yield y) {
-  if (!s_manager) {
-    return -EAGAIN;
-  }
-  return s_manager->remove_persistent_topic(topic_name, y);
-}
-
-rgw::sal::Object* get_object_with_atttributes(
-  const reservation_t& res, rgw::sal::Object* obj) {
-  // in case of copy obj, the tags and metadata are taken from source
-  const auto src_obj = res.src_object ? res.src_object : obj;
-  if (src_obj->get_attrs().empty()) {
-    if (!src_obj->get_bucket()) {
-      src_obj->set_bucket(res.bucket);
-    }
-    const auto ret = src_obj->get_obj_attrs(res.yield, res.dpp);
-    if (ret < 0) {
-      ldpp_dout(res.dpp, 20) << "failed to get attributes from object: " << 
-        src_obj->get_key() << ". ret = " << ret << dendl;
-      return nullptr;
-    }
-  }
-  return src_obj;
-}
-
-static inline void metadata_from_attributes(
-  reservation_t& res, rgw::sal::Object* obj) {
-  auto& metadata = res.x_meta_map;
-  const auto src_obj = get_object_with_atttributes(res, obj);
-  if (!src_obj) {
-    return;
-  }
-  res.metadata_fetched_from_attributes = true;
-  for (auto& attr : src_obj->get_attrs()) {
-    if (boost::algorithm::starts_with(attr.first, RGW_ATTR_META_PREFIX)) {
-      std::string_view key(attr.first);
-      key.remove_prefix(sizeof(RGW_ATTR_PREFIX)-1);
-      // we want to pass a null terminated version
-      // of the bufferlist, hence "to_str().c_str()"
-      metadata.emplace(key, attr.second.to_str().c_str());
-    }
-  }
-}
-
-static inline void tags_from_attributes(
-  const reservation_t& res, rgw::sal::Object* obj, KeyMultiValueMap& tags) {
-  const auto src_obj = get_object_with_atttributes(res, obj);
-  if (!src_obj) {
-    return;
-  }
-  const auto& attrs = src_obj->get_attrs();
-  const auto attr_iter = attrs.find(RGW_ATTR_TAGS);
-  if (attr_iter != attrs.end()) {
-    auto bliter = attr_iter->second.cbegin();
-    RGWObjTags obj_tags;
-    try {
-      ::decode(obj_tags, bliter);
-    } catch(buffer::error&) {
-      // not able to decode tags
-      return;
-    }
-    tags = std::move(obj_tags.get_tags());
-  }
-}
-
-// populate event from request
-static inline void populate_event(reservation_t& res,
-        rgw::sal::Object* obj,
-        uint64_t size,
-        const ceph::real_time& mtime, 
-        const std::string& etag, 
-        const std::string& version, 
-        EventType event_type,
-        rgw_pubsub_s3_event& event) {
-  event.eventTime = mtime;
-  event.eventName = to_event_string(event_type);
-  event.userIdentity = res.user_id;    // user that triggered the change
-  event.x_amz_request_id = res.req_id; // request ID of the original change
-  event.x_amz_id_2 = res.store->getRados()->host_id; // RGW on which the change was made
-  // configurationId is filled from notification configuration
-  event.bucket_name = res.bucket->get_name();
-  event.bucket_ownerIdentity = res.bucket->get_owner() ? res.bucket->get_owner()->get_id().id : "";
-  const auto region = res.store->get_zone()->get_zonegroup().get_api_name();
-  rgw::ARN bucket_arn(res.bucket->get_key());
-  bucket_arn.region = region; 
-  event.bucket_arn = to_string(bucket_arn);
-  event.object_key = res.object_name ? *res.object_name : obj->get_name();
-  event.object_size = size;
-  event.object_etag = etag;
-  event.object_versionId = version;
-  event.awsRegion = region;
-  // use timestamp as per key sequence id (hex encoded)
-  const utime_t ts(real_clock::now());
-  boost::algorithm::hex((const char*)&ts, (const char*)&ts + sizeof(utime_t), 
-          std::back_inserter(event.object_sequencer));
-  set_event_id(event.id, etag, ts);
-  event.bucket_id = res.bucket->get_bucket_id();
-  // pass meta data
-  if (!res.metadata_fetched_from_attributes) {
-    // either no metadata exist or no metadata filter was used
-    metadata_from_attributes(res, obj);
-  }
-  event.x_meta_map = res.x_meta_map;
-  // pass tags
-  if (!res.tagset ||
-      (*res.tagset).get_tags().empty()) {
-    // try to fetch the tags from the attributes
-    tags_from_attributes(res, obj, event.tags);
-  } else {
-    event.tags = (*res.tagset).get_tags();
-  }
-  // opaque data will be filled from topic configuration
-}
-
-static inline bool notification_match(reservation_t& res,
-                                     const rgw_pubsub_topic_filter& filter,
-                                     EventType event,
-                                     const RGWObjTags* req_tags) {
-  if (!match(filter.events, event)) { 
-    return false;
-  }
-  const auto obj = res.object;
-  if (!match(filter.s3_filter.key_filter, 
-        res.object_name ? *res.object_name : obj->get_name())) {
-    return false;
-  }
-
-  if (!filter.s3_filter.metadata_filter.kv.empty()) {
-    // metadata filter exists
-    if (res.s) {
-      res.x_meta_map = res.s->info.x_meta_map;
-    }
-    metadata_from_attributes(res, obj);
-    if (!match(filter.s3_filter.metadata_filter, res.x_meta_map)) {
-      return false;
-    }
-  }
-
-  if (!filter.s3_filter.tag_filter.kv.empty()) {
-    // tag filter exists
-    if (req_tags) {
-      // tags in the request
-      if (!match(filter.s3_filter.tag_filter, req_tags->get_tags())) {
-        return false;
-      }
-    } else if (res.tagset && !(*res.tagset).get_tags().empty()) {
-      // tags were cached in req_state
-      if (!match(filter.s3_filter.tag_filter, (*res.tagset).get_tags())) {
-        return false;
-      }
-    } else {
-      // try to fetch tags from the attributes
-      KeyMultiValueMap tags;
-      tags_from_attributes(res, obj, tags);
-      if (!match(filter.s3_filter.tag_filter, tags)) {
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-  int publish_reserve(const DoutPrefixProvider* dpp,
-                     EventType event_type,
-                     reservation_t& res,
-                     const RGWObjTags* req_tags)
-{
-  RGWPubSub ps(res.store, res.user_tenant);
-  RGWPubSub::Bucket ps_bucket(&ps, res.bucket->get_key());
-  rgw_pubsub_bucket_topics bucket_topics;
-  auto rc = ps_bucket.get_topics(&bucket_topics);
-  if (rc < 0) {
-    // failed to fetch bucket topics
-    return rc;
-  }
-  for (const auto& bucket_topic : bucket_topics.topics) {
-    const rgw_pubsub_topic_filter& topic_filter = bucket_topic.second;
-    const rgw_pubsub_topic& topic_cfg = topic_filter.topic;
-    if (!notification_match(res, topic_filter, event_type, req_tags)) {
-      // notification does not apply to req_state
-      continue;
-    }
-    ldpp_dout(res.dpp, 20) << "INFO: notification: '" << topic_filter.s3_id <<
-        "' on topic: '" << topic_cfg.dest.arn_topic << 
-        "' and bucket: '" << res.bucket->get_name() <<
-        "' (unique topic: '" << topic_cfg.name <<
-        "') apply to event of type: '" << to_string(event_type) << "'" << dendl;
-
-    cls_2pc_reservation::id_t res_id;
-    if (topic_cfg.dest.persistent) {
-      // TODO: take default reservation size from conf
-      constexpr auto DEFAULT_RESERVATION = 4*1024U; // 4K
-      res.size = DEFAULT_RESERVATION;
-      librados::ObjectWriteOperation op;
-      bufferlist obl;
-      int rval;
-      const auto& queue_name = topic_cfg.dest.arn_topic;
-      cls_2pc_queue_reserve(op, res.size, 1, &obl, &rval);
-      auto ret = rgw_rados_operate(
-       res.dpp, res.store->getRados()->get_notif_pool_ctx(),
-       queue_name, &op, res.yield, librados::OPERATION_RETURNVEC);
-      if (ret < 0) {
-        ldpp_dout(res.dpp, 1) <<
-         "ERROR: failed to reserve notification on queue: "
-                             << queue_name << ". error: " << ret << dendl;
-        // if no space is left in queue we ask client to slow down
-        return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret;
-      }
-      ret = cls_2pc_queue_reserve_result(obl, res_id);
-      if (ret < 0) {
-        ldpp_dout(res.dpp, 1) << "ERROR: failed to parse reservation id. error: " << ret << dendl;
-        return ret;
-      }
-    }
-    res.topics.emplace_back(topic_filter.s3_id, topic_cfg, res_id);
-  }
-  return 0;
-}
-
-int publish_commit(rgw::sal::Object* obj,
-                  uint64_t size,
-                  const ceph::real_time& mtime,
-                  const std::string& etag,
-                  const std::string& version,
-                  EventType event_type,
-                  reservation_t& res,
-                  const DoutPrefixProvider* dpp)
-{
-  for (auto& topic : res.topics) {
-    if (topic.cfg.dest.persistent &&
-       topic.res_id == cls_2pc_reservation::NO_ID) {
-      // nothing to commit or already committed/aborted
-      continue;
-    }
-    event_entry_t event_entry;
-    populate_event(res, obj, size, mtime, etag, version, event_type, event_entry.event);
-    event_entry.event.configurationId = topic.configurationId;
-    event_entry.event.opaque_data = topic.cfg.opaque_data;
-    if (topic.cfg.dest.persistent) { 
-      event_entry.push_endpoint = std::move(topic.cfg.dest.push_endpoint);
-      event_entry.push_endpoint_args =
-       std::move(topic.cfg.dest.push_endpoint_args);
-      event_entry.arn_topic = topic.cfg.dest.arn_topic;
-      bufferlist bl;
-      encode(event_entry, bl);
-      const auto& queue_name = topic.cfg.dest.arn_topic;
-      if (bl.length() > res.size) {
-        // try to make a larger reservation, fail only if this is not possible
-        ldpp_dout(dpp, 5) << "WARNING: committed size: " << bl.length()
-                         << " exceeded reserved size: " << res.size
-                         <<
-          " . trying to make a larger reservation on queue:" << queue_name
-                         << dendl;
-        // first cancel the existing reservation
-        librados::ObjectWriteOperation op;
-        cls_2pc_queue_abort(op, topic.res_id);
-        auto ret = rgw_rados_operate(
-         dpp, res.store->getRados()->get_notif_pool_ctx(),
-         topic.cfg.dest.arn_topic, &op,
-         res.yield);
-        if (ret < 0) {
-          ldpp_dout(dpp, 1) << "ERROR: failed to abort reservation: "
-                           << topic.res_id << 
-            " when trying to make a larger reservation on queue: " << queue_name
-                           << ". error: " << ret << dendl;
-          return ret;
-        }
-        // now try to make a bigger one
-       buffer::list obl;
-        int rval;
-        cls_2pc_queue_reserve(op, bl.length(), 1, &obl, &rval);
-        ret = rgw_rados_operate(
-         dpp, res.store->getRados()->get_notif_pool_ctx(),
-          queue_name, &op, res.yield, librados::OPERATION_RETURNVEC);
-        if (ret < 0) {
-          ldpp_dout(dpp, 1) << "ERROR: failed to reserve extra space on queue: "
-                           << queue_name
-                           << ". error: " << ret << dendl;
-          return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret;
-        }
-        ret = cls_2pc_queue_reserve_result(obl, topic.res_id);
-        if (ret < 0) {
-          ldpp_dout(dpp, 1) << "ERROR: failed to parse reservation id for "
-           "extra space. error: " << ret << dendl;
-          return ret;
-        }
-      }
-      std::vector<buffer::list> bl_data_vec{std::move(bl)};
-      librados::ObjectWriteOperation op;
-      cls_2pc_queue_commit(op, bl_data_vec, topic.res_id);
-      const auto ret = rgw_rados_operate(
-       dpp, res.store->getRados()->get_notif_pool_ctx(),
-       queue_name, &op, res.yield);
-      topic.res_id = cls_2pc_reservation::NO_ID;
-      if (ret < 0) {
-        ldpp_dout(dpp, 1) << "ERROR: failed to commit reservation to queue: "
-                         << queue_name << ". error: " << ret
-                         << dendl;
-        return ret;
-      }
-    } else {
-      try {
-        // TODO add endpoint LRU cache
-        const auto push_endpoint = RGWPubSubEndpoint::create(
-         topic.cfg.dest.push_endpoint,
-         topic.cfg.dest.arn_topic,
-         RGWHTTPArgs(topic.cfg.dest.push_endpoint_args, dpp),
-         dpp->get_cct());
-        ldpp_dout(res.dpp, 20) << "INFO: push endpoint created: "
-                              << topic.cfg.dest.push_endpoint << dendl;
-        const auto ret = push_endpoint->send_to_completion_async(
-         dpp->get_cct(), event_entry.event, res.yield);
-        if (ret < 0) {
-          ldpp_dout(dpp, 1) << "ERROR: push to endpoint "
-                           << topic.cfg.dest.push_endpoint
-                           << " failed. error: " << ret << dendl;
-          if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed);
-          return ret;
-        }
-        if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok);
-      } catch (const RGWPubSubEndpoint::configuration_error& e) {
-        ldpp_dout(dpp, 1) << "ERROR: failed to create push endpoint: " 
-            << topic.cfg.dest.push_endpoint << ". error: " << e.what() << dendl;
-        if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed);
-        return -EINVAL;
-      }
-    }
-  }
-  return 0;
-}
-
-int publish_abort(reservation_t& res) {
-  for (auto& topic : res.topics) {
-    if (!topic.cfg.dest.persistent ||
-       topic.res_id == cls_2pc_reservation::NO_ID) {
-      // nothing to abort or already committed/aborted
-      continue;
-    }
-    const auto& queue_name = topic.cfg.dest.arn_topic;
-    librados::ObjectWriteOperation op;
-    cls_2pc_queue_abort(op, topic.res_id);
-    const auto ret = rgw_rados_operate(
-      res.dpp, res.store->getRados()->get_notif_pool_ctx(),
-      queue_name, &op, res.yield);
-    if (ret < 0) {
-      ldpp_dout(res.dpp, 1) << "ERROR: failed to abort reservation: "
-                           << topic.res_id <<
-        " from queue: " << queue_name << ". error: " << ret << dendl;
-      return ret;
-    }
-    topic.res_id = cls_2pc_reservation::NO_ID;
-  }
-  return 0;
-}
-
-reservation_t::reservation_t(const DoutPrefixProvider* _dpp,
-                            rgw::sal::RadosStore* _store,
-                            const req_state* _s,
-                            rgw::sal::Object* _object,
-                            rgw::sal::Object* _src_object,
-                            const std::string* _object_name) :
-  dpp(_s), store(_store), s(_s), size(0) /* XXX */,
-  object(_object), src_object(_src_object), bucket(_s->bucket.get()),
-  object_name(_object_name),
-  tagset(_s->tagset),
-  x_meta_map(_s->info.x_meta_map),
-  metadata_fetched_from_attributes(false),
-  user_id(_s->user->get_id().id),
-  user_tenant(_s->user->get_id().tenant),
-  req_id(_s->req_id),
-  yield(_s->yield)
-{}
-
-reservation_t::reservation_t(const DoutPrefixProvider* _dpp,
-                            rgw::sal::RadosStore* _store,
-                            rgw::sal::Object* _object,
-                            rgw::sal::Object* _src_object,
-                            rgw::sal::Bucket* _bucket,
-                            const std::string& _user_id,
-                            const std::string& _user_tenant,
-                            const std::string& _req_id,
-                            optional_yield y) :
-    dpp(_dpp), store(_store), s(nullptr), size(0) /* XXX */,
-    object(_object), src_object(_src_object), bucket(_bucket),
-    object_name(nullptr),
-    metadata_fetched_from_attributes(false),
-    user_id(_user_id),
-    user_tenant(_user_tenant),
-    req_id(_req_id),
-    yield(y)
-{}
-
-reservation_t::~reservation_t() {
-  publish_abort(*this);
-}
-
-} // namespace rgw::notify
diff --git a/src/rgw/store/rados/rgw_notify.h b/src/rgw/store/rados/rgw_notify.h
deleted file mode 100644 (file)
index 175dc11..0000000
+++ /dev/null
@@ -1,117 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-#include <string>
-#include "common/ceph_time.h"
-#include "include/common_fwd.h"
-#include "rgw_notify_event_type.h"
-#include "common/async/yield_context.h"
-#include "cls/2pc_queue/cls_2pc_queue_types.h"
-#include "rgw_pubsub.h"
-
-// forward declarations
-namespace rgw::sal {
-    class RadosStore;
-    class RGWObject;
-}
-
-class RGWRados;
-struct rgw_obj_key;
-
-namespace rgw::notify {
-
-// initialize the notification manager
-// notification manager is dequeing the 2-phase-commit queues
-// and send the notifications to the endpoints
-bool init(CephContext* cct, rgw::sal::RadosStore* store, const DoutPrefixProvider *dpp);
-
-// shutdown the notification manager
-void shutdown();
-
-// create persistent delivery queue for a topic (endpoint)
-// this operation also add a topic name to the common (to all RGWs) list of all topics
-int add_persistent_topic(const std::string& topic_name, optional_yield y);
-
-// remove persistent delivery queue for a topic (endpoint)
-// this operation also remove the topic name from the common (to all RGWs) list of all topics
-int remove_persistent_topic(const std::string& topic_name, optional_yield y);
-
-// struct holding reservation information
-// populated in the publish_reserve call
-// then used to commit or abort the reservation
-struct reservation_t {
-  struct topic_t {
-    topic_t(const std::string& _configurationId, const rgw_pubsub_topic& _cfg,
-           cls_2pc_reservation::id_t _res_id) :
-      configurationId(_configurationId), cfg(_cfg), res_id(_res_id) {}
-
-    const std::string configurationId;
-    const rgw_pubsub_topic cfg;
-    // res_id is reset after topic is committed/aborted
-    cls_2pc_reservation::id_t res_id;
-  };
-
-  const DoutPrefixProvider* const dpp;
-  std::vector<topic_t> topics;
-  rgw::sal::RadosStore* const store;
-  const req_state* const s;
-  size_t size;
-  rgw::sal::Object* const object;
-  rgw::sal::Object* const src_object; // may differ from object
-  rgw::sal::Bucket* const bucket;
-  const std::string* const object_name;
-  boost::optional<const RGWObjTags&> tagset;
-  meta_map_t x_meta_map; // metadata cached by value
-  bool metadata_fetched_from_attributes;
-  const std::string user_id;
-  const std::string user_tenant;
-  const std::string req_id;
-  optional_yield yield;
-
-  /* ctor for rgw_op callers */
-  reservation_t(const DoutPrefixProvider* _dpp,
-               rgw::sal::RadosStore* _store,
-               const req_state* _s,
-               rgw::sal::Object* _object,
-               rgw::sal::Object* _src_object,
-               const std::string* _object_name);
-
-  /* ctor for non-request caller (e.g., lifecycle) */
-  reservation_t(const DoutPrefixProvider* _dpp,
-               rgw::sal::RadosStore* _store,
-               rgw::sal::Object* _object,
-               rgw::sal::Object* _src_object,
-               rgw::sal::Bucket* _bucket,
-               const std::string& _user_id,
-               const std::string& _user_tenant,
-               const std::string& _req_id,
-               optional_yield y);
-
-  // dtor doing resource leak guarding
-  // aborting the reservation if not already committed or aborted
-  ~reservation_t();
-};
-
-// create a reservation on the 2-phase-commit queue
-  int publish_reserve(const DoutPrefixProvider *dpp,
-                     EventType event_type,
-                     reservation_t& reservation,
-                     const RGWObjTags* req_tags);
-
-// commit the reservation to the queue
-int publish_commit(rgw::sal::Object* obj,
-        uint64_t size,
-        const ceph::real_time& mtime, 
-        const std::string& etag, 
-        const std::string& version,
-        EventType event_type,
-        reservation_t& reservation,
-        const DoutPrefixProvider *dpp);
-
-// cancel the reservation
-int publish_abort(reservation_t& reservation);
-
-}
-
diff --git a/src/rgw/store/rados/rgw_obj_manifest.cc b/src/rgw/store/rados/rgw_obj_manifest.cc
deleted file mode 100644 (file)
index 3838f5c..0000000
+++ /dev/null
@@ -1,404 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_obj_manifest.h"
-
-#include "services/svc_zone.h"
-#include "rgw_rados.h"
-#include "rgw_bucket.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-int RGWObjManifest::generator::create_next(uint64_t ofs)
-{
-  if (ofs < last_ofs) /* only going forward */
-    return -EINVAL;
-
-  uint64_t max_head_size = manifest->get_max_head_size();
-
-  if (ofs < max_head_size) {
-    manifest->set_head_size(ofs);
-  }
-
-  if (ofs >= max_head_size) {
-    manifest->set_head_size(max_head_size);
-    cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
-    cur_stripe_size = rule.stripe_max_size;
-
-    if (cur_part_id == 0 && max_head_size > 0) {
-      cur_stripe++;
-    }
-  }
-
-  last_ofs = ofs;
-  manifest->set_obj_size(ofs);
-
-  manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
-
-  return 0;
-}
-
-int RGWObjManifest::append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup,
-                           const RGWZoneParams& zone_params)
-{
-  if (explicit_objs || m.explicit_objs) {
-    return append_explicit(dpp, m, zonegroup, zone_params);
-  }
-
-  if (rules.empty()) {
-    *this = m;
-    return 0;
-  }
-
-  string override_prefix;
-
-  if (prefix.empty()) {
-    prefix = m.prefix;
-  }
-
-  if (prefix != m.prefix) {
-    override_prefix = m.prefix;
-  }
-
-  map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
-  if (miter == m.rules.end()) {
-    return append_explicit(dpp, m, zonegroup, zone_params);
-  }
-
-  for (; miter != m.rules.end(); ++miter) {
-    map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
-
-    RGWObjManifestRule& rule = last_rule->second;
-
-    if (rule.part_size == 0) {
-      rule.part_size = obj_size - rule.start_ofs;
-    }
-
-    RGWObjManifestRule& next_rule = miter->second;
-    if (!next_rule.part_size) {
-      next_rule.part_size = m.obj_size - next_rule.start_ofs;
-    }
-
-    string rule_prefix = prefix;
-    if (!rule.override_prefix.empty()) {
-      rule_prefix = rule.override_prefix;
-    }
-
-    string next_rule_prefix = m.prefix;
-    if (!next_rule.override_prefix.empty()) {
-      next_rule_prefix = next_rule.override_prefix;
-    }
-
-    if (rule.part_size != next_rule.part_size ||
-        rule.stripe_max_size != next_rule.stripe_max_size ||
-        rule_prefix != next_rule_prefix) {
-      if (next_rule_prefix != prefix) {
-        append_rules(m, miter, &next_rule_prefix);
-      } else {
-        append_rules(m, miter, NULL);
-      }
-      break;
-    }
-
-    uint64_t expected_part_num = rule.start_part_num + 1;
-    if (rule.part_size > 0) {
-      expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
-    }
-
-    if (expected_part_num != next_rule.start_part_num) {
-      append_rules(m, miter, NULL);
-      break;
-    }
-  }
-
-  set_obj_size(obj_size + m.obj_size);
-
-  return 0;
-}
-
-void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
-                                  string *override_prefix)
-{
-  for (; miter != m.rules.end(); ++miter) {
-    RGWObjManifestRule rule = miter->second;
-    rule.start_ofs += obj_size;
-    if (override_prefix)
-      rule.override_prefix = *override_prefix;
-    rules[rule.start_ofs] = rule;
-  }
-}
-
-void RGWObjManifest::convert_to_explicit(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
-{
-  if (explicit_objs) {
-    return;
-  }
-  obj_iterator iter = obj_begin(dpp);
-
-  while (iter != obj_end(dpp)) {
-    RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
-    const rgw_obj_select& os = iter.get_location();
-    const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
-    part.loc_ofs = 0;
-
-    uint64_t ofs = iter.get_stripe_ofs();
-
-    if (ofs == 0) {
-      part.loc = obj;
-    } else {
-      RGWSI_Tier_RADOS::raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
-    }
-    ++iter;
-    uint64_t next_ofs = iter.get_stripe_ofs();
-
-    part.size = next_ofs - ofs;
-  }
-
-  explicit_objs = true;
-  rules.clear();
-  prefix.clear();
-}
-
-int RGWObjManifest::append_explicit(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
-{
-  if (!explicit_objs) {
-    convert_to_explicit(dpp, zonegroup, zone_params);
-  }
-  if (!m.explicit_objs) {
-    m.convert_to_explicit(dpp, zonegroup, zone_params);
-  }
-  map<uint64_t, RGWObjManifestPart>::iterator iter;
-  uint64_t base = obj_size;
-  for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
-    RGWObjManifestPart& part = iter->second;
-    objs[base + iter->first] = part;
-  }
-  obj_size += m.obj_size;
-
-  return 0;
-}
-
-bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
-{
-  if (rules.empty()) {
-    return false;
-  }
-
-  map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
-  if (iter != rules.begin()) {
-    --iter;
-  }
-
-  *rule = iter->second;
-
-  return true;
-}
-
-int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m,
-                                            const rgw_placement_rule& head_placement_rule,
-                                            const rgw_placement_rule *tail_placement_rule,
-                                            const rgw_bucket& _b, const rgw_obj& _obj)
-{
-  manifest = _m;
-
-  if (!tail_placement_rule) {
-    manifest->set_tail_placement(head_placement_rule, _b);
-  } else {
-    rgw_placement_rule new_tail_rule = *tail_placement_rule;
-    new_tail_rule.inherit_from(head_placement_rule);
-    manifest->set_tail_placement(new_tail_rule, _b);
-  }
-
-  manifest->set_head(head_placement_rule, _obj, 0);
-  last_ofs = 0;
-
-  if (manifest->get_prefix().empty()) {
-    char buf[33];
-    gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
-
-    string oid_prefix = ".";
-    oid_prefix.append(buf);
-    oid_prefix.append("_");
-
-    manifest->set_prefix(oid_prefix);
-  }
-
-  bool found = manifest->get_rule(0, &rule);
-  if (!found) {
-    derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
-    return -EIO;
-  }
-
-  uint64_t head_size = manifest->get_head_size();
-
-  if (head_size > 0) {
-    cur_stripe_size = head_size;
-  } else {
-    cur_stripe_size = rule.stripe_max_size;
-  }
-  
-  cur_part_id = rule.start_part_num;
-
-  manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
-
-  // Normal object which not generated through copy operation 
-  manifest->set_tail_instance(_obj.key.instance);
-
-  return 0;
-}
-
-void RGWObjManifestPart::generate_test_instances(std::list<RGWObjManifestPart*>& o)
-{
-  o.push_back(new RGWObjManifestPart);
-
-  RGWObjManifestPart *p = new RGWObjManifestPart;
-  rgw_bucket b;
-  init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12");
-
-  p->loc = rgw_obj(b, "object");
-  p->loc_ofs = 512 * 1024;
-  p->size = 128 * 1024;
-  o.push_back(p);
-}
-
-void RGWObjManifest::generate_test_instances(std::list<RGWObjManifest*>& o)
-{
-  RGWObjManifest *m = new RGWObjManifest;
-  map<uint64_t, RGWObjManifestPart> objs;
-  uint64_t total_size = 0;
-  for (int i = 0; i<10; i++) {
-    RGWObjManifestPart p;
-    rgw_bucket b;
-    init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12");
-    p.loc = rgw_obj(b, "object");
-    p.loc_ofs = 0;
-    p.size = 512 * 1024;
-    total_size += p.size;
-    objs[total_size] = p;
-  }
-  m->set_explicit(total_size, objs);
-  o.push_back(m);
-  o.push_back(new RGWObjManifest);
-}
-
-void RGWObjManifestPart::dump(Formatter *f) const
-{
-  f->open_object_section("loc");
-  loc.dump(f);
-  f->close_section();
-  f->dump_unsigned("loc_ofs", loc_ofs);
-  f->dump_unsigned("size", size);
-}
-
-void RGWObjManifest::obj_iterator::dump(Formatter *f) const
-{
-  f->dump_unsigned("part_ofs", part_ofs);
-  f->dump_unsigned("stripe_ofs", stripe_ofs);
-  f->dump_unsigned("ofs", ofs);
-  f->dump_unsigned("stripe_size", stripe_size);
-  f->dump_int("cur_part_id", cur_part_id);
-  f->dump_int("cur_stripe", cur_stripe);
-  f->dump_string("cur_override_prefix", cur_override_prefix);
-  f->dump_object("location", location);
-}
-
-void RGWObjManifest::dump(Formatter *f) const
-{
-  map<uint64_t, RGWObjManifestPart>::const_iterator iter = objs.begin();
-  f->open_array_section("objs");
-  for (; iter != objs.end(); ++iter) {
-    f->dump_unsigned("ofs", iter->first);
-    f->open_object_section("part");
-    iter->second.dump(f);
-    f->close_section();
-  }
-  f->close_section();
-  f->dump_unsigned("obj_size", obj_size);
-  ::encode_json("explicit_objs", explicit_objs, f);
-  ::encode_json("head_size", head_size, f);
-  ::encode_json("max_head_size", max_head_size, f);
-  ::encode_json("prefix", prefix, f);
-  ::encode_json("rules", rules, f);
-  ::encode_json("tail_instance", tail_instance, f);
-  ::encode_json("tail_placement", tail_placement, f);
-
-  // nullptr being passed into iterators since there
-  // is no cct and we aren't doing anything with these
-  // iterators that would write do the log
-  f->dump_object("begin_iter", obj_begin(nullptr));
-  f->dump_object("end_iter", obj_end(nullptr));
-}
-
-void RGWObjManifestRule::dump(Formatter *f) const
-{
-  encode_json("start_part_num", start_part_num, f);
-  encode_json("start_ofs", start_ofs, f);
-  encode_json("part_size", part_size, f);
-  encode_json("stripe_max_size", stripe_max_size, f);
-  encode_json("override_prefix", override_prefix, f);
-}
-
-void rgw_obj_select::dump(Formatter *f) const
-{
-  f->dump_string("placement_rule", placement_rule.to_str());
-  f->dump_object("obj", obj);
-  f->dump_object("raw_obj", raw_obj);
-  f->dump_bool("is_raw", is_raw);
-}
-
-void RGWObjTier::dump(Formatter *f) const
-{
-  encode_json("name", name, f);
-  encode_json("tier_placement", tier_placement, f);
-  encode_json("is_multipart_upload", is_multipart_upload, f);
-}
-
-// returns true on success, false on failure
-static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
-                                  const rgw_placement_rule& head_placement_rule,
-                                  const rgw_obj& obj, rgw_pool *pool)
-{
-  if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
-    RGWZonePlacementInfo placement;
-    if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
-      return false;
-    }
-
-    if (!obj.in_extra_data) {
-      *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
-    } else {
-      *pool = placement.get_data_extra_pool();
-    }
-  }
-
-  return true;
-}
-
-static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
-                           const rgw_placement_rule& head_placement_rule,
-                           const rgw_obj& obj, rgw_raw_obj *raw_obj)
-{
-  get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
-
-  return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
-}
-
-rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
-{
-  if (!is_raw) {
-    rgw_raw_obj r;
-    rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
-    return r;
-  }
-  return raw_obj;
-}
-
-// returns true on success, false on failure
-bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
-{
-  return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
-}
-
diff --git a/src/rgw/store/rados/rgw_obj_manifest.h b/src/rgw/store/rados/rgw_obj_manifest.h
deleted file mode 100644 (file)
index ac73359..0000000
+++ /dev/null
@@ -1,609 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2019 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include "rgw_common.h"
-#include "rgw_compression_types.h"
-#include "rgw_sal.h"
-#include "rgw_zone.h"
-
-class RGWSI_Zone;
-struct RGWZoneGroup;
-struct RGWZoneParams;
-class RGWRados;
-namespace rgw { namespace sal {
-  class RadosStore;
-} };
-
-class rgw_obj_select {
-  rgw_placement_rule placement_rule;
-  rgw_obj obj;
-  rgw_raw_obj raw_obj;
-  bool is_raw;
-
-public:
-  rgw_obj_select() : is_raw(false) {}
-  explicit rgw_obj_select(const rgw_obj& _obj) : obj(_obj), is_raw(false) {}
-  explicit rgw_obj_select(const rgw_raw_obj& _raw_obj) : raw_obj(_raw_obj), is_raw(true) {}
-  rgw_obj_select(const rgw_obj_select& rhs) {
-    placement_rule = rhs.placement_rule;
-    is_raw = rhs.is_raw;
-    if (is_raw) {
-      raw_obj = rhs.raw_obj;
-    } else {
-      obj = rhs.obj;
-    }
-  }
-
-  rgw_raw_obj get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const;
-  rgw_raw_obj get_raw_obj(rgw::sal::RadosStore* store) const;
-
-  rgw_obj_select& operator=(const rgw_obj& rhs) {
-    obj = rhs;
-    is_raw = false;
-    return *this;
-  }
-
-  rgw_obj_select& operator=(const rgw_raw_obj& rhs) {
-    raw_obj = rhs;
-    is_raw = true;
-    return *this;
-  }
-
-  void set_placement_rule(const rgw_placement_rule& rule) {
-    placement_rule = rule;
-  }
-  void dump(Formatter *f) const;
-};
-
-struct RGWObjManifestPart {
-  rgw_obj loc;   /* the object where the data is located */
-  uint64_t loc_ofs;  /* the offset at that object where the data is located */
-  uint64_t size;     /* the part size */
-
-  RGWObjManifestPart() : loc_ofs(0), size(0) {}
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(2, 2, bl);
-    encode(loc, bl);
-    encode(loc_ofs, bl);
-    encode(size, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-     DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl);
-     decode(loc, bl);
-     decode(loc_ofs, bl);
-     decode(size, bl);
-     DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  static void generate_test_instances(std::list<RGWObjManifestPart*>& o);
-};
-WRITE_CLASS_ENCODER(RGWObjManifestPart)
-
-/*
- The manifest defines a set of rules for structuring the object parts.
- There are a few terms to note:
-     - head: the head part of the object, which is the part that contains
-       the first chunk of data. An object might not have a head (as in the
-       case of multipart-part objects).
-     - stripe: data portion of a single rgw object that resides on a single
-       rados object.
-     - part: a collection of stripes that make a contiguous part of an
-       object. A regular object will only have one part (although might have
-       many stripes), a multipart object might have many parts. Each part
-       has a fixed stripe size, although the last stripe of a part might
-       be smaller than that. Consecutive parts may be merged if their stripe
-       value is the same.
-*/
-
-struct RGWObjManifestRule {
-  uint32_t start_part_num;
-  uint64_t start_ofs;
-  uint64_t part_size; /* each part size, 0 if there's no part size, meaning it's unlimited */
-  uint64_t stripe_max_size; /* underlying obj max size */
-  std::string override_prefix;
-
-  RGWObjManifestRule() : start_part_num(0), start_ofs(0), part_size(0), stripe_max_size(0) {}
-  RGWObjManifestRule(uint32_t _start_part_num, uint64_t _start_ofs, uint64_t _part_size, uint64_t _stripe_max_size) :
-                       start_part_num(_start_part_num), start_ofs(_start_ofs), part_size(_part_size), stripe_max_size(_stripe_max_size) {}
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(2, 1, bl);
-    encode(start_part_num, bl);
-    encode(start_ofs, bl);
-    encode(part_size, bl);
-    encode(stripe_max_size, bl);
-    encode(override_prefix, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(2, bl);
-    decode(start_part_num, bl);
-    decode(start_ofs, bl);
-    decode(part_size, bl);
-    decode(stripe_max_size, bl);
-    if (struct_v >= 2)
-      decode(override_prefix, bl);
-    DECODE_FINISH(bl);
-  }
-  void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(RGWObjManifestRule)
-
-struct RGWObjTier {
-    std::string name;
-    RGWZoneGroupPlacementTier tier_placement;
-    bool is_multipart_upload{false};
-
-    RGWObjTier(): name("none") {}
-
-    void encode(bufferlist& bl) const {
-      ENCODE_START(2, 2, bl);
-      encode(name, bl);
-      encode(tier_placement, bl);
-      encode(is_multipart_upload, bl);
-      ENCODE_FINISH(bl);
-    }
-
-    void decode(bufferlist::const_iterator& bl) {
-      DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
-      decode(name, bl);
-      decode(tier_placement, bl);
-      decode(is_multipart_upload, bl);
-      DECODE_FINISH(bl);
-    }
-    void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(RGWObjTier)
-
-class RGWObjManifest {
-protected:
-  bool explicit_objs{false}; /* really old manifest? */
-  std::map<uint64_t, RGWObjManifestPart> objs;
-
-  uint64_t obj_size{0};
-
-  rgw_obj obj;
-  uint64_t head_size{0};
-  rgw_placement_rule head_placement_rule;
-
-  uint64_t max_head_size{0};
-  std::string prefix;
-  rgw_bucket_placement tail_placement; /* might be different than the original bucket,
-                                       as object might have been copied across pools */
-  std::map<uint64_t, RGWObjManifestRule> rules;
-
-  std::string tail_instance; /* tail object's instance */
-
-  std::string tier_type;
-  RGWObjTier tier_config;
-
-  void convert_to_explicit(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params);
-  int append_explicit(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params);
-  void append_rules(RGWObjManifest& m, std::map<uint64_t, RGWObjManifestRule>::iterator& iter, std::string *override_prefix);
-
-public:
-
-  RGWObjManifest() = default;
-  RGWObjManifest(const RGWObjManifest& rhs) {
-    *this = rhs;
-  }
-  RGWObjManifest& operator=(const RGWObjManifest& rhs) {
-    explicit_objs = rhs.explicit_objs;
-    objs = rhs.objs;
-    obj_size = rhs.obj_size;
-    obj = rhs.obj;
-    head_size = rhs.head_size;
-    max_head_size = rhs.max_head_size;
-    prefix = rhs.prefix;
-    tail_placement = rhs.tail_placement;
-    rules = rhs.rules;
-    tail_instance = rhs.tail_instance;
-    tier_type = rhs.tier_type;
-    tier_config = rhs.tier_config;
-    return *this;
-  }
-
-  std::map<uint64_t, RGWObjManifestPart>& get_explicit_objs() {
-    return objs;
-  }
-
-
-  void set_explicit(uint64_t _size, std::map<uint64_t, RGWObjManifestPart>& _objs) {
-    explicit_objs = true;
-    objs.swap(_objs);
-    set_obj_size(_size);
-  }
-
-  void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs,
-                             std::string *override_prefix, rgw_obj_select *location) const;
-
-  void set_trivial_rule(uint64_t tail_ofs, uint64_t stripe_max_size) {
-    RGWObjManifestRule rule(0, tail_ofs, 0, stripe_max_size);
-    rules[0] = rule;
-    max_head_size = tail_ofs;
-  }
-
-  void set_multipart_part_rule(uint64_t stripe_max_size, uint64_t part_num) {
-    RGWObjManifestRule rule(0, 0, 0, stripe_max_size);
-    rule.start_part_num = part_num;
-    rules[0] = rule;
-    max_head_size = 0;
-  }
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(8, 6, bl);
-    encode(obj_size, bl);
-    encode(objs, bl);
-    encode(explicit_objs, bl);
-    encode(obj, bl);
-    encode(head_size, bl);
-    encode(max_head_size, bl);
-    encode(prefix, bl);
-    encode(rules, bl);
-    bool encode_tail_bucket = !(tail_placement.bucket == obj.bucket);
-    encode(encode_tail_bucket, bl);
-    if (encode_tail_bucket) {
-      encode(tail_placement.bucket, bl);
-    }
-    bool encode_tail_instance = (tail_instance != obj.key.instance);
-    encode(encode_tail_instance, bl);
-    if (encode_tail_instance) {
-      encode(tail_instance, bl);
-    }
-    encode(head_placement_rule, bl);
-    encode(tail_placement.placement_rule, bl);
-    encode(tier_type, bl);
-    encode(tier_config, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START_LEGACY_COMPAT_LEN_32(7, 2, 2, bl);
-    decode(obj_size, bl);
-    decode(objs, bl);
-    if (struct_v >= 3) {
-      decode(explicit_objs, bl);
-      decode(obj, bl);
-      decode(head_size, bl);
-      decode(max_head_size, bl);
-      decode(prefix, bl);
-      decode(rules, bl);
-    } else {
-      explicit_objs = true;
-      if (!objs.empty()) {
-        std::map<uint64_t, RGWObjManifestPart>::iterator iter = objs.begin();
-        obj = iter->second.loc;
-        head_size = iter->second.size;
-        max_head_size = head_size;
-      }
-    }
-
-    if (explicit_objs && head_size > 0 && !objs.empty()) {
-      /* patch up manifest due to issue 16435:
-       * the first object in the explicit objs list might not be the one we need to access, use the
-       * head object instead if set. This would happen if we had an old object that was created
-       * when the explicit objs manifest was around, and it got copied.
-       */
-      rgw_obj& obj_0 = objs[0].loc;
-      if (!obj_0.get_oid().empty() && obj_0.key.ns.empty()) {
-        objs[0].loc = obj;
-        objs[0].size = head_size;
-      }
-    }
-
-    if (struct_v >= 4) {
-      if (struct_v < 6) {
-        decode(tail_placement.bucket, bl);
-      } else {
-        bool need_to_decode;
-        decode(need_to_decode, bl);
-        if (need_to_decode) {
-          decode(tail_placement.bucket, bl);
-        } else {
-          tail_placement.bucket = obj.bucket;
-        }
-      }
-    }
-
-    if (struct_v >= 5) {
-      if (struct_v < 6) {
-        decode(tail_instance, bl);
-      } else {
-        bool need_to_decode;
-        decode(need_to_decode, bl);
-        if (need_to_decode) {
-          decode(tail_instance, bl);
-        } else {
-          tail_instance = obj.key.instance;
-        }
-      }
-    } else { // old object created before 'tail_instance' field added to manifest
-      tail_instance = obj.key.instance;
-    }
-
-    if (struct_v >= 7) {
-      decode(head_placement_rule, bl);
-      decode(tail_placement.placement_rule, bl);
-    }
-
-    if (struct_v >= 8) {
-      decode(tier_type, bl);
-      decode(tier_config, bl);
-    }
-
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  static void generate_test_instances(std::list<RGWObjManifest*>& o);
-
-  int append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup,
-             const RGWZoneParams& zone_params);
-
-  bool get_rule(uint64_t ofs, RGWObjManifestRule *rule);
-
-  bool empty() const {
-    if (explicit_objs)
-      return objs.empty();
-    return rules.empty();
-  }
-
-  bool has_explicit_objs() const {
-    return explicit_objs;
-  }
-
-  bool has_tail() const {
-    if (explicit_objs) {
-      if (objs.size() == 1) {
-        auto iter = objs.begin();
-        const rgw_obj& o = iter->second.loc;
-        return !(obj == o);
-      }
-      return (objs.size() >= 2);
-    }
-    return (obj_size > head_size);
-  }
-
-  void set_head(const rgw_placement_rule& placement_rule, const rgw_obj& _o, uint64_t _s) {
-    head_placement_rule = placement_rule;
-    obj = _o;
-    head_size = _s;
-
-    if (explicit_objs && head_size > 0) {
-      objs[0].loc = obj;
-      objs[0].size = head_size;
-    }
-  }
-
-  const rgw_obj& get_obj() const {
-    return obj;
-  }
-
-  void set_tail_placement(const rgw_placement_rule& placement_rule, const rgw_bucket& _b) {
-    tail_placement.placement_rule = placement_rule;
-    tail_placement.bucket = _b;
-  }
-
-  const rgw_bucket_placement& get_tail_placement() const {
-    return tail_placement;
-  }
-
-  const rgw_placement_rule& get_head_placement_rule() const {
-    return head_placement_rule;
-  }
-
-  void set_prefix(const std::string& _p) {
-    prefix = _p;
-  }
-
-  const std::string& get_prefix() const {
-    return prefix;
-  }
-
-  void set_tail_instance(const std::string& _ti) {
-    tail_instance = _ti;
-  }
-
-  const std::string& get_tail_instance() const {
-    return tail_instance;
-  }
-
-  void set_head_size(uint64_t _s) {
-    head_size = _s;
-  }
-
-  void set_obj_size(uint64_t s) {
-    obj_size = s;
-  }
-
-  uint64_t get_obj_size() const {
-    return obj_size;
-  }
-
-  uint64_t get_head_size() const {
-    return head_size;
-  }
-
-  uint64_t get_max_head_size() const {
-    return max_head_size;
-  }
-
-  const std::string& get_tier_type() {
-      return tier_type;
-  }
-
-  inline void set_tier_type(std::string value) {
-      /* Only "cloud-s3" tier-type is supported for now */
-      if (value == "cloud-s3") {
-        tier_type = value;
-      }
-  }
-
-  inline void set_tier_config(RGWObjTier t) {
-      /* Set only if tier_type set to "cloud-s3" */
-      if (tier_type != "cloud-s3")
-        return;
-
-      tier_config.name = t.name;
-      tier_config.tier_placement = t.tier_placement;
-      tier_config.is_multipart_upload = t.is_multipart_upload;
-  }
-
-  inline const void get_tier_config(RGWObjTier* t) {
-      if (tier_type != "cloud-s3")
-        return;
-
-      t->name = tier_config.name;
-      t->tier_placement = tier_config.tier_placement;
-      t->is_multipart_upload = tier_config.is_multipart_upload;
-  }
-
-  class obj_iterator {
-    const DoutPrefixProvider *dpp;
-    const RGWObjManifest *manifest = nullptr;
-    uint64_t part_ofs = 0;   /* where current part starts */
-    uint64_t stripe_ofs = 0; /* where current stripe starts */
-    uint64_t ofs = 0;        /* current position within the object */
-    uint64_t stripe_size = 0;      /* current part size */
-
-    int cur_part_id = 0;
-    int cur_stripe = 0;
-    std::string cur_override_prefix;
-
-    rgw_obj_select location;
-
-    std::map<uint64_t, RGWObjManifestRule>::const_iterator rule_iter;
-    std::map<uint64_t, RGWObjManifestRule>::const_iterator next_rule_iter;
-    std::map<uint64_t, RGWObjManifestPart>::const_iterator explicit_iter;
-
-    void update_explicit_pos();
-
-  public:
-    obj_iterator() = default;
-    explicit obj_iterator(const DoutPrefixProvider *_dpp, const RGWObjManifest *_m)
-      : obj_iterator(_dpp, _m, 0)
-    {}
-    obj_iterator(const DoutPrefixProvider *_dpp, const RGWObjManifest *_m, uint64_t _ofs) : dpp(_dpp), manifest(_m) {
-      seek(_ofs);
-    }
-    void seek(uint64_t ofs);
-
-    void operator++();
-    bool operator==(const obj_iterator& rhs) const {
-      return (ofs == rhs.ofs);
-    }
-    bool operator!=(const obj_iterator& rhs) const {
-      return (ofs != rhs.ofs);
-    }
-    const rgw_obj_select& get_location() {
-      return location;
-    }
-
-    /* where current part starts */
-    uint64_t get_part_ofs() const {
-      return part_ofs;
-    }
-
-    /* start of current stripe */
-    uint64_t get_stripe_ofs() {
-      if (manifest->explicit_objs) {
-        return explicit_iter->first;
-      }
-      return stripe_ofs;
-    }
-
-    /* current ofs relative to start of rgw object */
-    uint64_t get_ofs() const {
-      return ofs;
-    }
-
-    /* stripe number */
-    int get_cur_stripe() const {
-      return cur_stripe;
-    }
-
-    /* current stripe size */
-    uint64_t get_stripe_size() {
-      if (manifest->explicit_objs) {
-        return explicit_iter->second.size;
-      }
-      return stripe_size;
-    }
-
-    /* offset where data starts within current stripe */
-    uint64_t location_ofs() {
-      if (manifest->explicit_objs) {
-        return explicit_iter->second.loc_ofs;
-      }
-      return 0; /* all stripes start at zero offset */
-    }
-
-    void update_location();
-
-    void dump(Formatter *f) const;
-  }; // class obj_iterator
-
-  obj_iterator obj_begin(const DoutPrefixProvider *dpp) const { return obj_iterator{dpp, this}; }
-  obj_iterator obj_end(const DoutPrefixProvider *dpp) const { return obj_iterator{dpp, this, obj_size}; }
-  obj_iterator obj_find(const DoutPrefixProvider *dpp, uint64_t ofs) const {
-    return obj_iterator{dpp, this, std::min(ofs, obj_size)};
-  }
-
-  /*
-   * simple object generator. Using a simple single rule manifest.
-   */
-  class generator {
-    RGWObjManifest *manifest;
-    uint64_t last_ofs;
-    uint64_t cur_part_ofs;
-    int cur_part_id;
-    int cur_stripe;
-    uint64_t cur_stripe_size;
-    std::string cur_oid;
-    
-    std::string oid_prefix;
-
-    rgw_obj_select cur_obj;
-
-    RGWObjManifestRule rule;
-
-  public:
-    generator() : manifest(NULL), last_ofs(0), cur_part_ofs(0), cur_part_id(0), 
-                 cur_stripe(0), cur_stripe_size(0) {}
-    int create_begin(CephContext *cct, RGWObjManifest *manifest,
-                     const rgw_placement_rule& head_placement_rule,
-                     const rgw_placement_rule *tail_placement_rule,
-                     const rgw_bucket& bucket,
-                     const rgw_obj& obj);
-
-    int create_next(uint64_t ofs);
-
-    rgw_raw_obj get_cur_obj(RGWZoneGroup& zonegroup, RGWZoneParams& zone_params) { return cur_obj.get_raw_obj(zonegroup, zone_params); }
-    rgw_raw_obj get_cur_obj(rgw::sal::RadosStore* store) const { return cur_obj.get_raw_obj(store); }
-
-    /* total max size of current stripe (including head obj) */
-    uint64_t cur_stripe_max_size() const {
-      return cur_stripe_size;
-    }
-  };
-};
-WRITE_CLASS_ENCODER(RGWObjManifest)
diff --git a/src/rgw/store/rados/rgw_object_expirer_core.cc b/src/rgw/store/rados/rgw_object_expirer_core.cc
deleted file mode 100644 (file)
index ec1bf3f..0000000
+++ /dev/null
@@ -1,442 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include <errno.h>
-#include <iostream>
-#include <sstream>
-#include <string>
-
-
-#include "auth/Crypto.h"
-
-#include "common/armor.h"
-#include "common/ceph_json.h"
-#include "common/config.h"
-#include "common/ceph_argparse.h"
-#include "common/Formatter.h"
-#include "common/errno.h"
-
-#include "global/global_init.h"
-
-#include "include/utime.h"
-#include "include/str_list.h"
-
-#include "rgw_user.h"
-#include "rgw_bucket.h"
-#include "rgw_acl.h"
-#include "rgw_acl_s3.h"
-#include "rgw_log.h"
-#include "rgw_formats.h"
-#include "rgw_usage.h"
-#include "rgw_object_expirer_core.h"
-#include "rgw_zone.h"
-#include "rgw_sal_rados.h"
-
-#include "services/svc_rados.h"
-#include "services/svc_zone.h"
-#include "services/svc_sys_obj.h"
-#include "services/svc_bi_rados.h"
-
-#include "cls/lock/cls_lock_client.h"
-#include "cls/timeindex/cls_timeindex_client.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-static string objexp_lock_name = "gc_process";
-
-static string objexp_hint_get_shardname(int shard_num)
-{
-  char buf[64];
-  snprintf(buf, sizeof(buf), "obj_delete_at_hint.%010u", (unsigned)shard_num);
-  return buf;
-}
-
-static int objexp_key_shard(const rgw_obj_index_key& key, int num_shards)
-{
-  string obj_key = key.name + key.instance;
-  return RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards);
-}
-
-static string objexp_hint_get_keyext(const string& tenant_name,
-                                     const string& bucket_name,
-                                     const string& bucket_id,
-                                     const rgw_obj_key& obj_key) {
-  return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
-    ":" + obj_key.name + ":" + obj_key.instance;
-}
-
-static void objexp_get_shard(int shard_num,
-                             string *shard)
-{
-  *shard = objexp_hint_get_shardname(shard_num);
-}
-
-static int objexp_hint_parse(const DoutPrefixProvider *dpp, CephContext *cct, cls_timeindex_entry &ti_entry,
-                             objexp_hint_entry *hint_entry)
-{
-  try {
-    auto iter = ti_entry.value.cbegin();
-    decode(*hint_entry, iter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: couldn't decode avail_pools" << dendl;
-  }
-
-  return 0;
-}
-
-int RGWObjExpStore::objexp_hint_add(const DoutPrefixProvider *dpp, 
-                              const ceph::real_time& delete_at,
-                              const string& tenant_name,
-                              const string& bucket_name,
-                              const string& bucket_id,
-                              const rgw_obj_index_key& obj_key)
-{
-  const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
-          bucket_id, obj_key);
-  objexp_hint_entry he = {
-      .tenant = tenant_name,
-      .bucket_name = bucket_name,
-      .bucket_id = bucket_id,
-      .obj_key = obj_key,
-      .exp_time = delete_at };
-  bufferlist hebl;
-  encode(he, hebl);
-  librados::ObjectWriteOperation op;
-  cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
-
-  string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key, cct->_conf->rgw_objexp_hints_num_shards));
-  auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, shard_name));
-  int r = obj.open(dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
-    return r;
-  }
-  return obj.operate(dpp, &op, null_yield);
-}
-
-int RGWObjExpStore::objexp_hint_list(const DoutPrefixProvider *dpp, 
-                               const string& oid,
-                               const ceph::real_time& start_time,
-                               const ceph::real_time& end_time,
-                               const int max_entries,
-                               const string& marker,
-                               list<cls_timeindex_entry>& entries, /* out */
-                               string *out_marker,                 /* out */
-                               bool *truncated)                    /* out */
-{
-  librados::ObjectReadOperation op;
-  cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
-        out_marker, truncated);
-
-  auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, oid));
-  int r = obj.open(dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
-    return r;
-  }
-  bufferlist obl;
-  int ret = obj.operate(dpp, &op, &obl, null_yield);
-
-  if ((ret < 0 ) && (ret != -ENOENT)) {
-    return ret;
-  }
-
-  if ((ret == -ENOENT) && truncated) {
-    *truncated = false;
-  }
-
-  return 0;
-}
-
-static int cls_timeindex_trim_repeat(const DoutPrefixProvider *dpp, 
-                                rgw_rados_ref ref,
-                                const string& oid,
-                                const utime_t& from_time,
-                                const utime_t& to_time,
-                                const string& from_marker,
-                                const string& to_marker)
-{
-  bool done = false;
-  do {
-    librados::ObjectWriteOperation op;
-    cls_timeindex_trim(op, from_time, to_time, from_marker, to_marker);
-    int r = rgw_rados_operate(dpp, ref.pool.ioctx(), oid, &op, null_yield);
-    if (r == -ENODATA)
-      done = true;
-    else if (r < 0)
-      return r;
-  } while (!done);
-
-  return 0;
-}
-
-int RGWObjExpStore::objexp_hint_trim(const DoutPrefixProvider *dpp, 
-                               const string& oid,
-                               const ceph::real_time& start_time,
-                               const ceph::real_time& end_time,
-                               const string& from_marker,
-                               const string& to_marker)
-{
-  auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, oid));
-  int r = obj.open(dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
-    return r;
-  }
-  auto& ref = obj.get_ref();
-  int ret = cls_timeindex_trim_repeat(dpp, ref, oid, utime_t(start_time), utime_t(end_time),
-          from_marker, to_marker);
-  if ((ret < 0 ) && (ret != -ENOENT)) {
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWObjectExpirer::garbage_single_object(const DoutPrefixProvider *dpp, objexp_hint_entry& hint)
-{
-  RGWBucketInfo bucket_info;
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-
-  int ret = driver->get_bucket(dpp, nullptr, rgw_bucket(hint.tenant, hint.bucket_name, hint.bucket_id), &bucket, null_yield);
-  if (-ENOENT == ret) {
-    ldpp_dout(dpp, 15) << "NOTICE: cannot find bucket = " \
-        << hint.bucket_name << ". The object must be already removed" << dendl;
-    return -ERR_PRECONDITION_FAILED;
-  } else if (ret < 0) {
-    ldpp_dout(dpp, 1) << "ERROR: could not init bucket = " \
-        << hint.bucket_name << "due to ret = " << ret << dendl;
-    return ret;
-  }
-
-  rgw_obj_key key = hint.obj_key;
-  if (key.instance.empty()) {
-    key.instance = "null";
-  }
-
-  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
-  obj->set_atomic();
-  ret = obj->delete_object(dpp, null_yield);
-
-  return ret;
-}
-
-void RGWObjectExpirer::garbage_chunk(const DoutPrefixProvider *dpp, 
-                                  list<cls_timeindex_entry>& entries,      /* in  */
-                                  bool& need_trim)                         /* out */
-{
-  need_trim = false;
-
-  for (list<cls_timeindex_entry>::iterator iter = entries.begin();
-       iter != entries.end();
-       ++iter)
-  {
-    objexp_hint_entry hint;
-    ldpp_dout(dpp, 15) << "got removal hint for: " << iter->key_ts.sec() \
-        << " - " << iter->key_ext << dendl;
-
-    int ret = objexp_hint_parse(dpp, driver->ctx(), *iter, &hint);
-    if (ret < 0) {
-      ldpp_dout(dpp, 1) << "cannot parse removal hint for " << hint.obj_key << dendl;
-      continue;
-    }
-
-    /* PRECOND_FAILED simply means that our hint is not valid.
-     * We can silently ignore that and move forward. */
-    ret = garbage_single_object(dpp, hint);
-    if (ret == -ERR_PRECONDITION_FAILED) {
-      ldpp_dout(dpp, 15) << "not actual hint for object: " << hint.obj_key << dendl;
-    } else if (ret < 0) {
-      ldpp_dout(dpp, 1) << "cannot remove expired object: " << hint.obj_key << dendl;
-    }
-
-    need_trim = true;
-  }
-
-  return;
-}
-
-void RGWObjectExpirer::trim_chunk(const DoutPrefixProvider *dpp, 
-                                  const string& shard,
-                                  const utime_t& from,
-                                  const utime_t& to,
-                                  const string& from_marker,
-                                  const string& to_marker)
-{
-  ldpp_dout(dpp, 20) << "trying to trim removal hints to=" << to
-                          << ", to_marker=" << to_marker << dendl;
-
-  real_time rt_from = from.to_real_time();
-  real_time rt_to = to.to_real_time();
-
-  int ret = exp_store.objexp_hint_trim(dpp, shard, rt_from, rt_to,
-                                       from_marker, to_marker);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR during trim: " << ret << dendl;
-  }
-
-  return;
-}
-
-bool RGWObjectExpirer::process_single_shard(const DoutPrefixProvider *dpp, 
-                                            const string& shard,
-                                            const utime_t& last_run,
-                                            const utime_t& round_start)
-{
-  string marker;
-  string out_marker;
-  bool truncated = false;
-  bool done = true;
-
-  CephContext *cct = driver->ctx();
-  int num_entries = cct->_conf->rgw_objexp_chunk_size;
-
-  int max_secs = cct->_conf->rgw_objexp_gc_interval;
-  utime_t end = ceph_clock_now();
-  end += max_secs;
-
-  rados::cls::lock::Lock l(objexp_lock_name);
-
-  utime_t time(max_secs, 0);
-  l.set_duration(time);
-
-  int ret = l.lock_exclusive(&static_cast<rgw::sal::RadosStore*>(driver)->getRados()->objexp_pool_ctx, shard);
-  if (ret == -EBUSY) { /* already locked by another processor */
-    ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " << shard << dendl;
-    return false;
-  }
-
-  do {
-    real_time rt_last = last_run.to_real_time();
-    real_time rt_start = round_start.to_real_time();
-
-    list<cls_timeindex_entry> entries;
-    ret = exp_store.objexp_hint_list(dpp, shard, rt_last, rt_start,
-                                     num_entries, marker, entries,
-                                     &out_marker, &truncated);
-    if (ret < 0) {
-      ldpp_dout(dpp, 10) << "cannot get removal hints from shard: " << shard
-                     << dendl;
-      continue;
-    }
-
-    bool need_trim;
-    garbage_chunk(dpp, entries, need_trim);
-
-    if (need_trim) {
-      trim_chunk(dpp, shard, last_run, round_start, marker, out_marker);
-    }
-
-    utime_t now = ceph_clock_now();
-    if (now >= end) {
-      done = false;
-      break;
-    }
-
-    marker = out_marker;
-  } while (truncated);
-
-  l.unlock(&static_cast<rgw::sal::RadosStore*>(driver)->getRados()->objexp_pool_ctx, shard);
-  return done;
-}
-
-/* Returns true if all shards have been processed successfully. */
-bool RGWObjectExpirer::inspect_all_shards(const DoutPrefixProvider *dpp, 
-                                          const utime_t& last_run,
-                                          const utime_t& round_start)
-{
-  CephContext * const cct = driver->ctx();
-  int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
-  bool all_done = true;
-
-  for (int i = 0; i < num_shards; i++) {
-    string shard;
-    objexp_get_shard(i, &shard);
-
-    ldpp_dout(dpp, 20) << "processing shard = " << shard << dendl;
-
-    if (! process_single_shard(dpp, shard, last_run, round_start)) {
-      all_done = false;
-    }
-  }
-
-  return all_done;
-}
-
-bool RGWObjectExpirer::going_down()
-{
-  return down_flag;
-}
-
-void RGWObjectExpirer::start_processor()
-{
-  worker = new OEWorker(driver->ctx(), this);
-  worker->create("rgw_obj_expirer");
-}
-
-void RGWObjectExpirer::stop_processor()
-{
-  down_flag = true;
-  if (worker) {
-    worker->stop();
-    worker->join();
-  }
-  delete worker;
-  worker = NULL;
-}
-
-void *RGWObjectExpirer::OEWorker::entry() {
-  utime_t last_run;
-  do {
-    utime_t start = ceph_clock_now();
-    ldpp_dout(this, 2) << "object expiration: start" << dendl;
-    if (oe->inspect_all_shards(this, last_run, start)) {
-      /* All shards have been processed properly. Next time we can start
-       * from this moment. */
-      last_run = start;
-    }
-    ldpp_dout(this, 2) << "object expiration: stop" << dendl;
-
-
-    if (oe->going_down())
-      break;
-
-    utime_t end = ceph_clock_now();
-    end -= start;
-    int secs = cct->_conf->rgw_objexp_gc_interval;
-
-    if (secs <= end.sec())
-      continue; // next round
-
-    secs -= end.sec();
-
-    std::unique_lock l{lock};
-    cond.wait_for(l, std::chrono::seconds(secs));
-  } while (!oe->going_down());
-
-  return NULL;
-}
-
-void RGWObjectExpirer::OEWorker::stop()
-{
-  std::lock_guard l{lock};
-  cond.notify_all();
-}
-
-CephContext *RGWObjectExpirer::OEWorker::get_cct() const 
-{ 
-  return cct; 
-}
-
-unsigned RGWObjectExpirer::OEWorker::get_subsys() const 
-{
-    return dout_subsys;
-}
-
-std::ostream& RGWObjectExpirer::OEWorker::gen_prefix(std::ostream& out) const 
-{ 
-  return out << "rgw object expirer Worker thread: "; 
-}
diff --git a/src/rgw/store/rados/rgw_object_expirer_core.h b/src/rgw/store/rados/rgw_object_expirer_core.h
deleted file mode 100644 (file)
index fccd419..0000000
+++ /dev/null
@@ -1,148 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_OBJEXP_H
-#define CEPH_OBJEXP_H
-
-#include <atomic>
-#include <string>
-#include <cerrno>
-#include <sstream>
-#include <iostream>
-
-#include "auth/Crypto.h"
-
-#include "common/armor.h"
-#include "common/ceph_json.h"
-#include "common/config.h"
-#include "common/ceph_argparse.h"
-#include "common/Formatter.h"
-#include "common/errno.h"
-
-#include "common/ceph_mutex.h"
-#include "common/Cond.h"
-#include "common/Thread.h"
-
-#include "global/global_init.h"
-
-#include "include/common_fwd.h"
-#include "include/utime.h"
-#include "include/str_list.h"
-
-#include "rgw_sal_rados.h"
-
-class RGWSI_RADOS;
-class RGWSI_Zone;
-class RGWBucketInfo;
-class cls_timeindex_entry;
-
-class RGWObjExpStore {
-  CephContext *cct;
-  RGWSI_RADOS *rados_svc;
-  rgw::sal::RadosStore* driver;
-public:
-  RGWObjExpStore(CephContext *_cct, RGWSI_RADOS *_rados_svc, rgw::sal::RadosStore* _driver) : cct(_cct),
-                                                                                      rados_svc(_rados_svc),
-                                                                                      driver(_driver) {}
-
-  int objexp_hint_add(const DoutPrefixProvider *dpp, 
-                      const ceph::real_time& delete_at,
-                      const std::string& tenant_name,
-                      const std::string& bucket_name,
-                      const std::string& bucket_id,
-                      const rgw_obj_index_key& obj_key);
-
-  int objexp_hint_list(const DoutPrefixProvider *dpp, 
-                       const std::string& oid,
-                       const ceph::real_time& start_time,
-                       const ceph::real_time& end_time,
-                       const int max_entries,
-                       const std::string& marker,
-                       std::list<cls_timeindex_entry>& entries, /* out */
-                       std::string *out_marker,                 /* out */
-                       bool *truncated);                   /* out */
-
-  int objexp_hint_trim(const DoutPrefixProvider *dpp, 
-                       const std::string& oid,
-                       const ceph::real_time& start_time,
-                       const ceph::real_time& end_time,
-                       const std::string& from_marker,
-                       const std::string& to_marker);
-};
-
-class RGWObjectExpirer {
-protected:
-  rgw::sal::Driver* driver;
-  RGWObjExpStore exp_store;
-
-  class OEWorker : public Thread, public DoutPrefixProvider {
-    CephContext *cct;
-    RGWObjectExpirer *oe;
-    ceph::mutex lock = ceph::make_mutex("OEWorker");
-    ceph::condition_variable cond;
-
-  public:
-    OEWorker(CephContext * const cct,
-             RGWObjectExpirer * const oe)
-      : cct(cct),
-        oe(oe) {
-    }
-
-    void *entry() override;
-    void stop();
-
-    CephContext *get_cct() const override;
-    unsigned get_subsys() const override;
-    std::ostream& gen_prefix(std::ostream& out) const override;
-  };
-
-  OEWorker *worker{nullptr};
-  std::atomic<bool> down_flag = { false };
-
-public:
-  explicit RGWObjectExpirer(rgw::sal::Driver* _driver)
-    : driver(_driver),
-      exp_store(_driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados, static_cast<rgw::sal::RadosStore*>(driver)),
-      worker(NULL) {
-  }
-  ~RGWObjectExpirer() {
-    stop_processor();
-  }
-
-  int hint_add(const DoutPrefixProvider *dpp, 
-               const ceph::real_time& delete_at,
-               const std::string& tenant_name,
-               const std::string& bucket_name,
-               const std::string& bucket_id,
-               const rgw_obj_index_key& obj_key) {
-    return exp_store.objexp_hint_add(dpp, delete_at, tenant_name, bucket_name,
-                                     bucket_id, obj_key);
-  }
-
-  int garbage_single_object(const DoutPrefixProvider *dpp, objexp_hint_entry& hint);
-
-  void garbage_chunk(const DoutPrefixProvider *dpp, 
-                     std::list<cls_timeindex_entry>& entries, /* in  */
-                     bool& need_trim);                        /* out */
-
-  void trim_chunk(const DoutPrefixProvider *dpp, 
-                  const std::string& shard,
-                  const utime_t& from,
-                  const utime_t& to,
-                  const std::string& from_marker,
-                  const std::string& to_marker);
-
-  bool process_single_shard(const DoutPrefixProvider *dpp, 
-                            const std::string& shard,
-                            const utime_t& last_run,
-                            const utime_t& round_start);
-
-  bool inspect_all_shards(const DoutPrefixProvider *dpp, 
-                          const utime_t& last_run,
-                          const utime_t& round_start);
-
-  bool going_down();
-  void start_processor();
-  void stop_processor();
-};
-#endif /* CEPH_OBJEXP_H */
diff --git a/src/rgw/store/rados/rgw_otp.cc b/src/rgw/store/rados/rgw_otp.cc
deleted file mode 100644 (file)
index 07cc14f..0000000
+++ /dev/null
@@ -1,211 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include <errno.h>
-
-#include <string>
-#include <map>
-#include <boost/algorithm/string.hpp>
-
-#include "common/errno.h"
-#include "common/Formatter.h"
-#include "common/ceph_json.h"
-#include "rgw_otp.h"
-#include "rgw_zone.h"
-#include "rgw_metadata.h"
-
-#include "include/types.h"
-
-#include "rgw_common.h"
-#include "rgw_tools.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_meta.h"
-#include "services/svc_meta_be.h"
-#include "services/svc_meta_be_otp.h"
-#include "services/svc_otp.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-
-class RGWOTPMetadataHandler;
-
-class RGWOTPMetadataObject : public RGWMetadataObject {
-  friend class RGWOTPMetadataHandler;
-
-  otp_devices_list_t devices;
-public:
-  RGWOTPMetadataObject() {}
-  RGWOTPMetadataObject(otp_devices_list_t&& _devices, const obj_version& v, const real_time m) {
-    devices = std::move(_devices);
-    objv = v;
-    mtime = m;
-  }
-
-  void dump(Formatter *f) const override {
-    encode_json("devices", devices, f);
-  }
-
-  otp_devices_list_t& get_devs() {
-    return devices;
-  }
-};
-
-
-class RGWOTPMetadataHandler : public RGWOTPMetadataHandlerBase {
-  friend class RGWOTPCtl;
-
-  struct Svc {
-    RGWSI_Zone *zone;
-    RGWSI_MetaBackend *meta_be;
-    RGWSI_OTP *otp;
-  } svc;
-
-  int init(RGWSI_Zone *zone,
-           RGWSI_MetaBackend *_meta_be,
-           RGWSI_OTP *_otp) {
-    base_init(zone->ctx(), _otp->get_be_handler().get());
-    svc.zone = zone;
-    svc.meta_be = _meta_be;
-    svc.otp = _otp;
-    return 0;
-  }
-
-  int call(std::function<int(RGWSI_OTP_BE_Ctx& ctx)> f) {
-    return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
-      RGWSI_OTP_BE_Ctx ctx(op->ctx());
-      return f(ctx);
-    });
-  }
-
-  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
-    otp_devices_list_t devices;
-    try {
-      JSONDecoder::decode_json("devices", devices, jo);
-    } catch (JSONDecoder::err& e) {
-      return nullptr;
-    }
-
-    return new RGWOTPMetadataObject(std::move(devices), objv, mtime);
-  }
-
-  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
-    RGWObjVersionTracker objv_tracker;
-
-    std::unique_ptr<RGWOTPMetadataObject> mdo(new RGWOTPMetadataObject);
-
-    
-    RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
-
-    int ret = svc.otp->read_all(be_ctx,
-                                entry,
-                                &mdo->get_devs(),
-                                &mdo->get_mtime(),
-                                &objv_tracker,
-                                y,
-                                dpp);
-    if (ret < 0) {
-      return ret;
-    }
-
-    mdo->objv = objv_tracker.read_version;
-
-    *obj = mdo.release();
-
-    return 0;
-  }
-
-  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
-             RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
-             optional_yield y,
-             const DoutPrefixProvider *dpp,
-             RGWMDLogSyncType type, bool from_remote_zone) override {
-    RGWOTPMetadataObject *obj = static_cast<RGWOTPMetadataObject *>(_obj);
-
-    RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
-
-    int ret = svc.otp->store_all(dpp, be_ctx,
-                                 entry,
-                                 obj->devices,
-                                 obj->mtime,
-                                 &objv_tracker,
-                                 y);
-    if (ret < 0) {
-      return ret;
-    }
-
-    return STATUS_APPLIED;
-  }
-
-  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
-                optional_yield y, const DoutPrefixProvider *dpp) override {
-    RGWSI_MBOTP_RemoveParams params;
-
-    RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
-
-    return svc.otp->remove_all(dpp, be_ctx,
-                               entry,
-                               &objv_tracker,
-                               y);
-  }
-
-public:
-  RGWOTPMetadataHandler() {}
-
-  string get_type() override { return "otp"; }
-};
-
-
-RGWOTPCtl::RGWOTPCtl(RGWSI_Zone *zone_svc,
-                    RGWSI_OTP *otp_svc)
-{
-  svc.zone = zone_svc;
-  svc.otp = otp_svc;
-}
-
-
-void RGWOTPCtl::init(RGWOTPMetadataHandler *_meta_handler)
-{
-  meta_handler = _meta_handler;
-  be_handler = meta_handler->get_be_handler();
-}
-
-int RGWOTPCtl::read_all(const rgw_user& uid,
-                        RGWOTPInfo *info,
-                        optional_yield y,
-                        const DoutPrefixProvider *dpp,
-                        const GetParams& params)
-{
-  info->uid = uid;
-  return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
-    return svc.otp->read_all(ctx, uid, &info->devices, params.mtime, params.objv_tracker, y, dpp);
-  });
-}
-
-int RGWOTPCtl::store_all(const DoutPrefixProvider *dpp, 
-                         const RGWOTPInfo& info,
-                         optional_yield y,
-                         const PutParams& params)
-{
-  return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
-    return svc.otp->store_all(dpp, ctx, info.uid, info.devices, params.mtime, params.objv_tracker, y);
-  });
-}
-
-int RGWOTPCtl::remove_all(const DoutPrefixProvider *dpp,
-                          const rgw_user& uid,
-                          optional_yield y,
-                          const RemoveParams& params)
-{
-  return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
-    return svc.otp->remove_all(dpp, ctx, uid, params.objv_tracker, y);
-  });
-}
-
-
-RGWMetadataHandler *RGWOTPMetaHandlerAllocator::alloc()
-{
-  return new RGWOTPMetadataHandler();
-}
diff --git a/src/rgw/store/rados/rgw_otp.h b/src/rgw/store/rados/rgw_otp.h
deleted file mode 100644 (file)
index eacff15..0000000
+++ /dev/null
@@ -1,114 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_OTP_H
-#define CEPH_RGW_OTP_H
-
-#include "rgw_sal_fwd.h"
-#include "cls/otp/cls_otp_types.h"
-#include "services/svc_meta_be_otp.h"
-
-#include "rgw_basic_types.h"
-#include "rgw_metadata.h"
-
-
-class RGWObjVersionTracker;
-class RGWMetadataHandler;
-class RGWOTPMetadataHandler;
-class RGWSI_Zone;
-class RGWSI_OTP;
-class RGWSI_MetaBackend;
-
-class RGWOTPMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
-public:
-  virtual ~RGWOTPMetadataHandlerBase() {}
-  virtual int init(RGWSI_Zone *zone,
-                  RGWSI_MetaBackend *_meta_be,
-                  RGWSI_OTP *_otp) = 0;
-};
-
-class RGWOTPMetaHandlerAllocator {
-public:
-  static RGWMetadataHandler *alloc();
-};
-
-struct RGWOTPInfo {
-  rgw_user uid;
-  otp_devices_list_t devices;
-};
-
-
-class RGWOTPCtl
-{
-  struct Svc {
-    RGWSI_Zone *zone{nullptr};
-    RGWSI_OTP *otp{nullptr};
-  } svc;
-
-  RGWOTPMetadataHandler *meta_handler;
-  RGWSI_MetaBackend_Handler *be_handler;
-  
-public:
-  RGWOTPCtl(RGWSI_Zone *zone_svc,
-           RGWSI_OTP *otp_svc);
-
-  void init(RGWOTPMetadataHandler *_meta_handler);
-
-  struct GetParams {
-    RGWObjVersionTracker *objv_tracker{nullptr};
-    ceph::real_time *mtime{nullptr};
-
-    GetParams() {}
-
-    GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-      objv_tracker = _objv_tracker;
-      return *this;
-    }
-
-    GetParams& set_mtime(ceph::real_time *_mtime) {
-      mtime = _mtime;
-      return *this;
-    }
-  };
-
-  struct PutParams {
-    RGWObjVersionTracker *objv_tracker{nullptr};
-    ceph::real_time mtime;
-
-    PutParams() {}
-
-    PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-      objv_tracker = _objv_tracker;
-      return *this;
-    }
-
-    PutParams& set_mtime(const ceph::real_time& _mtime) {
-      mtime = _mtime;
-      return *this;
-    }
-  };
-
-  struct RemoveParams {
-    RGWObjVersionTracker *objv_tracker{nullptr};
-
-    RemoveParams() {}
-
-    RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-      objv_tracker = _objv_tracker;
-      return *this;
-    }
-  };
-
-  int read_all(const rgw_user& uid, RGWOTPInfo *info, optional_yield y,
-               const DoutPrefixProvider *dpp,
-               const GetParams& params = {});
-  int store_all(const DoutPrefixProvider *dpp, 
-                const RGWOTPInfo& info, optional_yield y,
-                const PutParams& params = {});
-  int remove_all(const DoutPrefixProvider *dpp, 
-                 const rgw_user& user, optional_yield y,
-                 const RemoveParams& params = {});
-};
-
-#endif
-
diff --git a/src/rgw/store/rados/rgw_period.cc b/src/rgw/store/rados/rgw_period.cc
deleted file mode 100644 (file)
index 61602b3..0000000
+++ /dev/null
@@ -1,324 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_sync.h"
-
-#include "services/svc_zone.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-using namespace rgw_zone_defaults;
-
-int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup,
-                             const string& zonegroup_id) const
-{
-  map<string, RGWZoneGroup>::const_iterator iter;
-  if (!zonegroup_id.empty()) {
-    iter = period_map.zonegroups.find(zonegroup_id);
-  } else {
-    iter = period_map.zonegroups.find("default");
-  }
-  if (iter != period_map.zonegroups.end()) {
-    zonegroup = iter->second;
-    return 0;
-  }
-
-  return -ENOENT;
-}
-
-int RGWPeriod::get_latest_epoch(const DoutPrefixProvider *dpp, epoch_t& latest_epoch, optional_yield y)
-{
-  RGWPeriodLatestEpochInfo info;
-
-  int ret = read_latest_epoch(dpp, info, y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  latest_epoch = info.epoch;
-
-  return 0;
-}
-
-int RGWPeriod::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  rgw_pool pool(get_pool(cct));
-
-  // delete the object for each period epoch
-  for (epoch_t e = 1; e <= epoch; e++) {
-    RGWPeriod p{get_id(), e};
-    rgw_raw_obj oid{pool, p.get_period_oid()};
-    auto sysobj = sysobj_svc->get_obj(oid);
-    int ret = sysobj.wop().remove(dpp, y);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "WARNING: failed to delete period object " << oid
-          << ": " << cpp_strerror(-ret) << dendl;
-    }
-  }
-
-  // delete the .latest_epoch object
-  rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
-  auto sysobj = sysobj_svc->get_obj(oid);
-  int ret = sysobj.wop().remove(dpp, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "WARNING: failed to delete period object " << oid
-        << ": " << cpp_strerror(-ret) << dendl;
-  }
-  return ret;
-}
-
-int RGWPeriod::add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y)
-{
-  if (zonegroup.realm_id != realm_id) {
-    return 0;
-  }
-  int ret = period_map.update(zonegroup, cct);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  return store_info(dpp, false, y);
-}
-
-int RGWPeriod::update(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  auto zone_svc = sysobj_svc->get_zone_svc();
-  ldpp_dout(dpp, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
-  list<string> zonegroups;
-  int ret = zone_svc->list_zonegroups(dpp, zonegroups);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  // clear zone short ids of removed zones. period_map.update() will add the
-  // remaining zones back
-  period_map.short_zone_ids.clear();
-
-  for (auto& iter : zonegroups) {
-    RGWZoneGroup zg(string(), iter);
-    ret = zg.init(dpp, cct, sysobj_svc, y);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
-      continue;
-    }
-
-    if (zg.realm_id != realm_id) {
-      ldpp_dout(dpp, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
-      continue;
-    }
-
-    if (zg.master_zone.empty()) {
-      ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
-      return -EINVAL;
-    }
-
-    if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
-      ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name()
-                   << " has a non existent master zone "<< dendl;
-      return -EINVAL;
-    }
-
-    if (zg.is_master_zonegroup()) {
-      master_zonegroup = zg.get_id();
-      master_zone = zg.master_zone;
-    }
-
-    int ret = period_map.update(zg, cct);
-    if (ret < 0) {
-      return ret;
-    }
-  }
-
-  ret = period_config.read(dpp, sysobj_svc, realm_id, y);
-  if (ret < 0 && ret != -ENOENT) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to read period config: "
-        << cpp_strerror(ret) << dendl;
-    return ret;
-  }
-  return 0;
-}
-
-void RGWPeriod::fork()
-{
-  ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
-  predecessor_uuid = id;
-  id = get_staging_id(realm_id);
-  period_map.reset();
-  realm_epoch++;
-}
-
-static int read_sync_status(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw_meta_sync_status *sync_status)
-{
-  rgw::sal::RadosStore* rados_store = static_cast<rgw::sal::RadosStore*>(driver);
-  // initialize a sync status manager to read the status
-  RGWMetaSyncStatusManager mgr(rados_store, rados_store->svc()->rados->get_async_processor());
-  int r = mgr.init(dpp);
-  if (r < 0) {
-    return r;
-  }
-  r = mgr.read_sync_status(dpp, sync_status);
-  mgr.stop();
-  return r;
-}
-
-int RGWPeriod::update_sync_status(const DoutPrefixProvider *dpp,
-                                  rgw::sal::Driver* driver, /* for now */
-                                 const RGWPeriod &current_period,
-                                  std::ostream& error_stream,
-                                  bool force_if_stale)
-{
-  rgw_meta_sync_status status;
-  int r = read_sync_status(dpp, driver, &status);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "period failed to read sync status: "
-        << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  std::vector<std::string> markers;
-
-  const auto current_epoch = current_period.get_realm_epoch();
-  if (current_epoch != status.sync_info.realm_epoch) {
-    // no sync status markers for the current period
-    ceph_assert(current_epoch > status.sync_info.realm_epoch);
-    const int behind = current_epoch - status.sync_info.realm_epoch;
-    if (!force_if_stale && current_epoch > 1) {
-      error_stream << "ERROR: This zone is " << behind << " period(s) behind "
-          "the current master zone in metadata sync. If this zone is promoted "
-          "to master, any metadata changes during that time are likely to "
-          "be lost.\n"
-          "Waiting for this zone to catch up on metadata sync (see "
-          "'radosgw-admin sync status') is recommended.\n"
-          "To promote this zone to master anyway, add the flag "
-          "--yes-i-really-mean-it." << std::endl;
-      return -EINVAL;
-    }
-    // empty sync status markers - other zones will skip this period during
-    // incremental metadata sync
-    markers.resize(status.sync_info.num_shards);
-  } else {
-    markers.reserve(status.sync_info.num_shards);
-    for (auto& i : status.sync_markers) {
-      auto& marker = i.second;
-      // filter out markers from other periods
-      if (marker.realm_epoch != current_epoch) {
-        marker.marker.clear();
-      }
-      markers.emplace_back(std::move(marker.marker));
-    }
-  }
-
-  std::swap(sync_status, markers);
-  return 0;
-}
-
-int RGWPeriod::commit(const DoutPrefixProvider *dpp,
-                     rgw::sal::Driver* driver,
-                     RGWRealm& realm, const RGWPeriod& current_period,
-                      std::ostream& error_stream, optional_yield y,
-                     bool force_if_stale)
-{
-  auto zone_svc = sysobj_svc->get_zone_svc();
-  ldpp_dout(dpp, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
-  // gateway must be in the master zone to commit
-  if (master_zone != zone_svc->get_zone_params().get_id()) {
-    error_stream << "Cannot commit period on zone "
-        << zone_svc->get_zone_params().get_id() << ", it must be sent to "
-        "the period's master zone " << master_zone << '.' << std::endl;
-    return -EINVAL;
-  }
-  // period predecessor must match current period
-  if (predecessor_uuid != current_period.get_id()) {
-    error_stream << "Period predecessor " << predecessor_uuid
-        << " does not match current period " << current_period.get_id()
-        << ". Use 'period pull' to get the latest period from the master, "
-        "reapply your changes, and try again." << std::endl;
-    return -EINVAL;
-  }
-  // realm epoch must be 1 greater than current period
-  if (realm_epoch != current_period.get_realm_epoch() + 1) {
-    error_stream << "Period's realm epoch " << realm_epoch
-        << " does not come directly after current realm epoch "
-        << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
-        "latest realm and period from the master zone, reapply your changes, "
-        "and try again." << std::endl;
-    return -EINVAL;
-  }
-  // did the master zone change?
-  if (master_zone != current_period.get_master_zone()) {
-    // store the current metadata sync status in the period
-    int r = update_sync_status(dpp, driver, current_period, error_stream, force_if_stale);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "failed to update metadata sync status: "
-          << cpp_strerror(-r) << dendl;
-      return r;
-    }
-    // create an object with a new period id
-    r = create(dpp, y, true);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
-      return r;
-    }
-    // set as current period
-    r = realm.set_current_period(dpp, *this, y);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "failed to update realm's current period: "
-          << cpp_strerror(-r) << dendl;
-      return r;
-    }
-    ldpp_dout(dpp, 4) << "Promoted to master zone and committed new period "
-        << id << dendl;
-    realm.notify_new_period(dpp, *this, y);
-    return 0;
-  }
-  // period must be based on current epoch
-  if (epoch != current_period.get_epoch()) {
-    error_stream << "Period epoch " << epoch << " does not match "
-        "predecessor epoch " << current_period.get_epoch()
-        << ". Use 'period pull' to get the latest epoch from the master zone, "
-        "reapply your changes, and try again." << std::endl;
-    return -EINVAL;
-  }
-  // set period as next epoch
-  set_id(current_period.get_id());
-  set_epoch(current_period.get_epoch() + 1);
-  set_predecessor(current_period.get_predecessor());
-  realm_epoch = current_period.get_realm_epoch();
-  // write the period to rados
-  int r = store_info(dpp, false, y);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
-    return r;
-  }
-  // set as latest epoch
-  r = update_latest_epoch(dpp, epoch, y);
-  if (r == -EEXIST) {
-    // already have this epoch (or a more recent one)
-    return 0;
-  }
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
-    return r;
-  }
-  r = reflect(dpp, y);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
-    return r;
-  }
-  ldpp_dout(dpp, 4) << "Committed new epoch " << epoch
-      << " for period " << id << dendl;
-  realm.notify_new_period(dpp, *this, y);
-  return 0;
-}
-
-void RGWPeriod::generate_test_instances(list<RGWPeriod*> &o)
-{
-  RGWPeriod *z = new RGWPeriod;
-  o.push_back(z);
-  o.push_back(new RGWPeriod);
-}
-
-
diff --git a/src/rgw/store/rados/rgw_rest_pubsub.cc b/src/rgw/store/rados/rgw_rest_pubsub.cc
deleted file mode 100644 (file)
index 23d5661..0000000
+++ /dev/null
@@ -1,1069 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include <algorithm>
-#include <boost/tokenizer.hpp>
-#include <optional>
-#include "rgw_rest_pubsub.h"
-#include "rgw_pubsub_push.h"
-#include "rgw_pubsub.h"
-#include "rgw_op.h"
-#include "rgw_rest.h"
-#include "rgw_rest_s3.h"
-#include "rgw_arn.h"
-#include "rgw_auth_s3.h"
-#include "rgw_notify.h"
-#include "rgw_sal_rados.h"
-#include "services/svc_zone.h"
-#include "common/dout.h"
-#include "rgw_url.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-static const char* AWS_SNS_NS("https://sns.amazonaws.com/doc/2010-03-31/");
-
-bool verify_transport_security(CephContext *cct, const RGWEnv& env) {
-  const auto is_secure = rgw_transport_is_secure(cct, env);
-  if (!is_secure && g_conf().get_val<bool>("rgw_allow_notification_secrets_in_cleartext")) {
-    ldout(cct, 0) << "WARNING: bypassing endpoint validation, allows sending secrets over insecure transport" << dendl;
-    return true;
-  }
-  return is_secure;
-}
-
-// make sure that endpoint is a valid URL
-// make sure that if user/password are passed inside URL, it is over secure connection
-// update rgw_pubsub_sub_dest to indicate that a password is stored in the URL
-bool validate_and_update_endpoint_secret(rgw_pubsub_sub_dest& dest, CephContext *cct, const RGWEnv& env) {
-  if (dest.push_endpoint.empty()) {
-      return true;
-  }
-  std::string user;
-  std::string password;
-  if (!rgw::parse_url_userinfo(dest.push_endpoint, user, password)) {
-    ldout(cct, 1) << "endpoint validation error: malformed endpoint URL:" << dest.push_endpoint << dendl;
-    return false;
-  }
-  // this should be verified inside parse_url()
-  ceph_assert(user.empty() == password.empty());
-  if (!user.empty()) {
-      dest.stored_secret = true;
-      if (!verify_transport_security(cct, env)) {
-        ldout(cct, 1) << "endpoint validation error: sending secrets over insecure transport" << dendl;
-        return false;
-      }
-  }
-  return true;
-}
-
-bool topic_has_endpoint_secret(const rgw_pubsub_topic_subs& topic) {
-    return topic.topic.dest.stored_secret;
-}
-
-bool topics_has_endpoint_secret(const rgw_pubsub_topics& topics) {
-    for (const auto& topic : topics.topics) {
-        if (topic_has_endpoint_secret(topic.second)) return true;
-    }
-    return false;
-}
-
-// command (AWS compliant): 
-// POST
-// Action=CreateTopic&Name=<topic-name>[&OpaqueData=data][&push-endpoint=<endpoint>[&persistent][&<arg1>=<value1>]]
-class RGWPSCreateTopicOp : public RGWOp {
-  private:
-  std::optional<RGWPubSub> ps;
-  std::string topic_name;
-  rgw_pubsub_sub_dest dest;
-  std::string topic_arn;
-  std::string opaque_data;
-  
-  int get_params() {
-    topic_name = s->info.args.get("Name");
-    if (topic_name.empty()) {
-      ldpp_dout(this, 1) << "CreateTopic Action 'Name' argument is missing" << dendl;
-      return -EINVAL;
-    }
-
-    opaque_data = s->info.args.get("OpaqueData");
-
-    dest.push_endpoint = s->info.args.get("push-endpoint");
-    s->info.args.get_bool("persistent", &dest.persistent, false);
-
-    if (!validate_and_update_endpoint_secret(dest, s->cct, *(s->info.env))) {
-      return -EINVAL;
-    }
-    for (const auto& param : s->info.args.get_params()) {
-      if (param.first == "Action" || param.first == "Name" || param.first == "PayloadHash") {
-        continue;
-      }
-      dest.push_endpoint_args.append(param.first+"="+param.second+"&");
-    }
-
-    if (!dest.push_endpoint_args.empty()) {
-      // remove last separator
-      dest.push_endpoint_args.pop_back();
-    }
-    if (!dest.push_endpoint.empty() && dest.persistent) {
-      const auto ret = rgw::notify::add_persistent_topic(topic_name, s->yield);
-      if (ret < 0) {
-        ldpp_dout(this, 1) << "CreateTopic Action failed to create queue for persistent topics. error:" << ret << dendl;
-        return ret;
-      }
-    }
-    
-    // dest object only stores endpoint info
-    dest.arn_topic = topic_name;
-    // the topic ARN will be sent in the reply
-    const rgw::ARN arn(rgw::Partition::aws, rgw::Service::sns, 
-        driver->get_zone()->get_zonegroup().get_name(),
-        s->user->get_tenant(), topic_name);
-    topic_arn = arn.to_string();
-    return 0;
-  }
-
-  public:
-  int verify_permission(optional_yield) override {
-    return 0;
-  }
-
-  void pre_exec() override {
-    rgw_bucket_object_pre_exec(s);
-  }
-  void execute(optional_yield) override;
-
-  const char* name() const override { return "pubsub_topic_create"; }
-  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_CREATE; }
-  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
-
-  void send_response() override {
-    if (op_ret) {
-      set_req_state_err(s, op_ret);
-    }
-    dump_errno(s);
-    end_header(s, this, "application/xml");
-
-    if (op_ret < 0) {
-      return;
-    }
-
-    const auto f = s->formatter;
-    f->open_object_section_in_ns("CreateTopicResponse", AWS_SNS_NS);
-    f->open_object_section("CreateTopicResult");
-    encode_xml("TopicArn", topic_arn, f); 
-    f->close_section(); // CreateTopicResult
-    f->open_object_section("ResponseMetadata");
-    encode_xml("RequestId", s->req_id, f); 
-    f->close_section(); // ResponseMetadata
-    f->close_section(); // CreateTopicResponse
-    rgw_flush_formatter_and_reset(s, f);
-  }
-};
-
-void RGWPSCreateTopicOp::execute(optional_yield y) {
-  op_ret = get_params();
-  if (op_ret < 0) {
-    return;
-  }
-
-  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
-  op_ret = ps->create_topic(this, topic_name, dest, topic_arn, opaque_data, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 1) << "failed to create topic '" << topic_name << "', ret=" << op_ret << dendl;
-    return;
-  }
-  ldpp_dout(this, 20) << "successfully created topic '" << topic_name << "'" << dendl;
-}
-
-// command (AWS compliant): 
-// POST 
-// Action=ListTopics
-class RGWPSListTopicsOp : public RGWOp {
-private:
-  std::optional<RGWPubSub> ps;
-  rgw_pubsub_topics result;
-
-public:
-  int verify_permission(optional_yield) override {
-    return 0;
-  }
-  void pre_exec() override {
-    rgw_bucket_object_pre_exec(s);
-  }
-  void execute(optional_yield) override;
-
-  const char* name() const override { return "pubsub_topics_list"; }
-  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPICS_LIST; }
-  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
-
-  void send_response() override {
-    if (op_ret) {
-      set_req_state_err(s, op_ret);
-    }
-    dump_errno(s);
-    end_header(s, this, "application/xml");
-
-    if (op_ret < 0) {
-      return;
-    }
-
-    const auto f = s->formatter;
-    f->open_object_section_in_ns("ListTopicsResponse", AWS_SNS_NS);
-    f->open_object_section("ListTopicsResult");
-    encode_xml("Topics", result, f); 
-    f->close_section(); // ListTopicsResult
-    f->open_object_section("ResponseMetadata");
-    encode_xml("RequestId", s->req_id, f); 
-    f->close_section(); // ResponseMetadat
-    f->close_section(); // ListTopicsResponse
-    rgw_flush_formatter_and_reset(s, f);
-  }
-};
-
-void RGWPSListTopicsOp::execute(optional_yield y) {
-  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
-  op_ret = ps->get_topics(&result);
-  // if there are no topics it is not considered an error
-  op_ret = op_ret == -ENOENT ? 0 : op_ret;
-  if (op_ret < 0) {
-    ldpp_dout(this, 1) << "failed to get topics, ret=" << op_ret << dendl;
-    return;
-  }
-  if (topics_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) {
-    ldpp_dout(this, 1) << "topics contain secrets and cannot be sent over insecure transport" << dendl;
-    op_ret = -EPERM;
-    return;
-  }
-  ldpp_dout(this, 20) << "successfully got topics" << dendl;
-}
-
-// command (extension to AWS): 
-// POST
-// Action=GetTopic&TopicArn=<topic-arn>
-class RGWPSGetTopicOp : public RGWOp {
-  private:
-  std::string topic_name;
-  std::optional<RGWPubSub> ps;
-  rgw_pubsub_topic_subs result;
-  
-  int get_params() {
-    const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn")));
-
-    if (!topic_arn || topic_arn->resource.empty()) {
-        ldpp_dout(this, 1) << "GetTopic Action 'TopicArn' argument is missing or invalid" << dendl;
-        return -EINVAL;
-    }
-
-    topic_name = topic_arn->resource;
-    return 0;
-  }
-
-  public:
-  int verify_permission(optional_yield y) override {
-    return 0;
-  }
-  void pre_exec() override {
-    rgw_bucket_object_pre_exec(s);
-  }
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "pubsub_topic_get"; }
-  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_GET; }
-  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
-
-  void send_response() override {
-    if (op_ret) {
-      set_req_state_err(s, op_ret);
-    }
-    dump_errno(s);
-    end_header(s, this, "application/xml");
-
-    if (op_ret < 0) {
-      return;
-    }
-
-    const auto f = s->formatter;
-    f->open_object_section("GetTopicResponse");
-    f->open_object_section("GetTopicResult");
-    encode_xml("Topic", result.topic, f); 
-    f->close_section();
-    f->open_object_section("ResponseMetadata");
-    encode_xml("RequestId", s->req_id, f); 
-    f->close_section();
-    f->close_section();
-    rgw_flush_formatter_and_reset(s, f);
-  }
-};
-
-void RGWPSGetTopicOp::execute(optional_yield y) {
-  op_ret = get_params();
-  if (op_ret < 0) {
-    return;
-  }
-  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
-  op_ret = ps->get_topic(topic_name, &result);
-  if (op_ret < 0) {
-    ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl;
-    return;
-  }
-  if (topic_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) {
-    ldpp_dout(this, 1) << "topic '" << topic_name << "' contain secret and cannot be sent over insecure transport" << dendl;
-    op_ret = -EPERM;
-    return;
-  }
-  ldpp_dout(this, 1) << "successfully got topic '" << topic_name << "'" << dendl;
-}
-
-// command (AWS compliant): 
-// POST
-// Action=GetTopicAttributes&TopicArn=<topic-arn>
-class RGWPSGetTopicAttributesOp : public RGWOp {
-  private:
-  std::string topic_name;
-  std::optional<RGWPubSub> ps;
-  rgw_pubsub_topic_subs result;
-  
-  int get_params() {
-    const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn")));
-
-    if (!topic_arn || topic_arn->resource.empty()) {
-        ldpp_dout(this, 1) << "GetTopicAttribute Action 'TopicArn' argument is missing or invalid" << dendl;
-        return -EINVAL;
-    }
-
-    topic_name = topic_arn->resource;
-    return 0;
-  }
-
-  public:
-  int verify_permission(optional_yield y) override {
-    return 0;
-  }
-  void pre_exec() override {
-    rgw_bucket_object_pre_exec(s);
-  }
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "pubsub_topic_get"; }
-  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_GET; }
-  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
-
-  void send_response() override {
-    if (op_ret) {
-      set_req_state_err(s, op_ret);
-    }
-    dump_errno(s);
-    end_header(s, this, "application/xml");
-
-    if (op_ret < 0) {
-      return;
-    }
-
-    const auto f = s->formatter;
-    f->open_object_section_in_ns("GetTopicAttributesResponse", AWS_SNS_NS);
-    f->open_object_section("GetTopicAttributesResult");
-    result.topic.dump_xml_as_attributes(f);
-    f->close_section(); // GetTopicAttributesResult
-    f->open_object_section("ResponseMetadata");
-    encode_xml("RequestId", s->req_id, f); 
-    f->close_section(); // ResponseMetadata
-    f->close_section(); // GetTopicAttributesResponse
-    rgw_flush_formatter_and_reset(s, f);
-  }
-};
-
-void RGWPSGetTopicAttributesOp::execute(optional_yield y) {
-  op_ret = get_params();
-  if (op_ret < 0) {
-    return;
-  }
-  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
-  op_ret = ps->get_topic(topic_name, &result);
-  if (op_ret < 0) {
-    ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl;
-    return;
-  }
-  if (topic_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) {
-    ldpp_dout(this, 1) << "topic '" << topic_name << "' contain secret and cannot be sent over insecure transport" << dendl;
-    op_ret = -EPERM;
-    return;
-  }
-  ldpp_dout(this, 1) << "successfully got topic '" << topic_name << "'" << dendl;
-}
-
-// command (AWS compliant): 
-// POST
-// Action=DeleteTopic&TopicArn=<topic-arn>
-class RGWPSDeleteTopicOp : public RGWOp {
-  private:
-  std::string topic_name;
-  std::optional<RGWPubSub> ps;
-  
-  int get_params() {
-    const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn")));
-
-    if (!topic_arn || topic_arn->resource.empty()) {
-      ldpp_dout(this, 1) << "DeleteTopic Action 'TopicArn' argument is missing or invalid" << dendl;
-      return -EINVAL;
-    }
-
-    topic_name = topic_arn->resource;
-
-    // upon deletion it is not known if topic is persistent or not
-    // will try to delete the persistent topic anyway
-    const auto ret = rgw::notify::remove_persistent_topic(topic_name, s->yield);
-    if (ret == -ENOENT) {
-      // topic was not persistent, or already deleted
-      return 0;
-    }
-    if (ret < 0) {
-      ldpp_dout(this, 1) << "DeleteTopic Action failed to remove queue for persistent topics. error:" << ret << dendl;
-      return ret;
-    }
-
-    return 0;
-  }
-
-  public:
-  int verify_permission(optional_yield) override {
-    return 0;
-  }
-  void pre_exec() override {
-    rgw_bucket_object_pre_exec(s);
-  }
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "pubsub_topic_delete"; }
-  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_DELETE; }
-  uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
-
-  void send_response() override {
-    if (op_ret) {
-      set_req_state_err(s, op_ret);
-    }
-    dump_errno(s);
-    end_header(s, this, "application/xml");
-
-    if (op_ret < 0) {
-      return;
-    }
-
-    const auto f = s->formatter;
-    f->open_object_section_in_ns("DeleteTopicResponse", AWS_SNS_NS);
-    f->open_object_section("ResponseMetadata");
-    encode_xml("RequestId", s->req_id, f); 
-    f->close_section(); // ResponseMetadata
-    f->close_section(); // DeleteTopicResponse
-    rgw_flush_formatter_and_reset(s, f);
-  }
-};
-
-void RGWPSDeleteTopicOp::execute(optional_yield y) {
-  op_ret = get_params();
-  if (op_ret < 0) {
-    return;
-  }
-  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
-  op_ret = ps->remove_topic(this, topic_name, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 1) << "failed to remove topic '" << topic_name << ", ret=" << op_ret << dendl;
-    return;
-  }
-  ldpp_dout(this, 1) << "successfully removed topic '" << topic_name << "'" << dendl;
-}
-
-namespace {
-// utility classes and functions for handling parameters with the following format:
-// Attributes.entry.{N}.{key|value}={VALUE}
-// N - any unsigned number
-// VALUE - url encoded string
-
-// and Attribute is holding key and value
-// ctor and set are done according to the "type" argument
-// if type is not "key" or "value" its a no-op
-class Attribute {
-  std::string key;
-  std::string value;
-public:
-  Attribute(const std::string& type, const std::string& key_or_value) {
-    set(type, key_or_value);
-  }
-  void set(const std::string& type, const std::string& key_or_value) {
-    if (type == "key") {
-      key = key_or_value;
-    } else if (type == "value") {
-      value = key_or_value;
-    }
-  }
-  const std::string& get_key() const { return key; }
-  const std::string& get_value() const { return value; }
-};
-
-using AttributeMap = std::map<unsigned, Attribute>;
-
-// aggregate the attributes into a map
-// the key and value are associated by the index (N)
-// no assumptions are made on the order in which these parameters are added
-void update_attribute_map(const std::string& input, AttributeMap& map) {
-  const boost::char_separator<char> sep(".");
-  const boost::tokenizer tokens(input, sep);
-  auto token = tokens.begin();
-  if (*token != "Attributes") {
-      return;
-  }
-  ++token;
-
-  if (*token != "entry") {
-      return;
-  }
-  ++token;
-
-  unsigned idx;
-  try {
-    idx = std::stoul(*token);
-  } catch (const std::invalid_argument&) {
-    return;
-  }
-  ++token;
-
-  std::string key_or_value = "";
-  // get the rest of the string regardless of dots
-  // this is to allow dots in the value
-  while (token != tokens.end()) {
-    key_or_value.append(*token+".");
-    ++token;
-  }
-  // remove last separator
-  key_or_value.pop_back();
-
-  auto pos = key_or_value.find("=");
-  if (pos != std::string::npos) {
-    const auto key_or_value_lhs = key_or_value.substr(0, pos);
-    const auto key_or_value_rhs = url_decode(key_or_value.substr(pos + 1, key_or_value.size() - 1));
-    const auto map_it = map.find(idx);
-    if (map_it == map.end()) {
-      // new entry
-      map.emplace(std::make_pair(idx, Attribute(key_or_value_lhs, key_or_value_rhs)));
-    } else {
-      // existing entry
-      map_it->second.set(key_or_value_lhs, key_or_value_rhs);
-    }
-  }
-}
-}
-
-void RGWHandler_REST_PSTopic_AWS::rgw_topic_parse_input() {
-  if (post_body.size() > 0) {
-    ldpp_dout(s, 10) << "Content of POST: " << post_body << dendl;
-
-    if (post_body.find("Action") != std::string::npos) {
-      const boost::char_separator<char> sep("&");
-      const boost::tokenizer<boost::char_separator<char>> tokens(post_body, sep);
-      AttributeMap map;
-      for (const auto& t : tokens) {
-        auto pos = t.find("=");
-        if (pos != std::string::npos) {
-          const auto key = t.substr(0, pos);
-          if (key == "Action") {
-            s->info.args.append(key, t.substr(pos + 1, t.size() - 1));
-          } else if (key == "Name" || key == "TopicArn") {
-            const auto value = url_decode(t.substr(pos + 1, t.size() - 1));
-            s->info.args.append(key, value);
-          } else {
-            update_attribute_map(t, map);
-          }
-        }
-      }
-      // update the regular args with the content of the attribute map
-      for (const auto& attr : map) {
-          s->info.args.append(attr.second.get_key(), attr.second.get_value());
-      }
-    }
-    const auto payload_hash = rgw::auth::s3::calc_v4_payload_hash(post_body);
-    s->info.args.append("PayloadHash", payload_hash);
-  }
-}
-
-RGWOp* RGWHandler_REST_PSTopic_AWS::op_post() {
-  rgw_topic_parse_input();
-
-  if (s->info.args.exists("Action")) {
-    const auto action = s->info.args.get("Action");
-    if (action.compare("CreateTopic") == 0)
-      return new RGWPSCreateTopicOp();
-    if (action.compare("DeleteTopic") == 0)
-      return new RGWPSDeleteTopicOp;
-    if (action.compare("ListTopics") == 0)
-      return new RGWPSListTopicsOp();
-    if (action.compare("GetTopic") == 0)
-      return new RGWPSGetTopicOp();
-    if (action.compare("GetTopicAttributes") == 0)
-      return new RGWPSGetTopicAttributesOp();
-  }
-
-  return nullptr;
-}
-
-int RGWHandler_REST_PSTopic_AWS::authorize(const DoutPrefixProvider* dpp, optional_yield y) {
-  return RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y);
-}
-
-namespace {
-// return a unique topic by prefexing with the notification name: <notification>_<topic>
-std::string topic_to_unique(const std::string& topic, const std::string& notification) {
-  return notification + "_" + topic;
-}
-
-// extract the topic from a unique topic of the form: <notification>_<topic>
-[[maybe_unused]] std::string unique_to_topic(const std::string& unique_topic, const std::string& notification) {
-  if (unique_topic.find(notification + "_") == std::string::npos) {
-    return "";
-  }
-  return unique_topic.substr(notification.length() + 1);
-}
-
-// from list of bucket topics, find the one that was auto-generated by a notification
-auto find_unique_topic(const rgw_pubsub_bucket_topics& bucket_topics, const std::string& notif_name) {
-    auto it = std::find_if(bucket_topics.topics.begin(), bucket_topics.topics.end(), [&](const auto& val) { return notif_name == val.second.s3_id; });
-    return it != bucket_topics.topics.end() ?
-        std::optional<std::reference_wrapper<const rgw_pubsub_topic_filter>>(it->second):
-        std::nullopt;
-}
-}
-
-int remove_notification_by_topic(const DoutPrefixProvider *dpp, const std::string& topic_name, const RGWPubSub::BucketRef& b, optional_yield y, RGWPubSub& ps) {
-  int op_ret = b->remove_notification(dpp, topic_name, y);
-  if (op_ret < 0) {
-    ldpp_dout(dpp, 1) << "failed to remove notification of topic '" << topic_name << "', ret=" << op_ret << dendl;
-  }
-  op_ret = ps.remove_topic(dpp, topic_name, y);
-  if (op_ret < 0) {
-    ldpp_dout(dpp, 1) << "failed to remove auto-generated topic '" << topic_name << "', ret=" << op_ret << dendl;
-  }
-  return op_ret;
-}
-
-int delete_all_notifications(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& bucket_topics, const RGWPubSub::BucketRef& b, optional_yield y, RGWPubSub& ps) {
-  // delete all notifications of on a bucket
-  for (const auto& topic : bucket_topics.topics) {
-    const auto op_ret = remove_notification_by_topic(dpp, topic.first, b, y, ps);
-    if (op_ret < 0) {
-      return op_ret;
-    }
-  }
-  return 0;
-}
-
-// command (S3 compliant): PUT /<bucket name>?notification
-// a "notification" and a subscription will be auto-generated
-// actual configuration is XML encoded in the body of the message
-class RGWPSCreateNotifOp : public RGWDefaultResponseOp {
-  private:
-  std::optional<RGWPubSub> ps;
-  std::string bucket_name;
-  RGWBucketInfo bucket_info;
-  rgw_pubsub_s3_notifications configurations;
-
-  int get_params() {
-    bool exists;
-    const auto no_value = s->info.args.get("notification", &exists);
-    if (!exists) {
-      ldpp_dout(this, 1) << "missing required param 'notification'" << dendl;
-      return -EINVAL;
-    } 
-    if (no_value.length() > 0) {
-      ldpp_dout(this, 1) << "param 'notification' should not have any value" << dendl;
-      return -EINVAL;
-    }
-    if (s->bucket_name.empty()) {
-      ldpp_dout(this, 1) << "request must be on a bucket" << dendl;
-      return -EINVAL;
-    }
-    bucket_name = s->bucket_name;
-    return 0;
-  }
-
-  public:
-  int verify_permission(optional_yield y) override;
-
-  void pre_exec() override {
-    rgw_bucket_object_pre_exec(s);
-  }
-
-  const char* name() const override { return "pubsub_notification_create_s3"; }
-  RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_CREATE; }
-  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
-
-  int get_params_from_body() {
-    const auto max_size = s->cct->_conf->rgw_max_put_param_size;
-    int r;
-    bufferlist data;
-    std::tie(r, data) = read_all_input(s, max_size, false);
-
-    if (r < 0) {
-      ldpp_dout(this, 1) << "failed to read XML payload" << dendl;
-      return r;
-    }
-    if (data.length() == 0) {
-      ldpp_dout(this, 1) << "XML payload missing" << dendl;
-      return -EINVAL;
-    }
-
-    RGWXMLDecoder::XMLParser parser;
-
-    if (!parser.init()){
-      ldpp_dout(this, 1) << "failed to initialize XML parser" << dendl;
-      return -EINVAL;
-    }
-    if (!parser.parse(data.c_str(), data.length(), 1)) {
-      ldpp_dout(this, 1) << "failed to parse XML payload" << dendl;
-      return -ERR_MALFORMED_XML;
-    }
-    try {
-      // NotificationConfigurations is mandatory
-      // It can be empty which means we delete all the notifications
-      RGWXMLDecoder::decode_xml("NotificationConfiguration", configurations, &parser, true);
-    } catch (RGWXMLDecoder::err& err) {
-      ldpp_dout(this, 1) << "failed to parse XML payload. error: " << err << dendl;
-      return -ERR_MALFORMED_XML;
-    }
-    return 0;
-  }
-
-  void execute(optional_yield) override;
-};
-
-void RGWPSCreateNotifOp::execute(optional_yield y) {
-  op_ret = get_params_from_body();
-  if (op_ret < 0) {
-    return;
-  }
-
-  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
-  auto b = ps->get_bucket(bucket_info.bucket);
-  ceph_assert(b);
-
-  if(configurations.list.empty()) {
-    // get all topics on a bucket
-    rgw_pubsub_bucket_topics bucket_topics;
-    op_ret = b->get_topics(&bucket_topics);
-    if (op_ret < 0) {
-      ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << bucket_info.bucket.name << "', ret=" << op_ret << dendl;
-      return;
-    }
-
-    op_ret = delete_all_notifications(this, bucket_topics, b, y, *ps);
-    return;
-  }
-
-  for (const auto& c : configurations.list) {
-    const auto& notif_name = c.id;
-    if (notif_name.empty()) {
-      ldpp_dout(this, 1) << "missing notification id" << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-    if (c.topic_arn.empty()) {
-      ldpp_dout(this, 1) << "missing topic ARN in notification: '" << notif_name << "'" << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-
-    const auto arn = rgw::ARN::parse(c.topic_arn);
-    if (!arn || arn->resource.empty()) {
-      ldpp_dout(this, 1) << "topic ARN has invalid format: '" << c.topic_arn << "' in notification: '" << notif_name << "'" << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-
-    if (std::find(c.events.begin(), c.events.end(), rgw::notify::UnknownEvent) != c.events.end()) {
-      ldpp_dout(this, 1) << "unknown event type in notification: '" << notif_name << "'" << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-
-    const auto topic_name = arn->resource;
-
-    // get topic information. destination information is stored in the topic
-    rgw_pubsub_topic topic_info;  
-    op_ret = ps->get_topic(topic_name, &topic_info);
-    if (op_ret < 0) {
-      ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl;
-      return;
-    }
-    // make sure that full topic configuration match
-    // TODO: use ARN match function
-    
-    // create unique topic name. this has 2 reasons:
-    // (1) topics cannot be shared between different S3 notifications because they hold the filter information
-    // (2) make topic clneaup easier, when notification is removed
-    const auto unique_topic_name = topic_to_unique(topic_name, notif_name);
-    // generate the internal topic. destination is stored here for the "push-only" case
-    // when no subscription exists
-    // ARN is cached to make the "GET" method faster
-    op_ret = ps->create_topic(this, unique_topic_name, topic_info.dest, topic_info.arn, topic_info.opaque_data, y);
-    if (op_ret < 0) {
-      ldpp_dout(this, 1) << "failed to auto-generate unique topic '" << unique_topic_name << 
-        "', ret=" << op_ret << dendl;
-      return;
-    }
-    ldpp_dout(this, 20) << "successfully auto-generated unique topic '" << unique_topic_name << "'" << dendl;
-    // generate the notification
-    rgw::notify::EventTypeList events;
-    op_ret = b->create_notification(this, unique_topic_name, c.events, std::make_optional(c.filter), notif_name, y);
-    if (op_ret < 0) {
-      ldpp_dout(this, 1) << "failed to auto-generate notification for unique topic '" << unique_topic_name <<
-        "', ret=" << op_ret << dendl;
-      // rollback generated topic (ignore return value)
-      ps->remove_topic(this, unique_topic_name, y);
-      return;
-    }
-    ldpp_dout(this, 20) << "successfully auto-generated notification for unique topic '" << unique_topic_name << "'" << dendl;
-  }
-}
-
-int RGWPSCreateNotifOp::verify_permission(optional_yield y) {
-  int ret = get_params();
-  if (ret < 0) {
-    return ret;
-  }
-
-  std::unique_ptr<rgw::sal::User> user = driver->get_user(s->owner.get_id());
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  ret = driver->get_bucket(this, user.get(), s->owner.get_id().tenant, bucket_name, &bucket, y);
-  if (ret < 0) {
-    ldpp_dout(this, 1) << "failed to get bucket info, cannot verify ownership" << dendl;
-    return ret;
-  }
-  bucket_info = bucket->get_info();
-
-  if (bucket_info.owner != s->owner.get_id()) {
-    ldpp_dout(this, 1) << "user doesn't own bucket, not allowed to create notification" << dendl;
-    return -EPERM;
-  }
-  return 0;
-}
-
-// command (extension to S3): DELETE /bucket?notification[=<notification-id>]
-class RGWPSDeleteNotifOp : public RGWDefaultResponseOp {
-  private:
-  std::optional<RGWPubSub> ps;
-  std::string bucket_name;
-  RGWBucketInfo bucket_info;
-  std::string notif_name;
-  
-  public:
-  int verify_permission(optional_yield y) override;
-
-  void pre_exec() override {
-    rgw_bucket_object_pre_exec(s);
-  }
-  
-  const char* name() const override { return "pubsub_notification_delete_s3"; }
-  RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_DELETE; }
-  uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
-
-  int get_params() {
-    bool exists;
-    notif_name = s->info.args.get("notification", &exists);
-    if (!exists) {
-      ldpp_dout(this, 1) << "missing required param 'notification'" << dendl;
-      return -EINVAL;
-    } 
-    if (s->bucket_name.empty()) {
-      ldpp_dout(this, 1) << "request must be on a bucket" << dendl;
-      return -EINVAL;
-    }
-    bucket_name = s->bucket_name;
-    return 0;
-  }
-
-  void execute(optional_yield y) override;
-};
-
-void RGWPSDeleteNotifOp::execute(optional_yield y) {
-  op_ret = get_params();
-  if (op_ret < 0) {
-    return;
-  }
-
-  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
-  auto b = ps->get_bucket(bucket_info.bucket);
-  ceph_assert(b);
-
-  // get all topics on a bucket
-  rgw_pubsub_bucket_topics bucket_topics;
-  op_ret = b->get_topics(&bucket_topics);
-  if (op_ret < 0) {
-    ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << bucket_info.bucket.name << "', ret=" << op_ret << dendl;
-    return;
-  }
-
-  if (!notif_name.empty()) {
-    // delete a specific notification
-    const auto unique_topic = find_unique_topic(bucket_topics, notif_name);
-    if (unique_topic) {
-      const auto unique_topic_name = unique_topic->get().topic.name;
-      op_ret = remove_notification_by_topic(this, unique_topic_name, b, y, *ps);
-      return;
-    }
-    // notification to be removed is not found - considered success
-    ldpp_dout(this, 20) << "notification '" << notif_name << "' already removed" << dendl;
-    return;
-  }
-
-  op_ret = delete_all_notifications(this, bucket_topics, b, y, *ps);
-}
-
-int RGWPSDeleteNotifOp::verify_permission(optional_yield y) {
-  int ret = get_params();
-  if (ret < 0) {
-    return ret;
-  }
-
-  std::unique_ptr<rgw::sal::User> user = driver->get_user(s->owner.get_id());
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  ret = driver->get_bucket(this, user.get(), s->owner.get_id().tenant, bucket_name, &bucket, y);
-  if (ret < 0) {
-    return ret;
-  }
-  bucket_info = bucket->get_info();
-
-  if (bucket_info.owner != s->owner.get_id()) {
-    ldpp_dout(this, 1) << "user doesn't own bucket, cannot remove notification" << dendl;
-    return -EPERM;
-  }
-  return 0;
-}
-
-// command (S3 compliant): GET /bucket?notification[=<notification-id>]
-class RGWPSListNotifsOp : public RGWOp {
-private:
-  std::string bucket_name;
-  RGWBucketInfo bucket_info;
-  std::optional<RGWPubSub> ps;
-  std::string notif_name;
-  rgw_pubsub_s3_notifications notifications;
-
-  int get_params() {
-    bool exists;
-    notif_name = s->info.args.get("notification", &exists);
-    if (!exists) {
-      ldpp_dout(this, 1) << "missing required param 'notification'" << dendl;
-      return -EINVAL;
-    } 
-    if (s->bucket_name.empty()) {
-      ldpp_dout(this, 1) << "request must be on a bucket" << dendl;
-      return -EINVAL;
-    }
-    bucket_name = s->bucket_name;
-    return 0;
-  }
-
-  public:
-  int verify_permission(optional_yield y) override;
-
-  void pre_exec() override {
-    rgw_bucket_object_pre_exec(s);
-  }
-
-  const char* name() const override { return "pubsub_notifications_get_s3"; }
-  RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_LIST; }
-  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
-
-  void execute(optional_yield y) override;
-  void send_response() override {
-    if (op_ret) {
-      set_req_state_err(s, op_ret);
-    }
-    dump_errno(s);
-    end_header(s, this, "application/xml");
-
-    if (op_ret < 0) {
-      return;
-    }
-    notifications.dump_xml(s->formatter);
-    rgw_flush_formatter_and_reset(s, s->formatter);
-  }
-};
-
-void RGWPSListNotifsOp::execute(optional_yield y) {
-  ps.emplace(static_cast<rgw::sal::RadosStore*>(driver), s->owner.get_id().tenant);
-  auto b = ps->get_bucket(bucket_info.bucket);
-  ceph_assert(b);
-  
-  // get all topics on a bucket
-  rgw_pubsub_bucket_topics bucket_topics;
-  op_ret = b->get_topics(&bucket_topics);
-  if (op_ret < 0) {
-    ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << bucket_info.bucket.name << "', ret=" << op_ret << dendl;
-    return;
-  }
-  if (!notif_name.empty()) {
-    // get info of a specific notification
-    const auto unique_topic = find_unique_topic(bucket_topics, notif_name);
-    if (unique_topic) {
-      notifications.list.emplace_back(unique_topic->get());
-      return;
-    }
-    op_ret = -ENOENT;
-    ldpp_dout(this, 1) << "failed to get notification info for '" << notif_name << "', ret=" << op_ret << dendl;
-    return;
-  }
-  // loop through all topics of the bucket
-  for (const auto& topic : bucket_topics.topics) {
-    if (topic.second.s3_id.empty()) {
-        // not an s3 notification
-        continue;
-    }
-    notifications.list.emplace_back(topic.second);
-  }
-}
-
-int RGWPSListNotifsOp::verify_permission(optional_yield y) {
-  int ret = get_params();
-  if (ret < 0) {
-    return ret;
-  }
-
-  std::unique_ptr<rgw::sal::User> user = driver->get_user(s->owner.get_id());
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  ret = driver->get_bucket(this, user.get(), s->owner.get_id().tenant, bucket_name, &bucket, y);
-  if (ret < 0) {
-    return ret;
-  }
-  bucket_info = bucket->get_info();
-
-  if (bucket_info.owner != s->owner.get_id()) {
-    ldpp_dout(this, 1) << "user doesn't own bucket, cannot get notification list" << dendl;
-    return -EPERM;
-  }
-
-  return 0;
-}
-
-RGWOp* RGWHandler_REST_PSNotifs_S3::op_get() {
-  return new RGWPSListNotifsOp();
-}
-
-RGWOp* RGWHandler_REST_PSNotifs_S3::op_put() {
-  return new RGWPSCreateNotifOp();
-}
-
-RGWOp* RGWHandler_REST_PSNotifs_S3::op_delete() {
-  return new RGWPSDeleteNotifOp();
-}
-
-RGWOp* RGWHandler_REST_PSNotifs_S3::create_get_op() {
-    return new RGWPSListNotifsOp();
-}
-
-RGWOp* RGWHandler_REST_PSNotifs_S3::create_put_op() {
-  return new RGWPSCreateNotifOp();
-}
-
-RGWOp* RGWHandler_REST_PSNotifs_S3::create_delete_op() {
-  return new RGWPSDeleteNotifOp();
-}
-
diff --git a/src/rgw/store/rados/rgw_rest_pubsub.h b/src/rgw/store/rados/rgw_rest_pubsub.h
deleted file mode 100644 (file)
index 3b1a1bc..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#pragma once
-
-#include "rgw_rest_s3.h"
-
-// s3 compliant notification handler factory
-class RGWHandler_REST_PSNotifs_S3 : public RGWHandler_REST_S3 {
-protected:
-  int init_permissions(RGWOp* op, optional_yield y) override {return 0;}
-  int read_permissions(RGWOp* op, optional_yield y) override {return 0;}
-  bool supports_quota() override {return false;}
-  RGWOp* op_get() override;
-  RGWOp* op_put() override;
-  RGWOp* op_delete() override;
-public:
-  using RGWHandler_REST_S3::RGWHandler_REST_S3;
-  virtual ~RGWHandler_REST_PSNotifs_S3() = default;
-  // following are used to generate the operations when invoked by another REST handler
-  static RGWOp* create_get_op();
-  static RGWOp* create_put_op();
-  static RGWOp* create_delete_op();
-};
-
-// AWS compliant topics handler factory
-class RGWHandler_REST_PSTopic_AWS : public RGWHandler_REST {
-  const rgw::auth::StrategyRegistry& auth_registry;
-  const std::string& post_body;
-  void rgw_topic_parse_input();
-protected:
-  RGWOp* op_post() override;
-public:
-  RGWHandler_REST_PSTopic_AWS(const rgw::auth::StrategyRegistry& _auth_registry, const std::string& _post_body) : 
-      auth_registry(_auth_registry),
-      post_body(_post_body) {}
-  virtual ~RGWHandler_REST_PSTopic_AWS() = default;
-  int postauth_init(optional_yield) override { return 0; }
-  int authorize(const DoutPrefixProvider* dpp, optional_yield y) override;
-};
diff --git a/src/rgw/store/rados/rgw_rest_realm.cc b/src/rgw/store/rados/rgw_rest_realm.cc
deleted file mode 100644 (file)
index 79640a2..0000000
+++ /dev/null
@@ -1,376 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "common/errno.h"
-#include "rgw_rest_realm.h"
-#include "rgw_rest_s3.h"
-#include "rgw_rest_config.h"
-#include "rgw_zone.h"
-#include "rgw_sal_rados.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_mdlog.h"
-
-#include "include/ceph_assert.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-// reject 'period push' if we would have to fetch too many intermediate periods
-static const uint32_t PERIOD_HISTORY_FETCH_MAX = 64;
-
-// base period op, shared between Get and Post
-class RGWOp_Period_Base : public RGWRESTOp {
- protected:
-  RGWPeriod period;
-  std::ostringstream error_stream;
- public:
-  int verify_permission(optional_yield) override { return 0; }
-  void send_response() override;
-};
-
-// reply with the period object on success
-void RGWOp_Period_Base::send_response()
-{
-  set_req_state_err(s, op_ret, error_stream.str());
-  dump_errno(s);
-
-  if (op_ret < 0) {
-    if (!s->err.message.empty()) {
-      ldpp_dout(this, 4) << "Request failed with " << op_ret
-          << ": " << s->err.message << dendl;
-    }
-    end_header(s);
-    return;
-  }
-
-  encode_json("period", period, s->formatter);
-  end_header(s, NULL, "application/json", s->formatter->get_len());
-  flusher.flush();
-}
-
-// GET /admin/realm/period
-class RGWOp_Period_Get : public RGWOp_Period_Base {
- public:
-  void execute(optional_yield y) override;
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("zone", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield) override {
-    return check_caps(s->user->get_caps());
-  }
-  const char* name() const override { return "get_period"; }
-};
-
-void RGWOp_Period_Get::execute(optional_yield y)
-{
-  string realm_id, realm_name, period_id;
-  epoch_t epoch = 0;
-  RESTArgs::get_string(s, "realm_id", realm_id, &realm_id);
-  RESTArgs::get_string(s, "realm_name", realm_name, &realm_name);
-  RESTArgs::get_string(s, "period_id", period_id, &period_id);
-  RESTArgs::get_uint32(s, "epoch", 0, &epoch);
-
-  period.set_id(period_id);
-  period.set_epoch(epoch);
-
-  op_ret = period.init(this, driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm_id, y, realm_name);
-  if (op_ret < 0)
-    ldpp_dout(this, 5) << "failed to read period" << dendl;
-}
-
-// POST /admin/realm/period
-class RGWOp_Period_Post : public RGWOp_Period_Base {
- public:
-  void execute(optional_yield y) override;
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("zone", RGW_CAP_WRITE);
-  }
-  int verify_permission(optional_yield) override {
-    return check_caps(s->user->get_caps());
-  }
-  const char* name() const override { return "post_period"; }
-  RGWOpType get_type() override { return RGW_OP_PERIOD_POST; }
-};
-
-void RGWOp_Period_Post::execute(optional_yield y)
-{
-  auto cct = driver->ctx();
-
-  // initialize the period without reading from rados
-  period.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y, false);
-
-  // decode the period from input
-  const auto max_size = cct->_conf->rgw_max_put_param_size;
-  bool empty;
-  op_ret = get_json_input(cct, s, period, max_size, &empty);
-  if (op_ret < 0) {
-    ldpp_dout(this, -1) << "failed to decode period" << dendl;
-    return;
-  }
-
-  // require period.realm_id to match our realm
-  if (period.get_realm() != static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_realm().get_id()) {
-    error_stream << "period with realm id " << period.get_realm()
-        << " doesn't match current realm " << static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_realm().get_id() << std::endl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  // load the realm and current period from rados; there may be a more recent
-  // period that we haven't restarted with yet. we also don't want to modify
-  // the objects in use by RGWRados
-  RGWRealm realm(period.get_realm());
-  op_ret = realm.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, -1) << "failed to read current realm: "
-        << cpp_strerror(-op_ret) << dendl;
-    return;
-  }
-
-  RGWPeriod current_period;
-  op_ret = current_period.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm.get_id(), y);
-  if (op_ret < 0) {
-    ldpp_dout(this, -1) << "failed to read current period: "
-        << cpp_strerror(-op_ret) << dendl;
-    return;
-  }
-
-  // if period id is empty, handle as 'period commit'
-  if (period.get_id().empty()) {
-    op_ret = period.commit(this, driver, realm, current_period, error_stream, y);
-    if (op_ret < 0) {
-      ldpp_dout(this, -1) << "master zone failed to commit period" << dendl;
-    }
-    return;
-  }
-
-  // if it's not period commit, nobody is allowed to push to the master zone
-  if (period.get_master_zone() == static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone_params().get_id()) {
-    ldpp_dout(this, 10) << "master zone rejecting period id="
-        << period.get_id() << " epoch=" << period.get_epoch() << dendl;
-    op_ret = -EINVAL; // XXX: error code
-    return;
-  }
-
-  // write the period to rados
-  op_ret = period.store_info(this, false, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, -1) << "failed to store period " << period.get_id() << dendl;
-    return;
-  }
-  // set as latest epoch
-  op_ret = period.update_latest_epoch(this, period.get_epoch(), y);
-  if (op_ret == -EEXIST) {
-    // already have this epoch (or a more recent one)
-    ldpp_dout(this, 4) << "already have epoch >= " << period.get_epoch()
-        << " for period " << period.get_id() << dendl;
-    op_ret = 0;
-    return;
-  }
-  if (op_ret < 0) {
-    ldpp_dout(this, -1) << "failed to set latest epoch" << dendl;
-    return;
-  }
-
-  auto period_history = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->get_period_history();
-
-  // decide whether we can set_current_period() or set_latest_epoch()
-  if (period.get_id() != current_period.get_id()) {
-    auto current_epoch = current_period.get_realm_epoch();
-    // discard periods in the past
-    if (period.get_realm_epoch() < current_epoch) {
-      ldpp_dout(this, 10) << "discarding period " << period.get_id()
-          << " with realm epoch " << period.get_realm_epoch()
-          << " older than current epoch " << current_epoch << dendl;
-      // return success to ack that we have this period
-      return;
-    }
-    // discard periods too far in the future
-    if (period.get_realm_epoch() > current_epoch + PERIOD_HISTORY_FETCH_MAX) {
-      ldpp_dout(this, -1) << "discarding period " << period.get_id()
-          << " with realm epoch " << period.get_realm_epoch() << " too far in "
-          "the future from current epoch " << current_epoch << dendl;
-      op_ret = -ENOENT; // XXX: error code
-      return;
-    }
-    // attach a copy of the period into the period history
-    auto cursor = period_history->attach(this, RGWPeriod{period}, y);
-    if (!cursor) {
-      // we're missing some history between the new period and current_period
-      op_ret = cursor.get_error();
-      ldpp_dout(this, -1) << "failed to collect the periods between current period "
-          << current_period.get_id() << " (realm epoch " << current_epoch
-          << ") and the new period " << period.get_id()
-          << " (realm epoch " << period.get_realm_epoch()
-          << "): " << cpp_strerror(-op_ret) << dendl;
-      return;
-    }
-    if (cursor.has_next()) {
-      // don't switch if we have a newer period in our history
-      ldpp_dout(this, 4) << "attached period " << period.get_id()
-          << " to history, but the history contains newer periods" << dendl;
-      return;
-    }
-    // set as current period
-    op_ret = realm.set_current_period(this, period, y);
-    if (op_ret < 0) {
-      ldpp_dout(this, -1) << "failed to update realm's current period" << dendl;
-      return;
-    }
-    ldpp_dout(this, 4) << "period " << period.get_id()
-        << " is newer than current period " << current_period.get_id()
-        << ", updating realm's current period and notifying zone" << dendl;
-    realm.notify_new_period(this, period, y);
-    return;
-  }
-  // reflect the period into our local objects
-  op_ret = period.reflect(this, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, -1) << "failed to update local objects: "
-        << cpp_strerror(-op_ret) << dendl;
-    return;
-  }
-  ldpp_dout(this, 4) << "period epoch " << period.get_epoch()
-      << " is newer than current epoch " << current_period.get_epoch()
-      << ", updating period's latest epoch and notifying zone" << dendl;
-  realm.notify_new_period(this, period, y);
-  // update the period history
-  period_history->insert(RGWPeriod{period});
-}
-
-class RGWHandler_Period : public RGWHandler_Auth_S3 {
- protected:
-  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
-
-  RGWOp *op_get() override { return new RGWOp_Period_Get; }
-  RGWOp *op_post() override { return new RGWOp_Period_Post; }
-};
-
-class RGWRESTMgr_Period : public RGWRESTMgr {
- public:
-  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
-                              req_state*,
-                               const rgw::auth::StrategyRegistry& auth_registry,
-                               const std::string&) override {
-    return new RGWHandler_Period(auth_registry);
-  }
-};
-
-
-// GET /admin/realm
-class RGWOp_Realm_Get : public RGWRESTOp {
-  std::unique_ptr<RGWRealm> realm;
-public:
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("zone", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override;
-  void send_response() override;
-  const char* name() const override { return "get_realm"; }
-};
-
-void RGWOp_Realm_Get::execute(optional_yield y)
-{
-  string id;
-  RESTArgs::get_string(s, "id", id, &id);
-  string name;
-  RESTArgs::get_string(s, "name", name, &name);
-
-  // read realm
-  realm.reset(new RGWRealm(id, name));
-  op_ret = realm->init(this, g_ceph_context, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y);
-  if (op_ret < 0)
-    ldpp_dout(this, -1) << "failed to read realm id=" << id
-        << " name=" << name << dendl;
-}
-
-void RGWOp_Realm_Get::send_response()
-{
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-
-  if (op_ret < 0) {
-    end_header(s);
-    return;
-  }
-
-  encode_json("realm", *realm, s->formatter);
-  end_header(s, NULL, "application/json", s->formatter->get_len());
-  flusher.flush();
-}
-
-// GET /admin/realm?list
-class RGWOp_Realm_List : public RGWRESTOp {
-  std::string default_id;
-  std::list<std::string> realms;
-public:
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("zone", RGW_CAP_READ);
-  }
-  int verify_permission(optional_yield) override {
-    return check_caps(s->user->get_caps());
-  }
-  void execute(optional_yield y) override;
-  void send_response() override;
-  const char* name() const override { return "list_realms"; }
-};
-
-void RGWOp_Realm_List::execute(optional_yield y)
-{
-  {
-    // read default realm
-    RGWRealm realm(driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj);
-    [[maybe_unused]] int ret = realm.read_default_id(this, default_id, y);
-  }
-  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->list_realms(this, realms);
-  if (op_ret < 0)
-    ldpp_dout(this, -1) << "failed to list realms" << dendl;
-}
-
-void RGWOp_Realm_List::send_response()
-{
-  set_req_state_err(s, op_ret);
-  dump_errno(s);
-
-  if (op_ret < 0) {
-    end_header(s);
-    return;
-  }
-
-  s->formatter->open_object_section("realms_list");
-  encode_json("default_info", default_id, s->formatter);
-  encode_json("realms", realms, s->formatter);
-  s->formatter->close_section();
-  end_header(s, NULL, "application/json", s->formatter->get_len());
-  flusher.flush();
-}
-
-class RGWHandler_Realm : public RGWHandler_Auth_S3 {
-protected:
-  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
-  RGWOp *op_get() override {
-    if (s->info.args.sub_resource_exists("list"))
-      return new RGWOp_Realm_List;
-    return new RGWOp_Realm_Get;
-  }
-};
-
-RGWRESTMgr_Realm::RGWRESTMgr_Realm()
-{
-  // add the /admin/realm/period resource
-  register_resource("period", new RGWRESTMgr_Period);
-}
-
-RGWHandler_REST*
-RGWRESTMgr_Realm::get_handler(rgw::sal::Driver* driver,
-                             req_state*,
-                              const rgw::auth::StrategyRegistry& auth_registry,
-                              const std::string&)
-{
-  return new RGWHandler_Realm(auth_registry);
-}
diff --git a/src/rgw/store/rados/rgw_rest_realm.h b/src/rgw/store/rados/rgw_rest_realm.h
deleted file mode 100644 (file)
index a0d1dc1..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-#include "rgw_rest.h"
-
-class RGWRESTMgr_Realm : public RGWRESTMgr {
-public:
-  RGWRESTMgr_Realm();
-
-  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
-                              req_state*,
-                               const rgw::auth::StrategyRegistry& auth_registry,
-                               const std::string&) override;
-};
diff --git a/src/rgw/store/rados/rgw_rest_user.cc b/src/rgw/store/rados/rgw_rest_user.cc
deleted file mode 100644 (file)
index c2aeece..0000000
+++ /dev/null
@@ -1,1109 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "common/ceph_json.h"
-
-#include "rgw_op.h"
-#include "rgw_user.h"
-#include "rgw_rest_user.h"
-#include "rgw_sal.h"
-
-#include "include/str_list.h"
-#include "include/ceph_assert.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_sys_obj.h"
-#include "rgw_zone.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-class RGWOp_User_List : public RGWRESTOp {
-
-public:
-  RGWOp_User_List() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_READ);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "list_user"; }
-};
-
-void RGWOp_User_List::execute(optional_yield y)
-{
-  RGWUserAdminOpState op_state(driver);
-
-  uint32_t max_entries;
-  std::string marker;
-  RESTArgs::get_uint32(s, "max-entries", 1000, &max_entries);
-  RESTArgs::get_string(s, "marker", marker, &marker);
-
-  op_state.max_entries = max_entries;
-  op_state.marker = marker;
-  op_ret = RGWUserAdminOp_User::list(this, driver, op_state, flusher);
-}
-
-class RGWOp_User_Info : public RGWRESTOp {
-
-public:
-  RGWOp_User_Info() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_READ);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "get_user_info"; }
-};
-
-void RGWOp_User_Info::execute(optional_yield y)
-{
-  RGWUserAdminOpState op_state(driver);
-
-  std::string uid_str, access_key_str;
-  bool fetch_stats;
-  bool sync_stats;
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  RESTArgs::get_string(s, "access-key", access_key_str, &access_key_str);
-
-  // if uid was not supplied in rest argument, error out now, otherwise we'll
-  // end up initializing anonymous user, for which keys.init will eventually
-  // return -EACESS
-  if (uid_str.empty() && access_key_str.empty()){
-    op_ret=-EINVAL;
-    return;
-  }
-
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_bool(s, "stats", false, &fetch_stats);
-
-  RESTArgs::get_bool(s, "sync", false, &sync_stats);
-
-  op_state.set_user_id(uid);
-  op_state.set_access_key(access_key_str);
-  op_state.set_fetch_stats(fetch_stats);
-  op_state.set_sync_stats(sync_stats);
-
-  op_ret = RGWUserAdminOp_User::info(s, driver, op_state, flusher, y);
-}
-
-class RGWOp_User_Create : public RGWRESTOp {
-
-public:
-  RGWOp_User_Create() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "create_user"; }
-};
-
-void RGWOp_User_Create::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string display_name;
-  std::string email;
-  std::string access_key;
-  std::string secret_key;
-  std::string key_type_str;
-  std::string caps;
-  std::string tenant_name;
-  std::string op_mask_str;
-  std::string default_placement_str;
-  std::string placement_tags_str;
-
-  bool gen_key;
-  bool suspended;
-  bool system;
-  bool exclusive;
-
-  int32_t max_buckets;
-  const int32_t default_max_buckets =
-    s->cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
-
-  RGWUserAdminOpState op_state(driver);
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "display-name", display_name, &display_name);
-  RESTArgs::get_string(s, "email", email, &email);
-  RESTArgs::get_string(s, "access-key", access_key, &access_key);
-  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
-  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
-  RESTArgs::get_string(s, "user-caps", caps, &caps);
-  RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name);
-  RESTArgs::get_bool(s, "generate-key", true, &gen_key);
-  RESTArgs::get_bool(s, "suspended", false, &suspended);
-  RESTArgs::get_int32(s, "max-buckets", default_max_buckets, &max_buckets);
-  RESTArgs::get_bool(s, "system", false, &system);
-  RESTArgs::get_bool(s, "exclusive", false, &exclusive);
-  RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str);
-  RESTArgs::get_string(s, "default-placement", default_placement_str, &default_placement_str);
-  RESTArgs::get_string(s, "placement-tags", placement_tags_str, &placement_tags_str);
-
-  if (!s->user->get_info().system && system) {
-    ldpp_dout(this, 0) << "cannot set system flag by non-system user" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  if (!tenant_name.empty()) {
-    uid.tenant = tenant_name;
-  }
-
-  // TODO: validate required args are passed in. (for eg. uid and display_name here)
-  op_state.set_user_id(uid);
-  op_state.set_display_name(display_name);
-  op_state.set_user_email(email);
-  op_state.set_caps(caps);
-  op_state.set_access_key(access_key);
-  op_state.set_secret_key(secret_key);
-
-  if (!op_mask_str.empty()) {
-    uint32_t op_mask;
-    int ret = rgw_parse_op_type_list(op_mask_str, &op_mask);
-    if (ret < 0) {
-      ldpp_dout(this, 0) << "failed to parse op_mask: " << ret << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-    op_state.set_op_mask(op_mask);
-  }
-
-  if (!key_type_str.empty()) {
-    int32_t key_type = KEY_TYPE_UNDEFINED;
-    if (key_type_str.compare("swift") == 0)
-      key_type = KEY_TYPE_SWIFT;
-    else if (key_type_str.compare("s3") == 0)
-      key_type = KEY_TYPE_S3;
-
-    op_state.set_key_type(key_type);
-  }
-
-  if (max_buckets != default_max_buckets) {
-    if (max_buckets < 0) {
-      max_buckets = -1;
-    }
-    op_state.set_max_buckets(max_buckets);
-  }
-  if (s->info.args.exists("suspended"))
-    op_state.set_suspension(suspended);
-
-  if (s->info.args.exists("system"))
-    op_state.set_system(system);
-
-  if (s->info.args.exists("exclusive"))
-    op_state.set_exclusive(exclusive);
-
-  if (gen_key)
-    op_state.set_generate_key();
-
-  if (!default_placement_str.empty()) {
-    rgw_placement_rule target_rule;
-    target_rule.from_str(default_placement_str);
-    if (!driver->valid_placement(target_rule)) {
-      ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << target_rule.to_str() << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-    op_state.set_default_placement(target_rule);
-  }
-
-  if (!placement_tags_str.empty()) {
-    list<string> placement_tags_list;
-    get_str_list(placement_tags_str, ",", placement_tags_list);
-    op_state.set_placement_tags(placement_tags_list);
-  }
-
-  bufferlist data;
-  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
-    return;
-  }
-  op_ret = RGWUserAdminOp_User::create(s, driver, op_state, flusher, y);
-}
-
-class RGWOp_User_Modify : public RGWRESTOp {
-
-public:
-  RGWOp_User_Modify() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "modify_user"; }
-};
-
-void RGWOp_User_Modify::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string display_name;
-  std::string email;
-  std::string access_key;
-  std::string secret_key;
-  std::string key_type_str;
-  std::string op_mask_str;
-  std::string default_placement_str;
-  std::string placement_tags_str;
-
-  bool gen_key;
-  bool suspended;
-  bool system;
-  bool email_set;
-  bool quota_set;
-  int32_t max_buckets;
-
-  RGWUserAdminOpState op_state(driver);
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "display-name", display_name, &display_name);
-  RESTArgs::get_string(s, "email", email, &email, &email_set);
-  RESTArgs::get_string(s, "access-key", access_key, &access_key);
-  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
-  RESTArgs::get_bool(s, "generate-key", false, &gen_key);
-  RESTArgs::get_bool(s, "suspended", false, &suspended);
-  RESTArgs::get_int32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets, &quota_set);
-  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
-
-  RESTArgs::get_bool(s, "system", false, &system);
-  RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str);
-  RESTArgs::get_string(s, "default-placement", default_placement_str, &default_placement_str);
-  RESTArgs::get_string(s, "placement-tags", placement_tags_str, &placement_tags_str);
-
-  if (!s->user->get_info().system && system) {
-    ldpp_dout(this, 0) << "cannot set system flag by non-system user" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  op_state.set_user_id(uid);
-  op_state.set_display_name(display_name);
-
-  if (email_set)
-    op_state.set_user_email(email);
-
-  op_state.set_access_key(access_key);
-  op_state.set_secret_key(secret_key);
-
-  if (quota_set) {
-    if (max_buckets < 0 ) {
-      max_buckets = -1;
-    }
-    op_state.set_max_buckets(max_buckets);
-  }
-  if (gen_key)
-    op_state.set_generate_key();
-
-  if (!key_type_str.empty()) {
-    int32_t key_type = KEY_TYPE_UNDEFINED;
-    if (key_type_str.compare("swift") == 0)
-      key_type = KEY_TYPE_SWIFT;
-    else if (key_type_str.compare("s3") == 0)
-      key_type = KEY_TYPE_S3;
-
-    op_state.set_key_type(key_type);
-  }
-
-  if (!op_mask_str.empty()) {
-    uint32_t op_mask;
-    if (rgw_parse_op_type_list(op_mask_str, &op_mask) < 0) {
-        ldpp_dout(this, 0) << "failed to parse op_mask" << dendl;
-        op_ret = -EINVAL;
-        return;
-    }   
-    op_state.set_op_mask(op_mask);
-  }
-
-  if (s->info.args.exists("suspended"))
-    op_state.set_suspension(suspended);
-
-  if (s->info.args.exists("system"))
-    op_state.set_system(system);
-
-  if (!op_mask_str.empty()) {
-    uint32_t op_mask;
-    int ret = rgw_parse_op_type_list(op_mask_str, &op_mask);
-    if (ret < 0) {
-      ldpp_dout(this, 0) << "failed to parse op_mask: " << ret << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-    op_state.set_op_mask(op_mask);
-  }
-
-  if (!default_placement_str.empty()) {
-    rgw_placement_rule target_rule;
-    target_rule.from_str(default_placement_str);
-    if (!driver->valid_placement(target_rule)) {
-      ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << target_rule.to_str() << dendl;
-      op_ret = -EINVAL;
-      return;
-    }
-    op_state.set_default_placement(target_rule);
-  }
-
-  if (!placement_tags_str.empty()) {
-    list<string> placement_tags_list;
-    get_str_list(placement_tags_str, ",", placement_tags_list);
-    op_state.set_placement_tags(placement_tags_list);
-  }
-  
-  bufferlist data;
-  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
-    return;
-  }
-  op_ret = RGWUserAdminOp_User::modify(s, driver, op_state, flusher, y);
-}
-
-class RGWOp_User_Remove : public RGWRESTOp {
-
-public:
-  RGWOp_User_Remove() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "remove_user"; }
-};
-
-void RGWOp_User_Remove::execute(optional_yield y)
-{
-  std::string uid_str;
-  bool purge_data;
-
-  RGWUserAdminOpState op_state(driver);
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_bool(s, "purge-data", false, &purge_data);
-
-  // FIXME: no double checking
-  if (!uid.empty())
-    op_state.set_user_id(uid);
-
-  op_state.set_purge_data(purge_data);
-
-  bufferlist data;
-  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
-    return;
-  }
-  op_ret = RGWUserAdminOp_User::remove(s, driver, op_state, flusher, s->yield);
-}
-
-class RGWOp_Subuser_Create : public RGWRESTOp {
-
-public:
-  RGWOp_Subuser_Create() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "create_subuser"; }
-};
-
-void RGWOp_Subuser_Create::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string subuser;
-  std::string secret_key;
-  std::string access_key;
-  std::string perm_str;
-  std::string key_type_str;
-
-  bool gen_subuser = false; // FIXME placeholder
-  bool gen_secret;
-  bool gen_access;
-
-  uint32_t perm_mask = 0;
-  int32_t key_type = KEY_TYPE_SWIFT;
-
-  RGWUserAdminOpState op_state(driver);
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "subuser", subuser, &subuser);
-  RESTArgs::get_string(s, "access-key", access_key, &access_key);
-  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
-  RESTArgs::get_string(s, "access", perm_str, &perm_str);
-  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
-  RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
-  RESTArgs::get_bool(s, "gen-access-key", false, &gen_access);
-  
-  perm_mask = rgw_str_to_perm(perm_str.c_str());
-  op_state.set_perm(perm_mask);
-
-  op_state.set_user_id(uid);
-  op_state.set_subuser(subuser);
-  op_state.set_access_key(access_key);
-  op_state.set_secret_key(secret_key);
-  op_state.set_generate_subuser(gen_subuser);
-
-  if (gen_access)
-    op_state.set_gen_access();
-
-  if (gen_secret)
-    op_state.set_gen_secret();
-
-  if (!key_type_str.empty()) {
-    if (key_type_str.compare("swift") == 0)
-      key_type = KEY_TYPE_SWIFT;
-    else if (key_type_str.compare("s3") == 0)
-      key_type = KEY_TYPE_S3;
-  }
-  op_state.set_key_type(key_type);
-
-  bufferlist data;
-  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
-    return;
-  }
-  op_ret = RGWUserAdminOp_Subuser::create(s, driver, op_state, flusher, y);
-}
-
-class RGWOp_Subuser_Modify : public RGWRESTOp {
-
-public:
-  RGWOp_Subuser_Modify() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "modify_subuser"; }
-};
-
-void RGWOp_Subuser_Modify::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string subuser;
-  std::string secret_key;
-  std::string key_type_str;
-  std::string perm_str;
-
-  RGWUserAdminOpState op_state(driver);
-
-  uint32_t perm_mask;
-  int32_t key_type = KEY_TYPE_SWIFT;
-
-  bool gen_secret;
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "subuser", subuser, &subuser);
-  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
-  RESTArgs::get_string(s, "access", perm_str, &perm_str);
-  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
-  RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
-
-  perm_mask = rgw_str_to_perm(perm_str.c_str());
-  op_state.set_perm(perm_mask);
-
-  op_state.set_user_id(uid);
-  op_state.set_subuser(subuser);
-
-  if (!secret_key.empty())
-    op_state.set_secret_key(secret_key);
-
-  if (gen_secret)
-    op_state.set_gen_secret();
-
-  if (!key_type_str.empty()) {
-    if (key_type_str.compare("swift") == 0)
-      key_type = KEY_TYPE_SWIFT;
-    else if (key_type_str.compare("s3") == 0)
-      key_type = KEY_TYPE_S3;
-  }
-  op_state.set_key_type(key_type);
-
-  bufferlist data;
-  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
-    return;
-  }
-  op_ret = RGWUserAdminOp_Subuser::modify(s, driver, op_state, flusher, y);
-}
-
-class RGWOp_Subuser_Remove : public RGWRESTOp {
-
-public:
-  RGWOp_Subuser_Remove() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "remove_subuser"; }
-};
-
-void RGWOp_Subuser_Remove::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string subuser;
-  bool purge_keys;
-
-  RGWUserAdminOpState op_state(driver);
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "subuser", subuser, &subuser);
-  RESTArgs::get_bool(s, "purge-keys", true, &purge_keys);
-
-  op_state.set_user_id(uid);
-  op_state.set_subuser(subuser);
-
-  if (purge_keys)
-    op_state.set_purge_keys();
-
-  bufferlist data;
-  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
-    return;
-  }
-  op_ret = RGWUserAdminOp_Subuser::remove(s, driver, op_state, flusher, y);
-}
-
-class RGWOp_Key_Create : public RGWRESTOp {
-
-public:
-  RGWOp_Key_Create() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "create_access_key"; }
-};
-
-void RGWOp_Key_Create::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string subuser;
-  std::string access_key;
-  std::string secret_key;
-  std::string key_type_str;
-
-  bool gen_key;
-
-  RGWUserAdminOpState op_state(driver);
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "subuser", subuser, &subuser);
-  RESTArgs::get_string(s, "access-key", access_key, &access_key);
-  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
-  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
-  RESTArgs::get_bool(s, "generate-key", true, &gen_key);
-
-  op_state.set_user_id(uid);
-  op_state.set_subuser(subuser);
-  op_state.set_access_key(access_key);
-  op_state.set_secret_key(secret_key);
-
-  if (gen_key)
-    op_state.set_generate_key();
-
-  if (!key_type_str.empty()) {
-    int32_t key_type = KEY_TYPE_UNDEFINED;
-    if (key_type_str.compare("swift") == 0)
-      key_type = KEY_TYPE_SWIFT;
-    else if (key_type_str.compare("s3") == 0)
-      key_type = KEY_TYPE_S3;
-
-    op_state.set_key_type(key_type);
-  }
-
-  op_ret = RGWUserAdminOp_Key::create(s, driver, op_state, flusher, y);
-}
-
-class RGWOp_Key_Remove : public RGWRESTOp {
-
-public:
-  RGWOp_Key_Remove() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "remove_access_key"; }
-};
-
-void RGWOp_Key_Remove::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string subuser;
-  std::string access_key;
-  std::string key_type_str;
-
-  RGWUserAdminOpState op_state(driver);
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "subuser", subuser, &subuser);
-  RESTArgs::get_string(s, "access-key", access_key, &access_key);
-  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
-
-  op_state.set_user_id(uid);
-  op_state.set_subuser(subuser);
-  op_state.set_access_key(access_key);
-
-  if (!key_type_str.empty()) {
-    int32_t key_type = KEY_TYPE_UNDEFINED;
-    if (key_type_str.compare("swift") == 0)
-      key_type = KEY_TYPE_SWIFT;
-    else if (key_type_str.compare("s3") == 0)
-      key_type = KEY_TYPE_S3;
-
-    op_state.set_key_type(key_type);
-  }
-
-  op_ret = RGWUserAdminOp_Key::remove(s, driver, op_state, flusher, y);
-}
-
-class RGWOp_Caps_Add : public RGWRESTOp {
-
-public:
-  RGWOp_Caps_Add() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "add_user_caps"; }
-};
-
-void RGWOp_Caps_Add::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string caps;
-
-  RGWUserAdminOpState op_state(driver);
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "user-caps", caps, &caps);
-
-  op_state.set_user_id(uid);
-  op_state.set_caps(caps);
-
-  bufferlist data;
-  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
-    return;
-  }
-  op_ret = RGWUserAdminOp_Caps::add(s, driver, op_state, flusher, y);
-}
-
-class RGWOp_Caps_Remove : public RGWRESTOp {
-
-public:
-  RGWOp_Caps_Remove() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "remove_user_caps"; }
-};
-
-void RGWOp_Caps_Remove::execute(optional_yield y)
-{
-  std::string uid_str;
-  std::string caps;
-
-  RGWUserAdminOpState op_state(driver);
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  rgw_user uid(uid_str);
-
-  RESTArgs::get_string(s, "user-caps", caps, &caps);
-
-  op_state.set_user_id(uid);
-  op_state.set_caps(caps);
-
-  bufferlist data;
-  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
-    return;
-  }
-  op_ret = RGWUserAdminOp_Caps::remove(s, driver, op_state, flusher, y);
-}
-
-struct UserQuotas {
-  RGWQuota quota;
-
-  UserQuotas() {}
-
-  explicit UserQuotas(RGWUserInfo& info){
-    quota.bucket_quota = info.quota.bucket_quota;
-    quota.user_quota = info.quota.user_quota;
-  }
-
-  void dump(Formatter *f) const {
-    encode_json("bucket_quota", quota.bucket_quota, f);
-    encode_json("user_quota", quota.user_quota, f);
-  }
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("bucket_quota", quota.bucket_quota, obj);
-    JSONDecoder::decode_json("user_quota", quota.user_quota, obj);
-  }
-};
-
-class RGWOp_Quota_Info : public RGWRESTOp {
-
-public:
-  RGWOp_Quota_Info() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_READ);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "get_quota_info"; }
-};
-
-
-void RGWOp_Quota_Info::execute(optional_yield y)
-{
-  RGWUserAdminOpState op_state(driver);
-
-  std::string uid_str;
-  std::string quota_type;
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
-
-  if (uid_str.empty()) {
-    op_ret = -EINVAL;
-    return;
-  }
-
-  rgw_user uid(uid_str);
-
-  bool show_all = quota_type.empty();
-  bool show_bucket = show_all || (quota_type == "bucket");
-  bool show_user = show_all || (quota_type == "user");
-
-  if (!(show_all || show_bucket || show_user)) {
-    op_ret = -EINVAL;
-    return;
-  }
-
-  op_state.set_user_id(uid);
-
-  RGWUser user;
-  op_ret = user.init(s, driver, op_state, y);
-  if (op_ret < 0)
-    return;
-
-  if (!op_state.has_existing_user()) {
-    op_ret = -ERR_NO_SUCH_USER;
-    return;
-  }
-
-  RGWUserInfo info;
-  string err_msg;
-  op_ret = user.info(info, &err_msg);
-  if (op_ret < 0)
-    return;
-
-  flusher.start(0);
-  if (show_all) {
-    UserQuotas quotas(info);
-    encode_json("quota", quotas, s->formatter);
-  } else if (show_user) {
-    encode_json("user_quota", info.quota.user_quota, s->formatter);
-  } else {
-    encode_json("bucket_quota", info.quota.bucket_quota, s->formatter);
-  }
-
-  flusher.flush();
-}
-
-class RGWOp_Quota_Set : public RGWRESTOp {
-
-public:
-  RGWOp_Quota_Set() {}
-
-  int check_caps(const RGWUserCaps& caps) override {
-    return caps.check_cap("users", RGW_CAP_WRITE);
-  }
-
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "set_quota_info"; }
-};
-
-/**
- * set quota
- *
- * two different ways to set the quota info: as json struct in the message body or via http params.
- *
- * as json:
- *
- * PUT /admin/user?uid=<uid>[&quota-type=<type>]
- *
- * whereas quota-type is optional and is either user, or bucket
- *
- * if quota-type is not specified then we expect to get a structure that contains both quotas,
- * otherwise we'll only get the relevant configuration.
- *
- * E.g., if quota type not specified:
- * {
- *    "user_quota" : {
- *      "max_size_kb" : 4096,
- *      "max_objects" : -1,
- *      "enabled" : false
- *    },
- *    "bucket_quota" : {
- *      "max_size_kb" : 1024,
- *      "max_objects" : -1,
- *      "enabled" : true
- *    }
- * }
- *
- *
- * or if quota type is specified:
- * {
- *   "max_size_kb" : 4096,
- *   "max_objects" : -1,
- *   "enabled" : false
- * }
- *
- * Another option is not to pass any body and set the following http params:
- *
- *
- * max-size-kb=<size>
- * max-objects=<max objects>
- * enabled[={true,false}]
- *
- * all params are optionals and default to the current settings. With this type of configuration the
- * quota-type param is mandatory.
- *
- */
-
-void RGWOp_Quota_Set::execute(optional_yield y)
-{
-  RGWUserAdminOpState op_state(driver);
-
-  std::string uid_str;
-  std::string quota_type;
-
-  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
-  RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
-
-  if (uid_str.empty()) {
-    op_ret = -EINVAL;
-    return;
-  }
-
-  rgw_user uid(uid_str);
-
-  bool set_all = quota_type.empty();
-  bool set_bucket = set_all || (quota_type == "bucket");
-  bool set_user = set_all || (quota_type == "user");
-
-  if (!(set_all || set_bucket || set_user)) {
-    ldpp_dout(this, 20) << "invalid quota type" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  bool use_http_params;
-
-  if (s->content_length > 0) {
-    use_http_params = false;
-  } else {
-    const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
-    use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
-  }
-
-  if (use_http_params && set_all) {
-    ldpp_dout(this, 20) << "quota type was not specified, can't set all quotas via http headers" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  op_state.set_user_id(uid);
-
-  RGWUser user;
-  op_ret = user.init(s, driver, op_state, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 20) << "failed initializing user info: " << op_ret << dendl;
-    return;
-  }
-
-  if (!op_state.has_existing_user()) {
-    op_ret = -ERR_NO_SUCH_USER;
-    return;
-  }
-
-#define QUOTA_INPUT_MAX_LEN 1024
-  if (set_all) {
-    UserQuotas quotas;
-
-    if ((op_ret = get_json_input(driver->ctx(), s, quotas, QUOTA_INPUT_MAX_LEN, NULL)) < 0) {
-      ldpp_dout(this, 20) << "failed to retrieve input" << dendl;
-      return;
-    }
-
-    op_state.set_user_quota(quotas.quota.user_quota);
-    op_state.set_bucket_quota(quotas.quota.bucket_quota);
-  } else {
-    RGWQuotaInfo quota;
-
-    if (!use_http_params) {
-      bool empty;
-      op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
-      if (op_ret < 0) {
-        ldpp_dout(this, 20) << "failed to retrieve input" << dendl;
-        if (!empty)
-          return;
-
-        /* was probably chunked input, but no content provided, configure via http params */
-        use_http_params = true;
-      }
-    }
-
-    if (use_http_params) {
-      RGWUserInfo info;
-      string err_msg;
-      op_ret = user.info(info, &err_msg);
-      if (op_ret < 0) {
-        ldpp_dout(this, 20) << "failed to get user info: " << op_ret << dendl;
-        return;
-      }
-      RGWQuotaInfo *old_quota;
-      if (set_user) {
-        old_quota = &info.quota.user_quota;
-      } else {
-        old_quota = &info.quota.bucket_quota;
-      }
-
-      RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
-      RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size);
-      int64_t max_size_kb;
-      bool has_max_size_kb = false;
-      RESTArgs::get_int64(s, "max-size-kb", 0, &max_size_kb, &has_max_size_kb);
-      if (has_max_size_kb) {
-        quota.max_size = max_size_kb * 1024;
-      }
-      RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
-    }
-
-    if (set_user) {
-      op_state.set_user_quota(quota);
-    } else {
-      op_state.set_bucket_quota(quota);
-    }
-  }
-
-  string err;
-  op_ret = user.modify(s, op_state, y, &err);
-  if (op_ret < 0) {
-    ldpp_dout(this, 20) << "failed updating user info: " << op_ret << ": " << err << dendl;
-    return;
-  }
-}
-
-RGWOp *RGWHandler_User::op_get()
-{
-  if (s->info.args.sub_resource_exists("quota"))
-    return new RGWOp_Quota_Info;
-
-  if (s->info.args.sub_resource_exists("list"))
-    return new RGWOp_User_List;
-
-  return new RGWOp_User_Info;
-}
-
-RGWOp *RGWHandler_User::op_put()
-{
-  if (s->info.args.sub_resource_exists("subuser"))
-    return new RGWOp_Subuser_Create;
-
-  if (s->info.args.sub_resource_exists("key"))
-    return new RGWOp_Key_Create;
-
-  if (s->info.args.sub_resource_exists("caps"))
-    return new RGWOp_Caps_Add;
-
-  if (s->info.args.sub_resource_exists("quota"))
-    return new RGWOp_Quota_Set;
-
-  return new RGWOp_User_Create;
-}
-
-RGWOp *RGWHandler_User::op_post()
-{
-  if (s->info.args.sub_resource_exists("subuser"))
-    return new RGWOp_Subuser_Modify;
-
-  return new RGWOp_User_Modify;
-}
-
-RGWOp *RGWHandler_User::op_delete()
-{
-  if (s->info.args.sub_resource_exists("subuser"))
-    return new RGWOp_Subuser_Remove;
-
-  if (s->info.args.sub_resource_exists("key"))
-    return new RGWOp_Key_Remove;
-
-  if (s->info.args.sub_resource_exists("caps"))
-    return new RGWOp_Caps_Remove;
-
-  return new RGWOp_User_Remove;
-}
-
diff --git a/src/rgw/store/rados/rgw_rest_user.h b/src/rgw/store/rados/rgw_rest_user.h
deleted file mode 100644 (file)
index ee585be..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-#include "rgw_rest.h"
-#include "rgw_rest_s3.h"
-
-
-class RGWHandler_User : public RGWHandler_Auth_S3 {
-protected:
-  RGWOp *op_get() override;
-  RGWOp *op_put() override;
-  RGWOp *op_post() override;
-  RGWOp *op_delete() override;
-public:
-  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
-  ~RGWHandler_User() override = default;
-
-  int read_permissions(RGWOp*, optional_yield) override {
-    return 0;
-  }
-};
-
-class RGWRESTMgr_User : public RGWRESTMgr {
-public:
-  RGWRESTMgr_User() = default;
-  ~RGWRESTMgr_User() override = default;
-
-  RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
-                              req_state*,
-                               const rgw::auth::StrategyRegistry& auth_registry,
-                               const std::string&) override {
-    return new RGWHandler_User(auth_registry);
-  }
-};
diff --git a/src/rgw/store/rados/rgw_sal_rados.cc b/src/rgw/store/rados/rgw_sal_rados.cc
deleted file mode 100644 (file)
index 577569d..0000000
+++ /dev/null
@@ -1,3630 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2020 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include <errno.h>
-#include <stdlib.h>
-#include <system_error>
-#include <filesystem>
-#include <unistd.h>
-#include <sstream>
-#include <boost/algorithm/string.hpp>
-#include <boost/process.hpp>
-
-#include "common/Clock.h"
-#include "common/errno.h"
-
-#include "rgw_sal.h"
-#include "rgw_sal_rados.h"
-#include "rgw_bucket.h"
-#include "rgw_multi.h"
-#include "rgw_acl_s3.h"
-#include "rgw_aio.h"
-#include "rgw_aio_throttle.h"
-#include "rgw_tracer.h"
-
-#include "rgw_zone.h"
-#include "rgw_rest_conn.h"
-#include "rgw_service.h"
-#include "rgw_lc.h"
-#include "rgw_lc_tier.h"
-#include "rgw_rest_admin.h"
-#include "rgw_rest_bucket.h"
-#include "rgw_rest_metadata.h"
-#include "rgw_rest_log.h"
-#include "rgw_rest_config.h"
-#include "rgw_rest_ratelimit.h"
-#include "rgw_rest_realm.h"
-#include "rgw_rest_user.h"
-#include "services/svc_sys_obj.h"
-#include "services/svc_meta.h"
-#include "services/svc_meta_be_sobj.h"
-#include "services/svc_cls.h"
-#include "services/svc_zone.h"
-#include "services/svc_tier_rados.h"
-#include "services/svc_quota.h"
-#include "services/svc_config_key.h"
-#include "services/svc_zone_utils.h"
-#include "services/svc_role_rados.h"
-#include "services/svc_user.h"
-#include "cls/rgw/cls_rgw_client.h"
-
-#include "rgw_pubsub.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-static string mp_ns = RGW_OBJ_NS_MULTIPART;
-
-namespace rgw::sal {
-
-// default number of entries to list with each bucket listing call
-// (use marker to bridge between calls)
-static constexpr size_t listing_max_entries = 1000;
-
-static int decode_policy(CephContext* cct,
-                         bufferlist& bl,
-                         RGWAccessControlPolicy* policy)
-{
-  auto iter = bl.cbegin();
-  try {
-    policy->decode(iter);
-  } catch (buffer::error& err) {
-    ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
-    return -EIO;
-  }
-  if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
-    ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
-    RGWAccessControlPolicy_S3* s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
-    s3policy->to_xml(*_dout);
-    *_dout << dendl;
-  }
-  return 0;
-}
-
-static int rgw_op_get_bucket_policy_from_attr(const DoutPrefixProvider* dpp,
-                                             RadosStore* store,
-                                             User* user,
-                                             Attrs& bucket_attrs,
-                                             RGWAccessControlPolicy* policy,
-                                             optional_yield y)
-{
-  auto aiter = bucket_attrs.find(RGW_ATTR_ACL);
-
-  if (aiter != bucket_attrs.end()) {
-    int ret = decode_policy(store->ctx(), aiter->second, policy);
-    if (ret < 0)
-      return ret;
-  } else {
-    ldout(store->ctx(), 0) << "WARNING: couldn't find acl header for bucket, generating default" << dendl;
-    /* object exists, but policy is broken */
-    int r = user->load_user(dpp, y);
-    if (r < 0)
-      return r;
-
-    policy->create_default(user->get_id(), user->get_display_name());
-  }
-  return 0;
-}
-
-int RadosCompletions::drain()
-{
-  int ret = 0;
-  while (!handles.empty()) {
-    librados::AioCompletion* handle = handles.front();
-    handles.pop_front();
-    handle->wait_for_complete();
-    int r = handle->get_return_value();
-    handle->release();
-    if (r < 0) {
-      ret = r;
-    }
-  }
-  return ret;
-}
-
-int RadosUser::list_buckets(const DoutPrefixProvider* dpp, const std::string& marker,
-                              const std::string& end_marker, uint64_t max, bool need_stats,
-                              BucketList &buckets, optional_yield y)
-{
-  RGWUserBuckets ulist;
-  bool is_truncated = false;
-  int ret;
-
-  buckets.clear();
-  ret = store->ctl()->user->list_buckets(dpp, info.user_id, marker, end_marker, max,
-                                        need_stats, &ulist, &is_truncated, y);
-  if (ret < 0)
-    return ret;
-
-  buckets.set_truncated(is_truncated);
-  for (const auto& ent : ulist.get_buckets()) {
-    buckets.add(std::unique_ptr<Bucket>(new RadosBucket(this->store, ent.second, this)));
-  }
-
-  return 0;
-}
-
-int RadosUser::create_bucket(const DoutPrefixProvider* dpp,
-                                const rgw_bucket& b,
-                                const std::string& zonegroup_id,
-                                rgw_placement_rule& placement_rule,
-                                std::string& swift_ver_location,
-                                const RGWQuotaInfo * pquota_info,
-                                const RGWAccessControlPolicy& policy,
-                                Attrs& attrs,
-                                RGWBucketInfo& info,
-                                obj_version& ep_objv,
-                                bool exclusive,
-                                bool obj_lock_enabled,
-                                bool* existed,
-                                req_info& req_info,
-                                std::unique_ptr<Bucket>* bucket_out,
-                                optional_yield y)
-{
-  int ret;
-  bufferlist in_data;
-  RGWBucketInfo master_info;
-  rgw_bucket* pmaster_bucket;
-  uint32_t* pmaster_num_shards;
-  real_time creation_time;
-  std::unique_ptr<Bucket> bucket;
-  obj_version objv,* pobjv = NULL;
-
-  /* If it exists, look it up; otherwise create it */
-  ret = store->get_bucket(dpp, this, b, &bucket, y);
-  if (ret < 0 && ret != -ENOENT)
-    return ret;
-
-  if (ret != -ENOENT) {
-    RGWAccessControlPolicy old_policy(store->ctx());
-    *existed = true;
-    if (swift_ver_location.empty()) {
-      swift_ver_location = bucket->get_info().swift_ver_location;
-    }
-    placement_rule.inherit_from(bucket->get_info().placement_rule);
-
-    // don't allow changes to the acl policy
-    int r = rgw_op_get_bucket_policy_from_attr(dpp, store, this, bucket->get_attrs(),
-                                              &old_policy, y);
-    if (r >= 0 && old_policy != policy) {
-      bucket_out->swap(bucket);
-      return -EEXIST;
-    }
-  } else {
-    bucket = std::unique_ptr<Bucket>(new RadosBucket(store, b, this));
-    *existed = false;
-    bucket->set_attrs(attrs);
-  }
-
-  if (!store->svc()->zone->is_meta_master()) {
-    JSONParser jp;
-    ret = store->forward_request_to_master(dpp, this, NULL, in_data, &jp, req_info, y);
-    if (ret < 0) {
-      return ret;
-    }
-
-    JSONDecoder::decode_json("entry_point_object_ver", ep_objv, &jp);
-    JSONDecoder::decode_json("object_ver", objv, &jp);
-    JSONDecoder::decode_json("bucket_info", master_info, &jp);
-    ldpp_dout(dpp, 20) << "parsed: objv.tag=" << objv.tag << " objv.ver=" << objv.ver << dendl;
-    std::time_t ctime = ceph::real_clock::to_time_t(master_info.creation_time);
-    ldpp_dout(dpp, 20) << "got creation time: << " << std::put_time(std::localtime(&ctime), "%F %T") << dendl;
-    pmaster_bucket= &master_info.bucket;
-    creation_time = master_info.creation_time;
-    pmaster_num_shards = &master_info.layout.current_index.layout.normal.num_shards;
-    pobjv = &objv;
-    if (master_info.obj_lock_enabled()) {
-      info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED;
-    }
-  } else {
-    pmaster_bucket = NULL;
-    pmaster_num_shards = NULL;
-    if (obj_lock_enabled)
-      info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED;
-  }
-
-  std::string zid = zonegroup_id;
-  if (zid.empty()) {
-    zid = store->svc()->zone->get_zonegroup().get_id();
-  }
-
-  if (*existed) {
-    rgw_placement_rule selected_placement_rule;
-    ret = store->svc()->zone->select_bucket_placement(dpp, this->get_info(),
-                                              zid, placement_rule,
-                                              &selected_placement_rule, nullptr, y);
-    if (selected_placement_rule != info.placement_rule) {
-      ret = -EEXIST;
-      bucket_out->swap(bucket);
-      return ret;
-    }
-  } else {
-
-    ret = store->getRados()->create_bucket(this->get_info(), bucket->get_key(),
-                                   zid, placement_rule, swift_ver_location, pquota_info,
-                                   attrs, info, pobjv, &ep_objv, creation_time,
-                                   pmaster_bucket, pmaster_num_shards, y, dpp,
-                                   exclusive);
-    if (ret == -EEXIST) {
-      *existed = true;
-      /* bucket already existed, might have raced with another bucket creation,
-       * or might be partial bucket creation that never completed. Read existing
-       * bucket info, verify that the reported bucket owner is the current user.
-       * If all is ok then update the user's list of buckets.  Otherwise inform
-       * client about a name conflict.
-       */
-      if (info.owner.compare(this->get_id()) != 0) {
-       return -EEXIST;
-      }
-      ret = 0;
-    } else if (ret != 0) {
-      return ret;
-    }
-  }
-
-  bucket->set_version(ep_objv);
-  bucket->get_info() = info;
-
-  RadosBucket* rbucket = static_cast<RadosBucket*>(bucket.get());
-  ret = rbucket->link(dpp, this, y, false);
-  if (ret && !*existed && ret != -EEXIST) {
-    /* if it exists (or previously existed), don't remove it! */
-    ret = rbucket->unlink(dpp, this, y);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "WARNING: failed to unlink bucket: ret=" << ret
-                      << dendl;
-    }
-  } else if (ret == -EEXIST || (ret == 0 && *existed)) {
-    ret = -ERR_BUCKET_EXISTS;
-  }
-
-  bucket_out->swap(bucket);
-
-  return ret;
-}
-
-int RadosUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y)
-{
-  return store->ctl()->user->get_attrs_by_uid(dpp, get_id(), &attrs, y, &objv_tracker);
-}
-
-int RadosUser::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y)
-{
-  for(auto& it : new_attrs) {
-         attrs[it.first] = it.second;
-  }
-  return store_user(dpp, y, false);
-}
-
-int RadosUser::read_stats(const DoutPrefixProvider *dpp,
-                             optional_yield y, RGWStorageStats* stats,
-                            ceph::real_time* last_stats_sync,
-                            ceph::real_time* last_stats_update)
-{
-  return store->ctl()->user->read_stats(dpp, get_id(), stats, y, last_stats_sync, last_stats_update);
-}
-
-int RadosUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb)
-{
-  return store->svc()->user->read_stats_async(dpp, get_id(), cb);
-}
-
-int RadosUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  return store->svc()->user->complete_flush_stats(dpp, get_id(), y);
-}
-
-int RadosUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
-                              uint32_t max_entries, bool* is_truncated,
-                              RGWUsageIter& usage_iter,
-                              map<rgw_user_bucket, rgw_usage_log_entry>& usage)
-{
-  std::string bucket_name;
-  return store->getRados()->read_usage(dpp, get_id(), bucket_name, start_epoch,
-                                      end_epoch, max_entries, is_truncated,
-                                      usage_iter, usage);
-}
-
-int RadosUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
-{
-  std::string bucket_name;
-
-  return store->getRados()->trim_usage(dpp, get_id(), bucket_name, start_epoch, end_epoch);
-}
-
-int RadosUser::load_user(const DoutPrefixProvider* dpp, optional_yield y)
-{
-    return store->ctl()->user->get_info_by_uid(dpp, info.user_id, &info, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker).set_attrs(&attrs));
-}
-
-int RadosUser::store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info)
-{
-    return store->ctl()->user->store_info(dpp, info, y,
-                                         RGWUserCtl::PutParams().set_objv_tracker(&objv_tracker)
-                                         .set_exclusive(exclusive)
-                                         .set_attrs(&attrs)
-                                         .set_old_info(old_info));
-}
-
-int RadosUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y)
-{
-    return store->ctl()->user->remove_info(dpp, info, y,
-                                         RGWUserCtl::RemoveParams().set_objv_tracker(&objv_tracker));
-}
-
-int RadosUser::verify_mfa(const std::string& mfa_str, bool* verified,
-                         const DoutPrefixProvider* dpp, optional_yield y)
-{
-  vector<string> params;
-  get_str_vec(mfa_str, " ", params);
-
-  if (params.size() != 2) {
-    ldpp_dout(dpp, 5) << "NOTICE: invalid mfa string provided: " << mfa_str << dendl;
-    return -EINVAL;
-  }
-
-  string& serial = params[0];
-  string& pin = params[1];
-
-  auto i = info.mfa_ids.find(serial);
-  if (i == info.mfa_ids.end()) {
-    ldpp_dout(dpp, 5) << "NOTICE: user does not have mfa device with serial=" << serial << dendl;
-    return -EACCES;
-  }
-
-  int ret = store->svc()->cls->mfa.check_mfa(dpp, info.user_id, serial, pin, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 20) << "NOTICE: failed to check MFA, serial=" << serial << dendl;
-    return -EACCES;
-  }
-
-  *verified = true;
-
-  return 0;
-}
-
-RadosBucket::~RadosBucket() {}
-
-int RadosBucket::remove_bucket(const DoutPrefixProvider* dpp,
-                              bool delete_children,
-                              bool forward_to_master,
-                              req_info* req_info,
-                              optional_yield y)
-{
-  int ret;
-
-  // Refresh info
-  ret = load_bucket(dpp, y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  ListParams params;
-  params.list_versions = true;
-  params.allow_unordered = true;
-
-  ListResults results;
-
-  do {
-    results.objs.clear();
-
-    ret = list(dpp, params, 1000, results, y);
-    if (ret < 0) {
-      return ret;
-    }
-
-    if (!results.objs.empty() && !delete_children) {
-      ldpp_dout(dpp, -1) << "ERROR: could not remove non-empty bucket " << info.bucket.name <<
-       dendl;
-      return -ENOTEMPTY;
-    }
-
-    for (const auto& obj : results.objs) {
-      rgw_obj_key key(obj.key);
-      /* xxx dang */
-      ret = rgw_remove_object(dpp, store, this, key);
-      if (ret < 0 && ret != -ENOENT) {
-       return ret;
-      }
-    }
-  } while(results.is_truncated);
-
-  ret = abort_multiparts(dpp, store->ctx());
-  if (ret < 0) {
-    return ret;
-  }
-
-  // remove lifecycle config, if any (XXX note could be made generic)
-  (void) store->getRados()->get_lc()->remove_bucket_config(
-    this, get_attrs());
-
-  ret = store->ctl()->bucket->sync_user_stats(dpp, info.owner, info, y, nullptr);
-  if (ret < 0) {
-     ldout(store->ctx(), 1) << "WARNING: failed sync user stats before bucket delete. ret=" <<  ret << dendl;
-  }
-
-  RGWObjVersionTracker ot;
-
-  // if we deleted children above we will force delete, as any that
-  // remain is detrius from a prior bug
-  ret = store->getRados()->delete_bucket(info, ot, y, dpp, !delete_children);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: could not remove bucket " <<
-      info.bucket.name << dendl;
-    return ret;
-  }
-
-  // if bucket has notification definitions associated with it
-  // they should be removed (note that any pending notifications on the bucket are still going to be sent)
-  RGWPubSub ps(store, info.owner.tenant);
-  RGWPubSub::Bucket ps_bucket(&ps, info.bucket);
-  const auto ps_ret = ps_bucket.remove_notifications(dpp, y);
-  if (ps_ret < 0 && ps_ret != -ENOENT) {
-    ldpp_dout(dpp, -1) << "ERROR: unable to remove notifications from bucket. ret=" << ps_ret << dendl;
-  }
-
-  ret = store->ctl()->bucket->unlink_bucket(info.owner, info.bucket, y, dpp, false);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: unable to remove user bucket information" << dendl;
-  }
-
-  if (forward_to_master) {
-    bufferlist in_data;
-    ret = store->forward_request_to_master(dpp, owner, &ot.read_version, in_data, nullptr, *req_info, y);
-    if (ret < 0) {
-      if (ret == -ENOENT) {
-       /* adjust error, we want to return with NoSuchBucket and not
-        * NoSuchKey */
-       ret = -ERR_NO_SUCH_BUCKET;
-      }
-      return ret;
-    }
-  }
-
-  return ret;
-}
-
-int RadosBucket::remove_bucket_bypass_gc(int concurrent_max, bool
-                                        keep_index_consistent,
-                                        optional_yield y, const
-                                        DoutPrefixProvider *dpp)
-{
-  int ret;
-  map<RGWObjCategory, RGWStorageStats> stats;
-  map<string, bool> common_prefixes;
-  RGWObjectCtx obj_ctx(store);
-  CephContext *cct = store->ctx();
-
-  string bucket_ver, master_ver;
-
-  ret = load_bucket(dpp, null_yield);
-  if (ret < 0)
-    return ret;
-
-  const auto& index = info.get_current_index();
-  ret = read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, NULL);
-  if (ret < 0)
-    return ret;
-
-  ret = abort_multiparts(dpp, cct);
-  if (ret < 0) {
-    return ret;
-  }
-
-  rgw::sal::Bucket::ListParams params;
-  rgw::sal::Bucket::ListResults results;
-
-  params.list_versions = true;
-  params.allow_unordered = true;
-
-  std::unique_ptr<rgw::sal::Completions> handles = store->get_completions();
-
-  int max_aio = concurrent_max;
-  results.is_truncated = true;
-
-  while (results.is_truncated) {
-    ret = list(dpp, params, listing_max_entries, results, null_yield);
-    if (ret < 0)
-      return ret;
-
-    std::vector<rgw_bucket_dir_entry>::iterator it = results.objs.begin();
-    for (; it != results.objs.end(); ++it) {
-      RGWObjState *astate = NULL;
-      RGWObjManifest *amanifest = nullptr;
-      std::unique_ptr<rgw::sal::Object> obj = get_object((*it).key);
-
-      ret = store->getRados()->get_obj_state(dpp, &obj_ctx, obj->get_bucket()->get_info(),
-                                            obj.get(), &astate, &amanifest,
-                                            false, y);
-      if (ret == -ENOENT) {
-        ldpp_dout(dpp, 1) << "WARNING: cannot find obj state for obj " << obj << dendl;
-        continue;
-      }
-      if (ret < 0) {
-        ldpp_dout(dpp, -1) << "ERROR: get obj state returned with error " << ret << dendl;
-        return ret;
-      }
-
-      if (amanifest) {
-        RGWObjManifest& manifest = *amanifest;
-        RGWObjManifest::obj_iterator miter = manifest.obj_begin(dpp);
-       std::unique_ptr<rgw::sal::Object> head_obj = get_object(manifest.get_obj().key);
-        rgw_raw_obj raw_head_obj;
-       dynamic_cast<RadosObject*>(head_obj.get())->get_raw_obj(&raw_head_obj);
-
-        for (; miter != manifest.obj_end(dpp) && max_aio--; ++miter) {
-          if (!max_aio) {
-            ret = handles->drain();
-            if (ret < 0) {
-              ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
-              return ret;
-            }
-            max_aio = concurrent_max;
-          }
-
-          rgw_raw_obj last_obj = miter.get_location().get_raw_obj(store);
-          if (last_obj == raw_head_obj) {
-            // have the head obj deleted at the end
-            continue;
-          }
-
-          ret = store->delete_raw_obj_aio(dpp, last_obj, handles.get());
-          if (ret < 0) {
-            ldpp_dout(dpp, -1) << "ERROR: delete obj aio failed with " << ret << dendl;
-            return ret;
-          }
-        } // for all shadow objs
-
-       ret = head_obj->delete_obj_aio(dpp, astate, handles.get(), keep_index_consistent, null_yield);
-        if (ret < 0) {
-          ldpp_dout(dpp, -1) << "ERROR: delete obj aio failed with " << ret << dendl;
-          return ret;
-        }
-      }
-
-      if (!max_aio) {
-        ret = handles->drain();
-        if (ret < 0) {
-          ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
-          return ret;
-        }
-        max_aio = concurrent_max;
-      }
-      obj_ctx.invalidate(obj->get_obj());
-    } // for all RGW objects in results
-  } // while is_truncated
-
-  ret = handles->drain();
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
-    return ret;
-  }
-
-  sync_user_stats(dpp, y);
-  if (ret < 0) {
-     ldpp_dout(dpp, 1) << "WARNING: failed sync user stats before bucket delete. ret=" <<  ret << dendl;
-  }
-
-  RGWObjVersionTracker objv_tracker;
-
-  // this function can only be run if caller wanted children to be
-  // deleted, so we can ignore the check for children as any that
-  // remain are detritus from a prior bug
-  ret = remove_bucket(dpp, true, false, nullptr, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: could not remove bucket " << this << dendl;
-    return ret;
-  }
-
-  return ret;
-}
-
-int RadosBucket::load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats)
-{
-  int ret;
-
-  RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj();
-  RGWObjVersionTracker ep_ot;
-  if (info.bucket.bucket_id.empty()) {
-    ret = store->ctl()->bucket->read_bucket_info(info.bucket, &info, y, dpp,
-                                     RGWBucketCtl::BucketInstance::GetParams()
-                                     .set_mtime(&mtime)
-                                     .set_attrs(&attrs)
-                                      .set_bectx_params(bectx_params),
-                                     &ep_ot);
-  } else {
-    ret  = store->ctl()->bucket->read_bucket_instance_info(info.bucket, &info, y, dpp,
-                                     RGWBucketCtl::BucketInstance::GetParams()
-                                     .set_mtime(&mtime)
-                                     .set_attrs(&attrs)
-                                     .set_bectx_params(bectx_params));
-  }
-  if (ret != 0) {
-    return ret;
-  }
-
-  bucket_version = ep_ot.read_version;
-
-  if (get_stats) {
-    ret = store->ctl()->bucket->read_bucket_stats(info.bucket, &ent, y, dpp);
-  }
-
-  return ret;
-}
-
-int RadosBucket::read_stats(const DoutPrefixProvider *dpp,
-                           const bucket_index_layout_generation& idx_layout,
-                           int shard_id, std::string* bucket_ver, std::string* master_ver,
-                           std::map<RGWObjCategory, RGWStorageStats>& stats,
-                           std::string* max_marker, bool* syncstopped)
-{
-  return store->getRados()->get_bucket_stats(dpp, info, idx_layout, shard_id, bucket_ver, master_ver, stats, max_marker, syncstopped);
-}
-
-int RadosBucket::read_stats_async(const DoutPrefixProvider *dpp,
-                                 const bucket_index_layout_generation& idx_layout,
-                                 int shard_id, RGWGetBucketStats_CB* ctx)
-{
-  return store->getRados()->get_bucket_stats_async(dpp, get_info(), idx_layout, shard_id, ctx);
-}
-
-int RadosBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  return store->ctl()->bucket->sync_user_stats(dpp, owner->get_id(), info, y, &ent);
-}
-
-int RadosBucket::update_container_stats(const DoutPrefixProvider* dpp)
-{
-  int ret;
-  map<std::string, RGWBucketEnt> m;
-
-  m[info.bucket.name] = ent;
-  ret = store->getRados()->update_containers_stats(m, dpp);
-  if (!ret)
-    return -EEXIST;
-  if (ret < 0)
-    return ret;
-
-  map<std::string, RGWBucketEnt>::iterator iter = m.find(info.bucket.name);
-  if (iter == m.end())
-    return -EINVAL;
-
-  ent.count = iter->second.count;
-  ent.size = iter->second.size;
-  ent.size_rounded = iter->second.size_rounded;
-  ent.creation_time = iter->second.creation_time;
-  ent.placement_rule = std::move(iter->second.placement_rule);
-
-  info.creation_time = ent.creation_time;
-  info.placement_rule = ent.placement_rule;
-
-  return 0;
-}
-
-int RadosBucket::check_bucket_shards(const DoutPrefixProvider* dpp)
-{
-      return store->getRados()->check_bucket_shards(info, info.bucket, get_count(), dpp);
-}
-
-int RadosBucket::link(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint, RGWObjVersionTracker* objv)
-{
-  RGWBucketEntryPoint ep;
-  ep.bucket = info.bucket;
-  ep.owner = new_user->get_id();
-  ep.creation_time = get_creation_time();
-  ep.linked = true;
-  Attrs ep_attrs;
-  rgw_ep_info ep_data{ep, ep_attrs};
-
-  int r = store->ctl()->bucket->link_bucket(new_user->get_id(), info.bucket,
-                                           get_creation_time(), y, dpp, update_entrypoint,
-                                           &ep_data);
-  if (r < 0)
-    return r;
-
-  if (objv)
-    *objv = ep_data.ep_objv;
-
-  return r;
-}
-
-int RadosBucket::unlink(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint)
-{
-  return store->ctl()->bucket->unlink_bucket(new_user->get_id(), info.bucket, y, dpp, update_entrypoint);
-}
-
-int RadosBucket::chown(const DoutPrefixProvider* dpp, User* new_user, User* old_user, optional_yield y, const std::string* marker)
-{
-  std::string obj_marker;
-
-  if (marker == nullptr)
-    marker = &obj_marker;
-
-  int r = this->link(dpp, new_user, y);
-  if (r < 0) {
-    return r;
-  }
-  if (!old_user) {
-    return r;
-  }
-
-  return store->ctl()->bucket->chown(store, this, new_user->get_id(),
-                          old_user->get_display_name(), *marker, y, dpp);
-}
-
-int RadosBucket::put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time _mtime)
-{
-  mtime = _mtime;
-  return store->getRados()->put_bucket_instance_info(info, exclusive, mtime, &attrs, dpp);
-}
-
-/* Make sure to call get_bucket_info() if you need it first */
-bool RadosBucket::is_owner(User* user)
-{
-  return (info.owner.compare(user->get_id()) == 0);
-}
-
-int RadosBucket::check_empty(const DoutPrefixProvider* dpp, optional_yield y)
-{
-  return store->getRados()->check_bucket_empty(dpp, info, y);
-}
-
-int RadosBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size,
-                               optional_yield y, bool check_size_only)
-{
-    return store->getRados()->check_quota(dpp, owner->get_id(), get_key(),
-                                         quota, obj_size, y, check_size_only);
-}
-
-int RadosBucket::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y)
-{
-  for(auto& it : new_attrs) {
-         attrs[it.first] = it.second;
-  }
-  return store->ctl()->bucket->set_bucket_instance_attrs(get_info(),
-                               new_attrs, &get_info().objv_tracker, y, dpp);
-}
-
-int RadosBucket::try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime)
-{
-  return store->getRados()->try_refresh_bucket_info(info, pmtime, dpp, &attrs);
-}
-
-int RadosBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
-                              uint32_t max_entries, bool* is_truncated,
-                              RGWUsageIter& usage_iter,
-                              map<rgw_user_bucket, rgw_usage_log_entry>& usage)
-{
-  return store->getRados()->read_usage(dpp, owner->get_id(), get_name(), start_epoch,
-                                      end_epoch, max_entries, is_truncated,
-                                      usage_iter, usage);
-}
-
-int RadosBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
-{
-  return store->getRados()->trim_usage(dpp, owner->get_id(), get_name(), start_epoch, end_epoch);
-}
-
-int RadosBucket::remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink)
-{
-  return store->getRados()->remove_objs_from_index(dpp, info, objs_to_unlink);
-}
-
-int RadosBucket::check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats)
-{
-  return store->getRados()->bucket_check_index(dpp, info, &existing_stats, &calculated_stats);
-}
-
-int RadosBucket::rebuild_index(const DoutPrefixProvider *dpp)
-{
-  return store->getRados()->bucket_rebuild_index(dpp, info);
-}
-
-int RadosBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout)
-{
-  return store->getRados()->cls_obj_set_bucket_tag_timeout(dpp, info, timeout);
-}
-
-int RadosBucket::purge_instance(const DoutPrefixProvider* dpp)
-{
-  int max_shards = (info.layout.current_index.layout.normal.num_shards > 0 ? info.layout.current_index.layout.normal.num_shards : 1);
-  for (int i = 0; i < max_shards; i++) {
-    RGWRados::BucketShard bs(store->getRados());
-    int shard_id = (info.layout.current_index.layout.normal.num_shards > 0  ? i : -1);
-    int ret = bs.init(dpp, info, info.layout.current_index, shard_id);
-    if (ret < 0) {
-      cerr << "ERROR: bs.init(bucket=" << info.bucket << ", shard=" << shard_id
-           << "): " << cpp_strerror(-ret) << std::endl;
-      return ret;
-    }
-    ret = store->getRados()->bi_remove(dpp, bs);
-    if (ret < 0) {
-      cerr << "ERROR: failed to remove bucket index object: "
-           << cpp_strerror(-ret) << std::endl;
-      return ret;
-    }
-  }
-  return 0;
-}
-
-int RadosBucket::set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy &acl, optional_yield y)
-{
-  bufferlist aclbl;
-
-  acls = acl;
-  acl.encode(aclbl);
-  map<string, bufferlist>& attrs = get_attrs();
-
-  attrs[RGW_ATTR_ACL] = aclbl;
-  info.owner = acl.get_owner().get_id();
-
-  int r = store->ctl()->bucket->store_bucket_instance_info(info.bucket,
-                 info, y, dpp,
-                 RGWBucketCtl::BucketInstance::PutParams().set_attrs(&attrs));
-  if (r < 0) {
-    cerr << "ERROR: failed to set bucket owner: " << cpp_strerror(-r) << std::endl;
-    return r;
-  }
-  
-  return 0;
-}
-
-std::unique_ptr<Object> RadosBucket::get_object(const rgw_obj_key& k)
-{
-  return std::make_unique<RadosObject>(this->store, k, this);
-}
-
-int RadosBucket::list(const DoutPrefixProvider* dpp, ListParams& params, int max, ListResults& results, optional_yield y)
-{
-  RGWRados::Bucket target(store->getRados(), get_info());
-  if (params.shard_id >= 0) {
-    target.set_shard_id(params.shard_id);
-  }
-  RGWRados::Bucket::List list_op(&target);
-
-  list_op.params.prefix = params.prefix;
-  list_op.params.delim = params.delim;
-  list_op.params.marker = params.marker;
-  list_op.params.ns = params.ns;
-  list_op.params.end_marker = params.end_marker;
-  list_op.params.ns = params.ns;
-  list_op.params.enforce_ns = params.enforce_ns;
-  list_op.params.access_list_filter = params.access_list_filter;
-  list_op.params.force_check_filter = params.force_check_filter;
-  list_op.params.list_versions = params.list_versions;
-  list_op.params.allow_unordered = params.allow_unordered;
-
-  int ret = list_op.list_objects(dpp, max, &results.objs, &results.common_prefixes, &results.is_truncated, y);
-  if (ret >= 0) {
-    results.next_marker = list_op.get_next_marker();
-    params.marker = results.next_marker;
-  }
-
-  return ret;
-}
-
-std::unique_ptr<MultipartUpload> RadosBucket::get_multipart_upload(
-                                 const std::string& oid,
-                                 std::optional<std::string> upload_id,
-                                 ACLOwner owner, ceph::real_time mtime)
-{
-  return std::make_unique<RadosMultipartUpload>(this->store, this, oid, upload_id,
-                                               std::move(owner), mtime);
-}
-
-int RadosBucket::list_multiparts(const DoutPrefixProvider *dpp,
-                                const string& prefix,
-                                string& marker,
-                                const string& delim,
-                                const int& max_uploads,
-                                vector<std::unique_ptr<MultipartUpload>>& uploads,
-                                map<string, bool> *common_prefixes,
-                                bool *is_truncated)
-{
-  rgw::sal::Bucket::ListParams params;
-  rgw::sal::Bucket::ListResults results;
-  MultipartMetaFilter mp_filter;
-
-  params.prefix = prefix;
-  params.delim = delim;
-  params.marker = marker;
-  params.ns = RGW_OBJ_NS_MULTIPART;
-  params.access_list_filter = &mp_filter;
-
-  int ret = list(dpp, params, max_uploads, results, null_yield);
-
-  if (ret < 0)
-    return ret;
-
-  if (!results.objs.empty()) {
-    for (const rgw_bucket_dir_entry& dentry : results.objs) {
-      rgw_obj_key key(dentry.key);
-      ACLOwner owner(rgw_user(dentry.meta.owner));
-      owner.set_name(dentry.meta.owner_display_name);
-      uploads.push_back(this->get_multipart_upload(key.name,
-                       std::nullopt, std::move(owner)));
-    }
-  }
-  if (common_prefixes) {
-    *common_prefixes = std::move(results.common_prefixes);
-  }
-  *is_truncated = results.is_truncated;
-  marker = params.marker.name;
-
-  return 0;
-}
-
-int RadosBucket::abort_multiparts(const DoutPrefixProvider* dpp,
-                                 CephContext* cct)
-{
-  constexpr int max = 1000;
-  int ret, num_deleted = 0;
-  vector<std::unique_ptr<MultipartUpload>> uploads;
-  string marker;
-  bool is_truncated;
-
-  const std::string empty_delim;
-  const std::string empty_prefix;
-
-  do {
-    ret = list_multiparts(dpp, empty_prefix, marker, empty_delim,
-                         max, uploads, nullptr, &is_truncated);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << __func__ <<
-       " ERROR : calling list_bucket_multiparts; ret=" << ret <<
-       "; bucket=\"" << this << "\"" << dendl;
-      return ret;
-    }
-    ldpp_dout(dpp, 20) << __func__ <<
-      " INFO: aborting and cleaning up multipart upload(s); bucket=\"" <<
-      this << "\"; uploads.size()=" << uploads.size() <<
-      "; is_truncated=" << is_truncated << dendl;
-
-    if (!uploads.empty()) {
-      for (const auto& upload : uploads) {
-       ret = upload->abort(dpp, cct);
-        if (ret < 0) {
-         // we're doing a best-effort; if something cannot be found,
-         // log it and keep moving forward
-         if (ret != -ENOENT && ret != -ERR_NO_SUCH_UPLOAD) {
-           ldpp_dout(dpp, 0) << __func__ <<
-             " ERROR : failed to abort and clean-up multipart upload \"" <<
-             upload->get_meta() << "\"" << dendl;
-           return ret;
-         } else {
-           ldpp_dout(dpp, 10) << __func__ <<
-             " NOTE : unable to find part(s) of "
-             "aborted multipart upload of \"" << upload->get_meta() <<
-             "\" for cleaning up" << dendl;
-         }
-        }
-        num_deleted++;
-      }
-      if (num_deleted) {
-        ldpp_dout(dpp, 0) << __func__ <<
-         " WARNING : aborted " << num_deleted <<
-         " incomplete multipart uploads" << dendl;
-      }
-    }
-  } while (is_truncated);
-
-  return 0;
-}
-
-std::unique_ptr<User> RadosStore::get_user(const rgw_user &u)
-{
-  return std::make_unique<RadosUser>(this, u);
-}
-
-std::string RadosStore::get_cluster_id(const DoutPrefixProvider* dpp,  optional_yield y)
-{
-  return getRados()->get_cluster_fsid(dpp, y);
-}
-
-int RadosStore::get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user)
-{
-  RGWUserInfo uinfo;
-  User* u;
-  RGWObjVersionTracker objv_tracker;
-
-  int r = ctl()->user->get_info_by_access_key(dpp, key, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
-  if (r < 0)
-    return r;
-
-  u = new RadosUser(this, uinfo);
-  if (!u)
-    return -ENOMEM;
-
-  u->get_version_tracker() = objv_tracker;
-
-  user->reset(u);
-  return 0;
-}
-
-int RadosStore::get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user)
-{
-  RGWUserInfo uinfo;
-  User* u;
-  RGWObjVersionTracker objv_tracker;
-
-  int r = ctl()->user->get_info_by_email(dpp, email, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
-  if (r < 0)
-    return r;
-
-  u = new RadosUser(this, uinfo);
-  if (!u)
-    return -ENOMEM;
-
-  u->get_version_tracker() = objv_tracker;
-
-  user->reset(u);
-  return 0;
-}
-
-int RadosStore::get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user)
-{
-  RGWUserInfo uinfo;
-  User* u;
-  RGWObjVersionTracker objv_tracker;
-
-  int r = ctl()->user->get_info_by_swift(dpp, user_str, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
-  if (r < 0)
-    return r;
-
-  u = new RadosUser(this, uinfo);
-  if (!u)
-    return -ENOMEM;
-
-  u->get_version_tracker() = objv_tracker;
-
-  user->reset(u);
-  return 0;
-}
-
-std::unique_ptr<Object> RadosStore::get_object(const rgw_obj_key& k)
-{
-  return std::make_unique<RadosObject>(this, k);
-}
-
-int RadosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y)
-{
-  int ret;
-  Bucket* bp;
-
-  bp = new RadosBucket(this, b, u);
-  ret = bp->load_bucket(dpp, y);
-  if (ret < 0) {
-    delete bp;
-    return ret;
-  }
-
-  bucket->reset(bp);
-  return 0;
-}
-
-int RadosStore::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket)
-{
-  Bucket* bp;
-
-  bp = new RadosBucket(this, i, u);
-  /* Don't need to fetch the bucket info, use the provided one */
-
-  bucket->reset(bp);
-  return 0;
-}
-
-int RadosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr<Bucket>* bucket, optional_yield y)
-{
-  rgw_bucket b;
-
-  b.tenant = tenant;
-  b.name = name;
-
-  return get_bucket(dpp, u, b, bucket, y);
-}
-
-bool RadosStore::is_meta_master()
-{
-  return svc()->zone->is_meta_master();
-}
-
-int RadosStore::forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
-                                            bufferlist& in_data,
-                                            JSONParser* jp, req_info& info,
-                                            optional_yield y)
-{
-  if (is_meta_master()) {
-    /* We're master, don't forward */
-    return 0;
-  }
-
-  if (!svc()->zone->get_master_conn()) {
-    ldpp_dout(dpp, 0) << "rest connection is invalid" << dendl;
-    return -EINVAL;
-  }
-  ldpp_dout(dpp, 0) << "sending request to master zonegroup" << dendl;
-  bufferlist response;
-  std::string uid_str = user->get_id().to_str();
-#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
-  int ret = svc()->zone->get_master_conn()->forward(dpp, rgw_user(uid_str), info,
-                                                    objv, MAX_REST_RESPONSE,
-                                                   &in_data, &response, y);
-  if (ret < 0)
-    return ret;
-
-  ldpp_dout(dpp, 20) << "response: " << response.c_str() << dendl;
-  if (jp && !jp->parse(response.c_str(), response.length())) {
-    ldpp_dout(dpp, 0) << "failed parsing response from master zonegroup" << dendl;
-    return -EINVAL;
-  }
-
-  return 0;
-}
-
-int RadosStore::forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
-                                            bufferlist& in_data,
-                                            RGWXMLDecoder::XMLParser* parser, req_info& info,
-                                            optional_yield y)
-{
-  if (is_meta_master()) {
-    /* We're master, don't forward */
-    return 0;
-  }
-
-  if (!svc()->zone->get_master_conn()) {
-    ldpp_dout(dpp, 0) << "rest connection is invalid" << dendl;
-    return -EINVAL;
-  }
-  ldpp_dout(dpp, 0) << "sending request to master zonegroup" << dendl;
-  bufferlist response;
-#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
-  int ret = svc()->zone->get_master_conn()->forward_iam_request(dpp, key, info,
-                                                    objv, MAX_REST_RESPONSE,
-                                                                                       &in_data, &response, y);
-  if (ret < 0)
-    return ret;
-
-  ldpp_dout(dpp, 20) << "response: " << response.c_str() << dendl;
-
-  std::string r = response.c_str();
-  std::string str_to_search = "&quot;";
-  std::string str_to_replace = "\"";
-  boost::replace_all(r, str_to_search, str_to_replace);
-  ldpp_dout(dpp, 20) << "r: " << r.c_str() << dendl;
-
-  if (parser && !parser->parse(r.c_str(), r.length(), 1)) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to parse response from master zonegroup" << dendl;
-    return -EIO;
-  }
-
-  return 0;
-}
-
-std::string RadosStore::zone_unique_id(uint64_t unique_num)
-{
-  return svc()->zone_utils->unique_id(unique_num);
-}
-
-std::string RadosStore::zone_unique_trans_id(const uint64_t unique_num)
-{
-  return svc()->zone_utils->unique_trans_id(unique_num);
-}
-
-int RadosStore::get_zonegroup(const std::string& id,
-                             std::unique_ptr<ZoneGroup>* zonegroup)
-{
-  ZoneGroup* zg;
-  RGWZoneGroup rzg;
-  int r = svc()->zone->get_zonegroup(id, rzg);
-  if (r < 0)
-    return r;
-
-  zg = new RadosZoneGroup(this, rzg);
-  if (!zg)
-    return -ENOMEM;
-
-  zonegroup->reset(zg);
-  return 0;
-}
-
-int RadosStore::list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids)
-{
-  return svc()->zone->list_zones(dpp, zone_ids);
-}
-
-int RadosStore::cluster_stat(RGWClusterStat& stats)
-{
-  rados_cluster_stat_t rados_stats;
-  int ret;
-
-  ret = rados->get_rados_handle()->cluster_stat(rados_stats);
-  if (ret < 0)
-    return ret;
-
-  stats.kb = rados_stats.kb;
-  stats.kb_used = rados_stats.kb_used;
-  stats.kb_avail = rados_stats.kb_avail;
-  stats.num_objects = rados_stats.num_objects;
-
-  return ret;
-}
-
-std::unique_ptr<Lifecycle> RadosStore::get_lifecycle(void)
-{
-  return std::make_unique<RadosLifecycle>(this);
-}
-
-std::unique_ptr<Completions> RadosStore::get_completions(void)
-{
-  return std::make_unique<RadosCompletions>();
-}
-
-std::unique_ptr<Notification> RadosStore::get_notification(
-  rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, rgw::notify::EventType event_type, const std::string* object_name)
-{
-  return std::make_unique<RadosNotification>(s, this, obj, src_obj, s, event_type, object_name);
-}
-
-std::unique_ptr<Notification> RadosStore::get_notification(const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj, rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y)
-{
-  return std::make_unique<RadosNotification>(dpp, this, obj, src_obj, event_type, _bucket, _user_id, _user_tenant, _req_id, y);
-}
-
-int RadosStore::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
-{
-  return rados->delete_raw_obj(dpp, obj);
-}
-
-int RadosStore::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, Completions* aio)
-{
-  RadosCompletions* raio = static_cast<RadosCompletions*>(aio);
-
-  return rados->delete_raw_obj_aio(dpp, obj, raio->handles);
-}
-
-void RadosStore::get_raw_obj(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj* raw_obj)
-{
-    rados->obj_to_raw(placement_rule, obj, raw_obj);
-}
-
-int RadosStore::get_raw_chunk_size(const DoutPrefixProvider* dpp, const rgw_raw_obj& obj, uint64_t* chunk_size)
-{
-  return rados->get_max_chunk_size(obj.pool, chunk_size, dpp);
-}
-
-int RadosStore::initialize(CephContext *cct, const DoutPrefixProvider *dpp)
-{
-  std::unique_ptr<ZoneGroup> zg =
-    std::make_unique<RadosZoneGroup>(this, svc()->zone->get_zonegroup());
-  zone = make_unique<RadosZone>(this, std::move(zg));
-  return 0;
-}
-
-int RadosStore::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
-{
-    return rados->log_usage(dpp, usage_info);
-}
-
-int RadosStore::log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl)
-{
-  rgw_raw_obj obj(svc()->zone->get_zone_params().log_pool, oid);
-
-  int ret = rados->append_async(dpp, obj, bl.length(), bl);
-  if (ret == -ENOENT) {
-    ret = rados->create_pool(dpp, svc()->zone->get_zone_params().log_pool);
-    if (ret < 0)
-      return ret;
-    // retry
-    ret = rados->append_async(dpp, obj, bl.length(), bl);
-  }
-
-  return ret;
-}
-
-int RadosStore::register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
-                                          const map<std::string, std::string>& meta)
-{
-  return rados->register_to_service_map(dpp, daemon_type, meta);
-}
-
-void RadosStore::get_quota(RGWQuota& quota)
-{
-    quota.bucket_quota = svc()->quota->get_bucket_quota();
-    quota.user_quota = svc()->quota->get_user_quota();
-}
-
-void RadosStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit)
-{
-  bucket_ratelimit = svc()->zone->get_current_period().get_config().bucket_ratelimit;
-  user_ratelimit = svc()->zone->get_current_period().get_config().user_ratelimit;
-  anon_ratelimit = svc()->zone->get_current_period().get_config().anon_ratelimit;
-}
-
-int RadosStore::set_buckets_enabled(const DoutPrefixProvider* dpp, vector<rgw_bucket>& buckets, bool enabled)
-{
-    return rados->set_buckets_enabled(buckets, enabled, dpp);
-}
-
-int RadosStore::get_sync_policy_handler(const DoutPrefixProvider* dpp,
-                                          std::optional<rgw_zone_id> zone,
-                                          std::optional<rgw_bucket> bucket,
-                                          RGWBucketSyncPolicyHandlerRef* phandler,
-                                          optional_yield y)
-{
-  return ctl()->bucket->get_sync_policy_handler(zone, bucket, phandler, y, dpp);
-}
-
-RGWDataSyncStatusManager* RadosStore::get_data_sync_manager(const rgw_zone_id& source_zone)
-{
-  return rados->get_data_sync_manager(source_zone);
-}
-
-int RadosStore::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
-                                 uint32_t max_entries, bool* is_truncated,
-                                 RGWUsageIter& usage_iter,
-                                 map<rgw_user_bucket, rgw_usage_log_entry>& usage)
-{
-  rgw_user uid;
-  std::string bucket_name;
-
-  return rados->read_usage(dpp, uid, bucket_name, start_epoch, end_epoch, max_entries,
-                          is_truncated, usage_iter, usage);
-}
-
-int RadosStore::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
-{
-  rgw_user uid;
-  std::string bucket_name;
-
-  return rados->trim_usage(dpp, uid, bucket_name, start_epoch, end_epoch);
-}
-
-int RadosStore::get_config_key_val(std::string name, bufferlist* bl)
-{
-  return svc()->config_key->get(name, true, bl);
-}
-
-int RadosStore::meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle)
-{
-  return ctl()->meta.mgr->list_keys_init(dpp, section, marker, phandle);
-}
-
-int RadosStore::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, list<std::string>& keys, bool* truncated)
-{
-  return ctl()->meta.mgr->list_keys_next(dpp, handle, max, keys, truncated);
-}
-
-void RadosStore::meta_list_keys_complete(void* handle)
-{
-  ctl()->meta.mgr->list_keys_complete(handle);
-}
-
-std::string RadosStore::meta_get_marker(void* handle)
-{
-  return ctl()->meta.mgr->get_marker(handle);
-}
-
-int RadosStore::meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y)
-{
-  return ctl()->meta.mgr->remove(metadata_key, y, dpp);
-}
-
-void RadosStore::finalize(void)
-{
-  if (rados)
-    rados->finalize();
-}
-
-void RadosStore::register_admin_apis(RGWRESTMgr* mgr)
-{
-  mgr->register_resource("user", new RGWRESTMgr_User);
-  mgr->register_resource("bucket", new RGWRESTMgr_Bucket);
-  /*Registering resource for /admin/metadata */
-  mgr->register_resource("metadata", new RGWRESTMgr_Metadata);
-  mgr->register_resource("log", new RGWRESTMgr_Log);
-  /* XXX These may become global when cbodley is done with his zone work */
-  mgr->register_resource("config", new RGWRESTMgr_Config);
-  mgr->register_resource("realm", new RGWRESTMgr_Realm);
-  mgr->register_resource("ratelimit", new RGWRESTMgr_Ratelimit);
-}
-
-std::unique_ptr<LuaManager> RadosStore::get_lua_manager()
-{
-  return std::make_unique<RadosLuaManager>(this);
-}
-
-std::unique_ptr<RGWRole> RadosStore::get_role(std::string name,
-                                             std::string tenant,
-                                             std::string path,
-                                             std::string trust_policy,
-                                             std::string max_session_duration_str,
-                std::multimap<std::string,std::string> tags)
-{
-  return std::make_unique<RadosRole>(this, name, tenant, path, trust_policy, max_session_duration_str, tags);
-}
-
-std::unique_ptr<RGWRole> RadosStore::get_role(std::string id)
-{
-  return std::make_unique<RadosRole>(this, id);
-}
-
-std::unique_ptr<RGWRole> RadosStore::get_role(const RGWRoleInfo& info)
-{
-  return std::make_unique<RadosRole>(this, info);
-}
-
-// List roles in the tenant whose path matches path_prefix (all roles if
-// the prefix is empty).  Scans the zone's roles pool for path oids, then
-// loads each role's info.  Returns 0 on success, negative errno on error.
-int RadosStore::get_roles(const DoutPrefixProvider *dpp,
-                         optional_yield y,
-                         const std::string& path_prefix,
-                         const std::string& tenant,
-                         vector<std::unique_ptr<RGWRole>>& roles)
-{
-  auto pool = svc()->zone->get_zone_params().roles_pool;
-  std::string prefix;
-
-  // List all roles if path prefix is empty
-  if (! path_prefix.empty()) {
-    prefix = tenant + RGWRole::role_path_oid_prefix + path_prefix;
-  } else {
-    prefix = tenant + RGWRole::role_path_oid_prefix;
-  }
-
-  //Get the filtered objects
-  list<std::string> result;
-  bool is_truncated;
-  RGWListRawObjsCtx ctx;
-  do {
-    // paginate the raw-object listing 1000 oids at a time
-    list<std::string> oids;
-    int r = rados->list_raw_objects(dpp, pool, prefix, 1000, ctx, oids, &is_truncated);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: listing filtered objects failed: "
-                  << prefix << ": " << cpp_strerror(-r) << dendl;
-      return r;
-    }
-    for (const auto& iter : oids) {
-      // strip the common path-oid prefix; remainder is "<path><info_prefix><id>"
-      result.push_back(iter.substr(RGWRole::role_path_oid_prefix.size()));
-    }
-  } while (is_truncated);
-
-  for (const auto& it : result) {
-    //Find the role oid prefix from the end
-    size_t pos = it.rfind(RGWRole::role_oid_prefix);
-    if (pos == std::string::npos) {
-        continue;
-    }
-    // Split the result into path and info_oid + id
-    std::string path = it.substr(0, pos);
-
-    /*Make sure that prefix is part of path (False results could've been returned)
-      because of the role info oid + id appended to the path)*/
-    if(path_prefix.empty() || path.find(path_prefix) != std::string::npos) {
-      //Get id from info oid prefix + id
-      std::string id = it.substr(pos + RGWRole::role_oid_prefix.length());
-
-      std::unique_ptr<rgw::sal::RGWRole> role = get_role(id);
-      int ret = role->read_info(dpp, y);
-      if (ret < 0) {
-        return ret;
-      }
-      roles.push_back(std::move(role));
-    }
-  }
-
-  return 0;
-}
-
-// Factory: empty OIDC provider handle bound to this store.
-std::unique_ptr<RGWOIDCProvider> RadosStore::get_oidc_provider()
-{
-  return std::make_unique<RadosOIDCProvider>(this);
-}
-
-// Load every OIDC provider registered under the tenant from the zone's
-// oidc pool.  Each matching system object is read and decoded into a new
-// provider handle.  Returns 0 on success, negative errno on error
-// (-EIO if a stored provider fails to decode).
-int RadosStore::get_oidc_providers(const DoutPrefixProvider *dpp,
-                                  const std::string& tenant,
-                                  vector<std::unique_ptr<RGWOIDCProvider>>& providers)
-{
-  std::string prefix = tenant + RGWOIDCProvider::oidc_url_oid_prefix;
-  auto pool = svc()->zone->get_zone_params().oidc_pool;
-
-  //Get the filtered objects
-  list<std::string> result;
-  bool is_truncated;
-  RGWListRawObjsCtx ctx;
-  do {
-    list<std::string> oids;
-    int r = rados->list_raw_objects(dpp, pool, prefix, 1000, ctx, oids, &is_truncated);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: listing filtered objects failed: OIDC pool: "
-                  << pool.name << ": " << prefix << ": " << cpp_strerror(-r) << dendl;
-      return r;
-    }
-    for (const auto& iter : oids) {
-      std::unique_ptr<rgw::sal::RGWOIDCProvider> provider = get_oidc_provider();
-      bufferlist bl;
-
-      r = rgw_get_system_obj(svc()->sysobj, pool, iter, bl, nullptr, nullptr, null_yield, dpp);
-      if (r < 0) {
-        return r;
-      }
-
-      try {
-        using ceph::decode;
-        // NOTE(review): this inner 'iter' shadows the loop's oid 'iter';
-        // the catch below logs the outer (oid) one — confirm intent.
-        auto iter = bl.cbegin();
-        decode(*provider, iter);
-      } catch (buffer::error& err) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to decode oidc provider info from pool: "
-         << pool.name << ": " << iter << dendl;
-        return -EIO;
-      }
-
-      providers.push_back(std::move(provider));
-    }
-  } while (is_truncated);
-
-  return 0;
-}
-
-// Factory: writer for S3 append uploads.  Throttles in-flight AIO by the
-// configured rgw_put_obj_min_window_size and resumes at 'position',
-// updating *cur_accounted_size as data lands.
-std::unique_ptr<Writer> RadosStore::get_append_writer(const DoutPrefixProvider *dpp,
-                                 optional_yield y,
-                                 std::unique_ptr<rgw::sal::Object> _head_obj,
-                                 const rgw_user& owner,
-                                 const rgw_placement_rule *ptail_placement_rule,
-                                 const std::string& unique_tag,
-                                 uint64_t position,
-                                 uint64_t *cur_accounted_size)
-{
-  auto aio = rgw::make_throttle(ctx()->_conf->rgw_put_obj_min_window_size, y);
-  return std::make_unique<RadosAppendWriter>(dpp, y,
-                                std::move(_head_obj),
-                                this, std::move(aio), owner,
-                                ptail_placement_rule,
-                                unique_tag, position,
-                                cur_accounted_size);
-}
-
-// Factory: writer for whole-object atomic PUTs; olh_epoch orders the
-// resulting version in the object's OLH log.
-std::unique_ptr<Writer> RadosStore::get_atomic_writer(const DoutPrefixProvider *dpp,
-                                 optional_yield y,
-                                 std::unique_ptr<rgw::sal::Object> _head_obj,
-                                 const rgw_user& owner,
-                                 const rgw_placement_rule *ptail_placement_rule,
-                                 uint64_t olh_epoch,
-                                 const std::string& unique_tag)
-{
-  auto aio = rgw::make_throttle(ctx()->_conf->rgw_put_obj_min_window_size, y);
-  return std::make_unique<RadosAtomicWriter>(dpp, y,
-                                std::move(_head_obj),
-                                this, std::move(aio), owner,
-                                ptail_placement_rule,
-                                olh_epoch, unique_tag);
-}
-
-// Look up the compression type configured for a placement rule in the
-// current zone params.
-const std::string& RadosStore::get_compression_type(const rgw_placement_rule& rule)
-{
-      return svc()->zone->get_zone_params().get_compression_type(rule);
-}
-
-// True if the placement rule exists in the current zone params.
-bool RadosStore::valid_placement(const rgw_placement_rule& rule)
-{
-  return svc()->zone->get_zone_params().valid_placement(rule);
-}
-
-// Fill *ioctx with the librados IoCtx of the pool holding the object's
-// head; delegates to RGWRados.
-int RadosStore::get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx* ioctx)
-{
-  return rados->get_obj_head_ioctx(dpp, bucket_info, obj, ioctx);
-}
-
-// Destructor: free the RGWObjectCtx only when this object owns it
-// (rados_ctx may be shared with a caller).
-RadosObject::~RadosObject()
-{
-  if (rados_ctx_owned)
-    delete rados_ctx;
-}
-
-// Fetch the object's RGWObjState (and manifest) from RGWRados, then copy
-// it into this object's cached 'state' while preserving the locally-set
-// key, atomic, and prefetch flags.  *pstate points at the fetched state.
-int RadosObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **pstate, optional_yield y, bool follow_olh)
-{
-  int ret = store->getRados()->get_obj_state(dpp, rados_ctx, bucket->get_info(), this, pstate, &manifest, follow_olh, y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  /* Don't overwrite obj, atomic, or prefetch */
-  rgw_obj obj = get_obj();
-  bool is_atomic = state.is_atomic;
-  bool prefetch_data = state.prefetch_data;
-
-  state = **pstate;
-
-  state.obj = obj;
-  state.is_atomic = is_atomic;
-  state.prefetch_data = prefetch_data;
-  return ret;
-}
-
-// Wire the given Read op to populate this object's attrs, size, and
-// mtime, then run its prepare() step.
-int RadosObject::read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Read &read_op, optional_yield y, rgw_obj* target_obj)
-{
-  read_op.params.attrs = &attrs;
-  read_op.params.target_obj = target_obj;
-  read_op.params.obj_size = &state.size;
-  read_op.params.lastmod = &state.mtime;
-
-  return read_op.prepare(y, dpp);
-}
-
-// Set and/or delete xattrs on the stored object.  Either map may be
-// null; a null setattrs is passed as an empty map.
-int RadosObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y)
-{
-  Attrs empty;
-  return store->getRados()->set_attrs(dpp, rados_ctx,
-                       bucket->get_info(),
-                       this,
-                       setattrs ? *setattrs : empty,
-                       delattrs ? delattrs : nullptr,
-                       y);
-}
-
-// Refresh this object's cached attrs/size/mtime via a fresh Read op.
-int RadosObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj)
-{
-  RGWRados::Object op_target(store->getRados(), bucket, *rados_ctx, this);
-  RGWRados::Object::Read read_op(&op_target);
-
-  return read_attrs(dpp, read_op, y, target_obj);
-}
-
-// Read-modify-write of a single xattr: load current attrs, set
-// attrs[attr_name] = attr_val, and write the full attr set back.  The
-// object's key is temporarily retargeted during the write and restored
-// afterwards.
-int RadosObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp)
-{
-  rgw_obj target = get_obj();
-  rgw_obj save = get_obj();
-  int r = get_obj_attrs(y, dpp, &target);
-  if (r < 0) {
-    return r;
-  }
-
-  /* Temporarily set target */
-  state.obj = target;
-  set_atomic();
-  attrs[attr_name] = attr_val;
-  r = set_obj_attrs(dpp, &attrs, nullptr, y);
-  /* Restore target */
-  state.obj = save;
-
-  return r;
-}
-
-// Remove one xattr from the stored object (value in the removal map is
-// ignored; only the key matters).
-int RadosObject::delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y)
-{
-  Attrs rmattr;
-  bufferlist bl;
-
-  set_atomic();
-  rmattr[attr_name] = bl;
-  return set_obj_attrs(dpp, nullptr, &rmattr, y);
-}
-
-// True if the object carries a RGW_ATTR_DELETE_AT timestamp that is
-// non-zero and already in the past.  Decode failures log and report
-// not-expired rather than erroring.
-bool RadosObject::is_expired() {
-  auto iter = attrs.find(RGW_ATTR_DELETE_AT);
-  if (iter != attrs.end()) {
-    utime_t delete_at;
-    try {
-      auto bufit = iter->second.cbegin();
-      decode(delete_at, bufit);
-    } catch (buffer::error& err) {
-      ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode " RGW_ATTR_DELETE_AT " attr" << dendl;
-      return false;
-    }
-
-    if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-// Assign a random version-instance id to this object's key (used for
-// versioned writes).
-void RadosObject::gen_rand_obj_instance_name()
-{
-  store->getRados()->gen_rand_obj_instance_name(&state.obj.key);
-}
-
-// Re-point this object's key at the logical object corresponding to the
-// given raw RADOS object within this bucket.
-void RadosObject::raw_obj_to_obj(const rgw_raw_obj& raw_obj)
-{
-  rgw_obj tobj = get_obj();
-  RGWSI_Tier_RADOS::raw_obj_to_obj(get_bucket()->get_key(), raw_obj, &tobj);
-  set_key(tobj.key);
-}
-
-// Map this logical object to its raw RADOS object under the bucket's
-// placement rule.
-void RadosObject::get_raw_obj(rgw_raw_obj* raw_obj)
-{
-  store->getRados()->obj_to_raw((bucket->get_info()).placement_rule, get_obj(), raw_obj);
-}
-
-// Page through this object's omap starting after 'marker', returning up
-// to 'count' entries in *m; *pmore signals additional entries remain.
-int RadosObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
-                                 std::map<std::string, bufferlist> *m,
-                                 bool* pmore, optional_yield y)
-{
-  rgw_raw_obj raw_obj;
-  get_raw_obj(&raw_obj);
-  auto sysobj = store->svc()->sysobj->get_obj(raw_obj);
-
-  return sysobj.omap().get_vals(dpp, marker, count, m, pmore, y);
-}
-
-// Fetch the object's entire omap into *m in one call.
-int RadosObject::omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
-                                optional_yield y)
-{
-  rgw_raw_obj raw_obj;
-  get_raw_obj(&raw_obj);
-  auto sysobj = store->svc()->sysobj->get_obj(raw_obj);
-
-  return sysobj.omap().get_all(dpp, m, y);
-}
-
-// Fetch specific omap keys from 'oid' using the IoCtx of the pool that
-// holds this object's head.  'oid' names the target rados object; the
-// results land in *vals.
-int RadosObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
-                                         const std::set<std::string>& keys,
-                                         Attrs* vals)
-{
-  int ret;
-  rgw_raw_obj head_obj;
-  librados::IoCtx cur_ioctx;
-  rgw_obj obj = get_obj();
-
-  store->getRados()->obj_to_raw(bucket->get_placement_rule(), obj, &head_obj);
-  ret = store->get_obj_head_ioctx(dpp, bucket->get_info(), obj, &cur_ioctx);
-  if (ret < 0) {
-    return ret;
-  }
-
-  return cur_ioctx.omap_get_vals_by_keys(oid, keys, vals);
-}
-
-// Set one omap key on this object's raw counterpart; when must_exist is
-// true the write fails if the rados object does not already exist.
-int RadosObject::omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
-                                       bool must_exist, optional_yield y)
-{
-  rgw_raw_obj raw_meta_obj;
-  rgw_obj obj = get_obj();
-
-  store->getRados()->obj_to_raw(bucket->get_placement_rule(), obj, &raw_meta_obj);
-
-  auto sysobj = store->svc()->sysobj->get_obj(raw_meta_obj);
-
-  return sysobj.omap().set_must_exist(must_exist).set(dpp, key, val, y);
-}
-
-// Factory: lock-based serializer for multipart operations on this object.
-std::unique_ptr<MPSerializer> RadosObject::get_serializer(const DoutPrefixProvider *dpp, const std::string& lock_name)
-{
-  return std::make_unique<MPRadosSerializer>(dpp, store, this, lock_name);
-}
-
-// Lifecycle transition of this object to another placement (storage
-// class) within the cluster; delegates to RGWRados::transition_obj.
-int RadosObject::transition(Bucket* bucket,
-                           const rgw_placement_rule& placement_rule,
-                           const real_time& mtime,
-                           uint64_t olh_epoch,
-                           const DoutPrefixProvider* dpp,
-                           optional_yield y)
-{
-  return store->getRados()->transition_obj(*rados_ctx, bucket, *this, placement_rule, mtime, olh_epoch, dpp, y);
-}
-
-// Lifecycle transition of this object to a remote S3 cloud tier.
-// Builds an S3 REST connection from the tier's config, transfers the
-// data, and — when update_object is set — rewrites the local object as a
-// zero-size cloud-tiered stub via write_cloud_tier().  Returns negative
-// errno on failure; -ECANCELED if the object changed mid-transfer.
-int RadosObject::transition_to_cloud(Bucket* bucket,
-                          rgw::sal::PlacementTier* tier,
-                          rgw_bucket_dir_entry& o,
-                          std::set<std::string>& cloud_targets,
-                          CephContext* cct,
-                          bool update_object,
-                          const DoutPrefixProvider* dpp,
-                          optional_yield y)
-{
-  /* init */
-  rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier);
-  string id = "cloudid";
-  string endpoint = rtier->get_rt().t.s3.endpoint;
-  RGWAccessKey key = rtier->get_rt().t.s3.key;
-  string region = rtier->get_rt().t.s3.region;
-  HostStyle host_style = rtier->get_rt().t.s3.host_style;
-  string bucket_name = rtier->get_rt().t.s3.target_path;
-  const rgw::sal::ZoneGroup& zonegroup = store->get_zone()->get_zonegroup();
-
-  // default target bucket name derived from zonegroup + storage class
-  if (bucket_name.empty()) {
-    bucket_name = "rgwx-" + zonegroup.get_name() + "-" + tier->get_storage_class() +
-                    "-cloud-bucket";
-    boost::algorithm::to_lower(bucket_name);
-  }
-
-  /* Create RGW REST connection */
-  S3RESTConn conn(cct, id, { endpoint }, key, zonegroup.get_id(), region, host_style);
-
-  RGWLCCloudTierCtx tier_ctx(cct, dpp, o, store, bucket->get_info(),
-                            this, conn, bucket_name,
-                            rtier->get_rt().t.s3.target_storage_class);
-  tier_ctx.acl_mappings = rtier->get_rt().t.s3.acl_mappings;
-  tier_ctx.multipart_min_part_size = rtier->get_rt().t.s3.multipart_min_part_size;
-  tier_ctx.multipart_sync_threshold = rtier->get_rt().t.s3.multipart_sync_threshold;
-  tier_ctx.storage_class = tier->get_storage_class();
-
-  ldpp_dout(dpp, 0) << "Transitioning object(" << o.key << ") to the cloud endpoint(" << endpoint << ")" << dendl;
-
-  /* Transition object to cloud end point */
-  int ret = rgw_cloud_tier_transfer_object(tier_ctx, cloud_targets);
-
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to transfer object(" << o.key << ") to the cloud endpoint(" << endpoint << ") ret=" << ret << dendl;
-    return ret;
-  }
-
-  if (update_object) {
-    real_time read_mtime;
-
-    std::unique_ptr<rgw::sal::Object::ReadOp> read_op(get_read_op());
-    read_op->params.lastmod = &read_mtime;
-
-    ret = read_op->prepare(null_yield, dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: Updating tier object(" << o.key << ") failed ret=" << ret << dendl;
-      return ret;
-    }
-
-    // mtime mismatch means the object was overwritten during transfer
-    if (read_mtime != tier_ctx.o.meta.mtime) {
-      /* raced */
-      ldpp_dout(dpp, 0) << "ERROR: Updating tier object(" << o.key << ") failed ret=" << -ECANCELED << dendl;
-      return -ECANCELED;
-    }
-
-    rgw_placement_rule target_placement;
-    target_placement.inherit_from(tier_ctx.bucket_info.placement_rule);
-    target_placement.storage_class = tier->get_storage_class();
-
-    ret = write_cloud_tier(dpp, null_yield, tier_ctx.o.versioned_epoch,
-                          tier, tier_ctx.is_multipart_upload,
-                          target_placement, tier_ctx.obj);
-
-  }
-
-  return ret;
-}
-
-// Replace the local object with a zero-size stub whose manifest carries
-// the "cloud-s3" tier config pointing at the transitioned copy.  Keeps
-// the existing attrs, sets RGW_ATTR_STORAGE_CLASS to the tier's class,
-// and drops the id/tail tags so GC won't reclaim the (empty) tail.
-int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp,
-                                 optional_yield y,
-                                 uint64_t olh_epoch,
-                                 PlacementTier* tier,
-                                 bool is_multipart_upload,
-                                 rgw_placement_rule& target_placement,
-                                 Object* head_obj)
-{
-  rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier);
-  map<string, bufferlist> attrs = get_attrs();
-  RGWRados::Object op_target(store->getRados(), bucket, *rados_ctx, this);
-  RGWRados::Object::Write obj_op(&op_target);
-
-  obj_op.meta.modify_tail = true;
-  obj_op.meta.flags = PUT_OBJ_CREATE;
-  obj_op.meta.category = RGWObjCategory::CloudTiered;
-  obj_op.meta.delete_at = real_time();
-  bufferlist blo;
-  obj_op.meta.data = &blo;
-  obj_op.meta.if_match = NULL;
-  obj_op.meta.if_nomatch = NULL;
-  obj_op.meta.user_data = NULL;
-  obj_op.meta.zones_trace = NULL;
-  obj_op.meta.delete_at = real_time();
-  obj_op.meta.olh_epoch = olh_epoch;
-
-  RGWObjManifest *pmanifest;
-  RGWObjManifest manifest;
-
-  pmanifest = &manifest;
-  RGWObjTier tier_config;
-  tier_config.name = tier->get_storage_class();
-  tier_config.tier_placement = rtier->get_rt();
-  tier_config.is_multipart_upload = is_multipart_upload;
-
-  pmanifest->set_tier_type("cloud-s3");
-  pmanifest->set_tier_config(tier_config);
-
-  /* check if its necessary */
-  pmanifest->set_head(target_placement, head_obj->get_obj(), 0);
-  pmanifest->set_tail_placement(target_placement, head_obj->get_obj().bucket);
-  pmanifest->set_obj_size(0);
-  obj_op.meta.manifest = pmanifest;
-
-  /* update storage class */
-  bufferlist bl;
-  bl.append(tier->get_storage_class());
-  attrs[RGW_ATTR_STORAGE_CLASS] = bl;
-
-  attrs.erase(RGW_ATTR_ID_TAG);
-  attrs.erase(RGW_ATTR_TAIL_TAG);
-
-  // size 0, accounted_size 0: stub carries no local data
-  return obj_op.write_meta(dpp, 0, 0, attrs, y);
-}
-
-// Query the max write-chunk size (and optional alignment) for this
-// object under the given placement rule.
-int RadosObject::get_max_chunk_size(const DoutPrefixProvider* dpp, rgw_placement_rule placement_rule, uint64_t* max_chunk_size, uint64_t* alignment)
-{
-  return store->getRados()->get_max_chunk_size(placement_rule, get_obj(), max_chunk_size, dpp, alignment);
-}
-
-// Round 'size' per the given alignment via RGWRados; result in *max_size.
-void RadosObject::get_max_aligned_size(uint64_t size, uint64_t alignment,
-                                    uint64_t* max_size)
-{
-  store->getRados()->get_max_aligned_size(size, alignment, max_size);
-}
-
-// Two placement rules "match" for this object if they are equal or if
-// both resolve to the same data pool (so no data move is needed).
-bool RadosObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2)
-{
-  rgw_obj obj;
-  rgw_pool p1, p2;
-
-  obj = get_obj();
-
-  if (r1 == r2)
-    return true;
-
-  // get_obj_data_pool returning false means the rule can't be resolved
-  if (!store->getRados()->get_obj_data_pool(r1, obj, &p1)) {
-    return false;
-  }
-  if (!store->getRados()->get_obj_data_pool(r2, obj, &p2)) {
-    return false;
-  }
-
-  return p1 == p2;
-}
-
-// Dump the object's physical layout as JSON: head raw obj, manifest, and
-// one entry per stripe with its raw location, offsets, and clamped size.
-// Used by admin/debug tooling.
-int RadosObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f)
-{
-  int ret;
-  RGWObjManifest *amanifest{nullptr};
-  rgw_raw_obj head_obj;
-
-  RGWRados::Object op_target(store->getRados(), get_bucket(), *rados_ctx, this);
-  RGWRados::Object::Read parent_op(&op_target);
-  uint64_t obj_size;
-
-  parent_op.params.obj_size = &obj_size;
-  parent_op.params.attrs = &get_attrs();
-
-  ret = parent_op.prepare(y, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  head_obj = parent_op.state.head_obj;
-
-  ret = op_target.get_manifest(dpp, &amanifest, y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  ::encode_json("head", head_obj, f);
-  ::encode_json("manifest", *amanifest, f);
-  f->open_array_section("data_location");
-  for (auto miter = amanifest->obj_begin(dpp); miter != amanifest->obj_end(dpp); ++miter) {
-    f->open_object_section("obj");
-    rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(store);
-    uint64_t ofs = miter.get_ofs();
-    uint64_t left = amanifest->get_obj_size() - ofs;
-    ::encode_json("ofs", miter.get_ofs(), f);
-    ::encode_json("loc", raw_loc, f);
-    ::encode_json("loc_ofs", miter.location_ofs(), f);
-    uint64_t loc_size = miter.get_stripe_size();
-    // last stripe may be short: clamp to remaining object bytes
-    if (loc_size > left) {
-      loc_size = left;
-    }
-    ::encode_json("loc_size", loc_size, f);
-    f->close_section();
-  }
-  f->close_section();
-
-  return 0;
-}
-
-// Factory: read-op handle sharing this object's RGWObjectCtx.
-std::unique_ptr<Object::ReadOp> RadosObject::get_read_op()
-{
-  return std::make_unique<RadosObject::RadosReadOp>(this, rados_ctx);
-}
-
-// Bind a SAL read-op to the underlying RGWRados read machinery for
-// _source, using the caller-supplied object context.
-RadosObject::RadosReadOp::RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx) :
-       source(_source),
-       rctx(_rctx),
-       op_target(_source->store->getRados(),
-                 _source->get_bucket(),
-                 *static_cast<RGWObjectCtx *>(rctx),
-                 _source),
-       parent_op(&op_target)
-{ }
-
-// Copy the SAL-level read params/conditionals into the RGWRados read op,
-// run its prepare(), and on success propagate the resolved key and size
-// back into the source object.
-int RadosObject::RadosReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp)
-{
-  uint64_t obj_size;
-
-  parent_op.conds.mod_ptr = params.mod_ptr;
-  parent_op.conds.unmod_ptr = params.unmod_ptr;
-  parent_op.conds.high_precision_time = params.high_precision_time;
-  parent_op.conds.mod_zone_id = params.mod_zone_id;
-  parent_op.conds.mod_pg_ver = params.mod_pg_ver;
-  parent_op.conds.if_match = params.if_match;
-  parent_op.conds.if_nomatch = params.if_nomatch;
-  parent_op.params.lastmod = params.lastmod;
-  parent_op.params.target_obj = params.target_obj;
-  parent_op.params.obj_size = &obj_size;
-  parent_op.params.attrs = &source->get_attrs();
-
-  int ret = parent_op.prepare(y, dpp);
-  if (ret < 0)
-    return ret;
-
-  // prepare() may have resolved a version instance; reflect it locally
-  source->set_key(parent_op.state.obj.key);
-  source->set_obj_size(obj_size);
-
-  return ret;
-}
-
-// Read bytes [ofs, end] into bl; thin delegate to the prepared rados op.
-int RadosObject::RadosReadOp::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider* dpp)
-{
-  return parent_op.read(ofs, end, bl, y, dpp);
-}
-
-// Fetch one xattr by name into dest; thin delegate.
-int RadosObject::RadosReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y)
-{
-  return parent_op.get_attr(dpp, name, dest, y);
-}
-
-// Factory: delete-op handle for this object.
-std::unique_ptr<Object::DeleteOp> RadosObject::get_delete_op()
-{
-  return std::make_unique<RadosObject::RadosDeleteOp>(this);
-}
-
-// Bind a SAL delete-op to the RGWRados delete machinery for _source.
-RadosObject::RadosDeleteOp::RadosDeleteOp(RadosObject *_source) :
-       source(_source),
-       op_target(_source->store->getRados(),
-                 _source->get_bucket(),
-                 _source->get_ctx(),
-                 _source),
-       parent_op(&op_target)
-{ }
-
-// Copy SAL delete params into the rados delete op, execute it, and on
-// success surface the delete-marker flag and resulting version id.
-int RadosObject::RadosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional_yield y)
-{
-  parent_op.params.bucket_owner = params.bucket_owner.get_id();
-  parent_op.params.versioning_status = params.versioning_status;
-  parent_op.params.obj_owner = params.obj_owner;
-  parent_op.params.olh_epoch = params.olh_epoch;
-  parent_op.params.marker_version_id = params.marker_version_id;
-  parent_op.params.bilog_flags = params.bilog_flags;
-  parent_op.params.remove_objs = params.remove_objs;
-  parent_op.params.expiration_time = params.expiration_time;
-  parent_op.params.unmod_since = params.unmod_since;
-  parent_op.params.mtime = params.mtime;
-  parent_op.params.high_precision_time = params.high_precision_time;
-  parent_op.params.zones_trace = params.zones_trace;
-  parent_op.params.abortmp = params.abortmp;
-  parent_op.params.parts_accounted_size = params.parts_accounted_size;
-
-  int ret = parent_op.delete_obj(y, dpp);
-  if (ret < 0)
-    return ret;
-
-  result.delete_marker = parent_op.result.delete_marker;
-  result.version_id = parent_op.result.version_id;
-
-  return ret;
-}
-
-// Simple delete of this object.  prevent_versioning forces a hard delete
-// (versioning status 0) instead of the bucket's versioning behavior.
-int RadosObject::delete_object(const DoutPrefixProvider* dpp,
-                              optional_yield y,
-                              bool prevent_versioning)
-{
-  RGWRados::Object del_target(store->getRados(), bucket, *rados_ctx, this);
-  RGWRados::Object::Delete del_op(&del_target);
-
-  del_op.params.bucket_owner = bucket->get_info().owner;
-  del_op.params.versioning_status = prevent_versioning ? 0 : bucket->get_info().versioning_status();
-
-  return del_op.delete_obj(y, dpp);
-}
-
-// Asynchronous delete: queue the removal on the caller's RadosCompletions
-// handles rather than waiting for it inline.
-int RadosObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate,
-                                  Completions* aio, bool keep_index_consistent,
-                                  optional_yield y)
-{
-  RadosCompletions* raio = static_cast<RadosCompletions*>(aio);
-
-  return store->getRados()->delete_obj_aio(dpp, get_obj(), bucket->get_info(), astate,
-                                          raio->handles, keep_index_consistent, y);
-}
-
-// Server-side object copy (this object is the source).  All conditional,
-// attr-merge, versioning, and progress parameters are forwarded verbatim
-// to RGWRados::copy_obj; a missing delete_at becomes the zero real_time.
-int RadosObject::copy_object(User* user,
-                               req_info* info,
-                               const rgw_zone_id& source_zone,
-                               rgw::sal::Object* dest_object,
-                               rgw::sal::Bucket* dest_bucket,
-                               rgw::sal::Bucket* src_bucket,
-                               const rgw_placement_rule& dest_placement,
-                               ceph::real_time* src_mtime,
-                               ceph::real_time* mtime,
-                               const ceph::real_time* mod_ptr,
-                               const ceph::real_time* unmod_ptr,
-                               bool high_precision_time,
-                               const char* if_match,
-                               const char* if_nomatch,
-                               AttrsMod attrs_mod,
-                               bool copy_if_newer,
-                               Attrs& attrs,
-                               RGWObjCategory category,
-                               uint64_t olh_epoch,
-                               boost::optional<ceph::real_time> delete_at,
-                               std::string* version_id,
-                               std::string* tag,
-                               std::string* etag,
-                               void (*progress_cb)(off_t, void *),
-                               void* progress_data,
-                               const DoutPrefixProvider* dpp,
-                               optional_yield y)
-{
-  return store->getRados()->copy_obj(*rados_ctx,
-                                    user->get_id(),
-                                    info,
-                                    source_zone,
-                                    dest_object,
-                                    this,
-                                    dest_bucket,
-                                    src_bucket,
-                                    dest_placement,
-                                    src_mtime,
-                                    mtime,
-                                    mod_ptr,
-                                    unmod_ptr,
-                                    high_precision_time,
-                                    if_match,
-                                    if_nomatch,
-                                    static_cast<RGWRados::AttrsMod>(attrs_mod),
-                                    copy_if_newer,
-                                    attrs,
-                                    category,
-                                    olh_epoch,
-                                    (delete_at ? *delete_at : real_time()),
-                                    version_id,
-                                    tag,
-                                    etag,
-                                    progress_cb,
-                                    progress_data,
-                                    dpp,
-                                    y);
-}
-
-// Stream bytes [ofs, end] through cb; thin delegate to the rados op.
-int RadosObject::RadosReadOp::iterate(const DoutPrefixProvider* dpp, int64_t ofs, int64_t end, RGWGetDataCB* cb, optional_yield y)
-{
-  return parent_op.iterate(dpp, ofs, end, cb, y);
-}
-
-// Swift object versioning: restore this object from its archive
-// container; 'restored' reports whether a restore happened.
-int RadosObject::swift_versioning_restore(bool& restored,
-                                         const DoutPrefixProvider* dpp)
-{
-  return store->getRados()->swift_versioning_restore(*rados_ctx,
-                                                    bucket->get_owner()->get_id(),
-                                                    bucket,
-                                                    this,
-                                                    restored,
-                                                    dpp);
-}
-
-// Swift object versioning: archive the current version of this object
-// before it is overwritten/deleted.
-int RadosObject::swift_versioning_copy(const DoutPrefixProvider* dpp, optional_yield y)
-{
-  return store->getRados()->swift_versioning_copy(*rados_ctx,
-                                        bucket->get_info().owner,
-                                        bucket,
-                                        this,
-                                        dpp,
-                                        y);
-}
-
-// Abort an in-progress multipart upload: enumerate all parts, delete
-// manifest-less parts directly, hand manifested parts' tails to GC (or
-// delete inline if GC is unavailable/fails), then remove the upload's
-// meta object along with the collected index entries.  Returns
-// -ERR_NO_SUCH_UPLOAD when the upload/meta object no longer exists.
-int RadosMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct)
-{
-  std::unique_ptr<rgw::sal::Object> meta_obj = get_meta_obj();
-  meta_obj->set_in_extra_data(true);
-  meta_obj->set_hash_source(mp_obj.get_key());
-  cls_rgw_obj_chain chain;
-  list<rgw_obj_index_key> remove_objs;
-  bool truncated;
-  int marker = 0;
-  int ret;
-  uint64_t parts_accounted_size = 0;
-
-  do {
-    // page through parts, 1000 at a time
-    ret = list_parts(dpp, cct, 1000, marker, &marker, &truncated);
-    if (ret < 0) {
-      ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " <<
-       ret << dendl;
-      return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
-    }
-
-    for (auto part_it = parts.begin();
-        part_it != parts.end();
-        ++part_it) {
-      RadosMultipartPart* obj_part = dynamic_cast<RadosMultipartPart*>(part_it->second.get());
-      if (obj_part->info.manifest.empty()) {
-       // no manifest: the part is a single object; delete it directly
-       std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(
-                                   rgw_obj_key(obj_part->oid, std::string(), RGW_OBJ_NS_MULTIPART));
-       obj->set_hash_source(mp_obj.get_key());
-       ret = obj->delete_object(dpp, null_yield);
-        if (ret < 0 && ret != -ENOENT)
-          return ret;
-      } else {
-       // manifested part: queue its tail objects on the GC chain and
-       // record the head's index key for removal with the meta object
-       auto target = meta_obj->get_obj();
-       store->getRados()->update_gc_chain(dpp, target, obj_part->info.manifest, &chain);
-        RGWObjManifest::obj_iterator oiter = obj_part->info.manifest.obj_begin(dpp);
-        if (oiter != obj_part->info.manifest.obj_end(dpp)) {
-         std::unique_ptr<rgw::sal::Object> head = bucket->get_object(rgw_obj_key());
-          rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store);
-         dynamic_cast<rgw::sal::RadosObject*>(head.get())->raw_obj_to_obj(raw_head);
-
-          rgw_obj_index_key key;
-          head->get_key().get_index_key(&key);
-          remove_objs.push_back(key);
-        }
-      }
-      parts_accounted_size += obj_part->info.accounted_size;
-    }
-  } while (truncated);
-
-  if (store->getRados()->get_gc() == nullptr) {
-    //Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified)
-    store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id());
-  } else {
-    /* use upload id as tag and do it synchronously */
-    auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id());
-    if (ret < 0 && leftover_chain) {
-      ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl;
-      if (ret == -ENOENT) {
-        return -ERR_NO_SUCH_UPLOAD;
-      }
-      //Delete objects inline if send chain to gc fails
-      store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id());
-    }
-  }
-
-  std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = meta_obj->get_delete_op();
-  del_op->params.bucket_owner = bucket->get_acl_owner();
-  del_op->params.versioning_status = 0;
-  if (!remove_objs.empty()) {
-    del_op->params.remove_objs = &remove_objs;
-  }
-  
-  del_op->params.abortmp = true;
-  del_op->params.parts_accounted_size = parts_accounted_size;
-
-  // and also remove the metadata obj
-  ret = del_op->delete_obj(dpp, null_yield);
-  if (ret < 0) {
-    ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " <<
-      ret << dendl;
-  }
-  return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
-}
-
-// Handle for the upload's metadata object in the multipart namespace.
-std::unique_ptr<rgw::sal::Object> RadosMultipartUpload::get_meta_obj()
-{
-  return bucket->get_object(rgw_obj_key(get_meta(), string(), mp_ns));
-}
-
-// Initialize a new multipart upload: generate a random v2 upload id and
-// exclusively create the meta object recording the destination
-// placement; retries with a fresh id on -EEXIST collisions.
-int RadosMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs)
-{
-  int ret;
-  std::string oid = mp_obj.get_key();
-  RGWObjectCtx obj_ctx(store);
-
-  do {
-    char buf[33];
-    string tmp_obj_name;
-    std::unique_ptr<rgw::sal::Object> obj;
-    gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
-    std::string upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */
-    upload_id.append(buf);
-
-    mp_obj.init(oid, upload_id);
-    tmp_obj_name = mp_obj.get_meta();
-
-    obj = bucket->get_object(rgw_obj_key(tmp_obj_name, string(), mp_ns));
-    // the meta object will be indexed with 0 size, we c
-    obj->set_in_extra_data(true);
-    obj->set_hash_source(oid);
-
-    RGWRados::Object op_target(store->getRados(),
-                              obj->get_bucket(),
-                              obj_ctx, obj.get());
-    RGWRados::Object::Write obj_op(&op_target);
-
-    op_target.set_versioning_disabled(true); /* no versioning for multipart meta */
-    obj_op.meta.owner = owner.get_id();
-    obj_op.meta.category = RGWObjCategory::MultiMeta;
-    obj_op.meta.flags = PUT_OBJ_CREATE_EXCL;
-    obj_op.meta.mtime = &mtime;
-
-    // persist the destination placement so complete() knows where parts go
-    multipart_upload_info upload_info;
-    upload_info.dest_placement = dest_placement;
-
-    bufferlist bl;
-    encode(upload_info, bl);
-    obj_op.meta.data = &bl;
-
-    ret = obj_op.write_meta(dpp, bl.length(), 0, attrs, y);
-  } while (ret == -EEXIST);
-
-  return ret;
-}
-
-int RadosMultipartUpload::list_parts(const DoutPrefixProvider *dpp, CephContext *cct,
-                                    int num_parts, int marker,
-                                    int *next_marker, bool *truncated,
-                                    bool assume_unsorted)
-{
-  map<string, bufferlist> parts_map;
-  map<string, bufferlist>::iterator iter;
-
-  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(
-                     rgw_obj_key(get_meta(), std::string(), RGW_OBJ_NS_MULTIPART));
-  obj->set_in_extra_data(true);
-
-  bool sorted_omap = is_v2_upload_id(get_upload_id()) && !assume_unsorted;
-
-  parts.clear();
-
-  int ret;
-  if (sorted_omap) {
-    string p;
-    p = "part.";
-    char buf[32];
-
-    snprintf(buf, sizeof(buf), "%08d", marker);
-    p.append(buf);
-
-    ret = obj->omap_get_vals(dpp, p, num_parts + 1, &parts_map,
-                                 nullptr, null_yield);
-  } else {
-    ret = obj->omap_get_all(dpp, &parts_map, null_yield);
-  }
-  if (ret < 0) {
-    return ret;
-  }
-
-  int i;
-  int last_num = 0;
-
-  uint32_t expected_next = marker + 1;
-
-  for (i = 0, iter = parts_map.begin();
-       (i < num_parts || !sorted_omap) && iter != parts_map.end();
-       ++iter, ++i) {
-    bufferlist& bl = iter->second;
-    auto bli = bl.cbegin();
-    std::unique_ptr<RadosMultipartPart> part = std::make_unique<RadosMultipartPart>();
-    try {
-      decode(part->info, bli);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 0) << "ERROR: could not part info, caught buffer::error" <<
-       dendl;
-      return -EIO;
-    }
-    if (sorted_omap) {
-      if (part->info.num != expected_next) {
-        /* ouch, we expected a specific part num here, but we got a
-         * different one. Either a part is missing, or it could be a
-         * case of mixed rgw versions working on the same upload,
-         * where one gateway doesn't support correctly sorted omap
-         * keys for multipart upload just assume data is unsorted.
-         */
-        return list_parts(dpp, cct, num_parts, marker, next_marker, truncated, true);
-      }
-      expected_next++;
-    }
-    if (sorted_omap ||
-      (int)part->info.num > marker) {
-      last_num = part->info.num;
-      parts[part->info.num] = std::move(part);
-    }
-  }
-
-  if (sorted_omap) {
-    if (truncated) {
-      *truncated = (iter != parts_map.end());
-    }
-  } else {
-    /* rebuild a map with only num_parts entries */
-    std::map<uint32_t, std::unique_ptr<MultipartPart>> new_parts;
-    std::map<uint32_t, std::unique_ptr<MultipartPart>>::iterator piter;
-    for (i = 0, piter = parts.begin();
-        i < num_parts && piter != parts.end();
-        ++i, ++piter) {
-      last_num = piter->first;
-      new_parts[piter->first] = std::move(piter->second);
-    }
-
-    if (truncated) {
-      *truncated = (piter != parts.end());
-    }
-
-    parts.swap(new_parts);
-  }
-
-  if (next_marker) {
-    *next_marker = last_num;
-  }
-
-  return 0;
-}
-
-int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp,
-                                  optional_yield y, CephContext* cct,
-                                  map<int, string>& part_etags,
-                                  list<rgw_obj_index_key>& remove_objs,
-                                  uint64_t& accounted_size, bool& compressed,
-                                  RGWCompressionInfo& cs_info, off_t& ofs,
-                                  std::string& tag, ACLOwner& owner,
-                                  uint64_t olh_epoch,
-                                  rgw::sal::Object* target_obj)
-{
-  char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
-  char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
-  std::string etag;
-  bufferlist etag_bl;
-  MD5 hash;
-  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
-  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
-  bool truncated;
-  int ret;
-
-  int total_parts = 0;
-  int handled_parts = 0;
-  int max_parts = 1000;
-  int marker = 0;
-  uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size;
-  auto etags_iter = part_etags.begin();
-  rgw::sal::Attrs attrs = target_obj->get_attrs();
-
-  do {
-    ret = list_parts(dpp, cct, max_parts, marker, &marker, &truncated);
-    if (ret == -ENOENT) {
-      ret = -ERR_NO_SUCH_UPLOAD;
-    }
-    if (ret < 0)
-      return ret;
-
-    total_parts += parts.size();
-    if (!truncated && total_parts != (int)part_etags.size()) {
-      ldpp_dout(dpp, 0) << "NOTICE: total parts mismatch: have: " << total_parts
-                      << " expected: " << part_etags.size() << dendl;
-      ret = -ERR_INVALID_PART;
-      return ret;
-    }
-
-    for (auto obj_iter = parts.begin(); etags_iter != part_etags.end() && obj_iter != parts.end(); ++etags_iter, ++obj_iter, ++handled_parts) {
-      RadosMultipartPart* part = dynamic_cast<rgw::sal::RadosMultipartPart*>(obj_iter->second.get());
-      uint64_t part_size = part->get_size();
-      if (handled_parts < (int)part_etags.size() - 1 &&
-          part_size < min_part_size) {
-        ret = -ERR_TOO_SMALL;
-        return ret;
-      }
-
-      char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
-      if (etags_iter->first != (int)obj_iter->first) {
-        ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: "
-                        << etags_iter->first << " next uploaded: "
-                        << obj_iter->first << dendl;
-        ret = -ERR_INVALID_PART;
-        return ret;
-      }
-      string part_etag = rgw_string_unquote(etags_iter->second);
-      if (part_etag.compare(part->get_etag()) != 0) {
-        ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: " << etags_iter->first
-                        << " etag: " << etags_iter->second << dendl;
-        ret = -ERR_INVALID_PART;
-        return ret;
-      }
-
-      hex_to_buf(part->get_etag().c_str(), petag,
-               CEPH_CRYPTO_MD5_DIGESTSIZE);
-      hash.Update((const unsigned char *)petag, sizeof(petag));
-
-      RGWUploadPartInfo& obj_part = part->info;
-
-      /* update manifest for part */
-      string oid = mp_obj.get_part(part->info.num);
-      rgw_obj src_obj;
-      src_obj.init_ns(bucket->get_key(), oid, mp_ns);
-
-      if (obj_part.manifest.empty()) {
-        ldpp_dout(dpp, 0) << "ERROR: empty manifest for object part: obj="
-                        << src_obj << dendl;
-        ret = -ERR_INVALID_PART;
-        return ret;
-      } else {
-        manifest.append(dpp, obj_part.manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params());
-      }
-
-      bool part_compressed = (obj_part.cs_info.compression_type != "none");
-      if ((handled_parts > 0) &&
-          ((part_compressed != compressed) ||
-            (cs_info.compression_type != obj_part.cs_info.compression_type))) {
-          ldpp_dout(dpp, 0) << "ERROR: compression type was changed during multipart upload ("
-                           << cs_info.compression_type << ">>" << obj_part.cs_info.compression_type << ")" << dendl;
-          ret = -ERR_INVALID_PART;
-          return ret; 
-      }
-      
-      if (part_compressed) {
-        int64_t new_ofs; // offset in compression data for new part
-        if (cs_info.blocks.size() > 0)
-          new_ofs = cs_info.blocks.back().new_ofs + cs_info.blocks.back().len;
-        else
-          new_ofs = 0;
-        for (const auto& block : obj_part.cs_info.blocks) {
-          compression_block cb;
-          cb.old_ofs = block.old_ofs + cs_info.orig_size;
-          cb.new_ofs = new_ofs;
-          cb.len = block.len;
-          cs_info.blocks.push_back(cb);
-          new_ofs = cb.new_ofs + cb.len;
-        } 
-        if (!compressed)
-          cs_info.compression_type = obj_part.cs_info.compression_type;
-        cs_info.orig_size += obj_part.cs_info.orig_size;
-        compressed = true;
-      }
-
-      rgw_obj_index_key remove_key;
-      src_obj.key.get_index_key(&remove_key);
-
-      remove_objs.push_back(remove_key);
-
-      ofs += obj_part.size;
-      accounted_size += obj_part.accounted_size;
-    }
-  } while (truncated);
-  hash.Final((unsigned char *)final_etag);
-
-  buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
-  snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
-          sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
-           "-%lld", (long long)part_etags.size());
-  etag = final_etag_str;
-  ldpp_dout(dpp, 10) << "calculated etag: " << etag << dendl;
-
-  etag_bl.append(etag);
-
-  attrs[RGW_ATTR_ETAG] = etag_bl;
-
-  if (compressed) {
-    // write compression attribute to full object
-    bufferlist tmp;
-    encode(cs_info, tmp);
-    attrs[RGW_ATTR_COMPRESSION] = tmp;
-  }
-
-  target_obj->set_atomic();
-
-  RGWRados::Object op_target(store->getRados(),
-                            target_obj->get_bucket(),
-                            dynamic_cast<RadosObject*>(target_obj)->get_ctx(),
-                            target_obj);
-  RGWRados::Object::Write obj_op(&op_target);
-
-  obj_op.meta.manifest = &manifest;
-  obj_op.meta.remove_objs = &remove_objs;
-
-  obj_op.meta.ptag = &tag; /* use req_id as operation tag */
-  obj_op.meta.owner = owner.get_id();
-  obj_op.meta.flags = PUT_OBJ_CREATE;
-  obj_op.meta.modify_tail = true;
-  obj_op.meta.completeMultipart = true;
-  obj_op.meta.olh_epoch = olh_epoch;
-
-  ret = obj_op.write_meta(dpp, ofs, accounted_size, attrs, y);
-  if (ret < 0)
-    return ret;
-
-  return ret;
-}
-
-int RadosMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs)
-{
-  if (!rule && !attrs) {
-    return 0;
-  }
-
-  if (rule) {
-    if (!placement.empty()) {
-      *rule = &placement;
-      if (!attrs) {
-       /* Don't need attrs, done */
-       return 0;
-      }
-    } else {
-      *rule = nullptr;
-    }
-  }
-
-  /* We need either attributes or placement, so we need a read */
-  std::unique_ptr<rgw::sal::Object> meta_obj;
-  meta_obj = get_meta_obj();
-  meta_obj->set_in_extra_data(true);
-
-  multipart_upload_info upload_info;
-  bufferlist headbl;
-
-  /* Read the obj head which contains the multipart_upload_info */
-  std::unique_ptr<rgw::sal::Object::ReadOp> read_op = meta_obj->get_read_op();
-  meta_obj->set_prefetch_data();
-
-  int ret = read_op->prepare(y, dpp);
-  if (ret < 0) {
-    if (ret == -ENOENT) {
-      return -ERR_NO_SUCH_UPLOAD;
-    }
-    return ret;
-  }
-
-  extract_span_context(meta_obj->get_attrs(), trace_ctx);
-
-  if (attrs) {
-    /* Attrs are filled in by prepare */
-    *attrs = meta_obj->get_attrs();
-    if (!rule || *rule != nullptr) {
-      /* placement was cached; don't actually read */
-      return 0;
-    }
-  }
-
-  /* Now read the placement from the head */
-  ret = read_op->read(0, store->ctx()->_conf->rgw_max_chunk_size, headbl, y, dpp);
-  if (ret < 0) {
-    if (ret == -ENOENT) {
-      return -ERR_NO_SUCH_UPLOAD;
-    }
-    return ret;
-  }
-
-  if (headbl.length() <= 0) {
-    return -ERR_NO_SUCH_UPLOAD;
-  }
-
-  /* Decode multipart_upload_info */
-  auto hiter = headbl.cbegin();
-  try {
-    decode(upload_info, hiter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode multipart upload info" << dendl;
-    return -EIO;
-  }
-  placement = upload_info.dest_placement;
-  *rule = &placement;
-
-  return 0;
-}
-
-std::unique_ptr<Writer> RadosMultipartUpload::get_writer(
-                                 const DoutPrefixProvider *dpp,
-                                 optional_yield y,
-                                 std::unique_ptr<rgw::sal::Object> _head_obj,
-                                 const rgw_user& owner,
-                                 const rgw_placement_rule *ptail_placement_rule,
-                                 uint64_t part_num,
-                                 const std::string& part_num_str)
-{
-  auto aio = rgw::make_throttle(store->ctx()->_conf->rgw_put_obj_min_window_size, y);
-  return std::make_unique<RadosMultipartWriter>(dpp, y, this,
-                                std::move(_head_obj), store, std::move(aio), owner,
-                                ptail_placement_rule, part_num, part_num_str);
-}
-
-MPRadosSerializer::MPRadosSerializer(const DoutPrefixProvider *dpp, RadosStore* store, RadosObject* obj, const std::string& lock_name) :
-  lock(lock_name)
-{
-  rgw_pool meta_pool;
-  rgw_raw_obj raw_obj;
-
-  obj->get_raw_obj(&raw_obj);
-  oid = raw_obj.oid;
-  store->getRados()->get_obj_data_pool(obj->get_bucket()->get_placement_rule(),
-                                      obj->get_obj(), &meta_pool);
-  store->getRados()->open_pool_ctx(dpp, meta_pool, ioctx, true);
-}
-
-int MPRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y)
-{
-  op.assert_exists();
-  lock.set_duration(dur);
-  lock.lock_exclusive(&op);
-  int ret = rgw_rados_operate(dpp, ioctx, oid, &op, y);
-  if (! ret) {
-    locked = true;
-  }
-  return ret;
-}
-
-LCRadosSerializer::LCRadosSerializer(RadosStore* store, const std::string& _oid, const std::string& lock_name, const std::string& cookie) :
-  StoreLCSerializer(_oid),
-  lock(lock_name)
-{
-  ioctx = &store->getRados()->lc_pool_ctx;
-  lock.set_cookie(cookie);
-}
-
-int LCRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y)
-{
-  lock.set_duration(dur);
-  return lock.lock_exclusive(ioctx, oid);
-}
-
-int RadosLifecycle::get_entry(const std::string& oid, const std::string& marker,
-                             std::unique_ptr<LCEntry>* entry)
-{
-  cls_rgw_lc_entry cls_entry;
-  int ret = cls_rgw_lc_get_entry(*store->getRados()->get_lc_pool_ctx(), oid, marker, cls_entry);
-  if (ret)
-    return ret;
-
-  LCEntry* e;
-  e = new StoreLCEntry(cls_entry.bucket, cls_entry.start_time, cls_entry.status);
-  if (!e)
-    return -ENOMEM;
-
-  entry->reset(e);
-  return 0;
-}
-
-int RadosLifecycle::get_next_entry(const std::string& oid, const std::string& marker,
-                                  std::unique_ptr<LCEntry>* entry)
-{
-  cls_rgw_lc_entry cls_entry;
-  int ret = cls_rgw_lc_get_next_entry(*store->getRados()->get_lc_pool_ctx(), oid, marker,
-                                     cls_entry);
-
-  if (ret)
-    return ret;
-
-  LCEntry* e;
-  e = new StoreLCEntry(cls_entry.bucket, cls_entry.start_time, cls_entry.status);
-  if (!e)
-    return -ENOMEM;
-
-  entry->reset(e);
-  return 0;
-}
-
-int RadosLifecycle::set_entry(const std::string& oid, LCEntry& entry)
-{
-  cls_rgw_lc_entry cls_entry;
-
-  cls_entry.bucket = entry.get_bucket();
-  cls_entry.start_time = entry.get_start_time();
-  cls_entry.status = entry.get_status();
-
-  return cls_rgw_lc_set_entry(*store->getRados()->get_lc_pool_ctx(), oid, cls_entry);
-}
-
-int RadosLifecycle::list_entries(const std::string& oid, const std::string& marker,
-                                uint32_t max_entries, std::vector<std::unique_ptr<LCEntry>>& entries)
-{
-  entries.clear();
-
-  vector<cls_rgw_lc_entry> cls_entries;
-  int ret = cls_rgw_lc_list(*store->getRados()->get_lc_pool_ctx(), oid, marker, max_entries, cls_entries);
-
-  if (ret < 0)
-    return ret;
-
-  for (auto& entry : cls_entries) {
-    entries.push_back(std::make_unique<StoreLCEntry>(entry.bucket, oid,
-                               entry.start_time, entry.status));
-  }
-
-  return ret;
-}
-
-int RadosLifecycle::rm_entry(const std::string& oid, LCEntry& entry)
-{
-  cls_rgw_lc_entry cls_entry;
-
-  cls_entry.bucket = entry.get_bucket();
-  cls_entry.start_time = entry.get_start_time();
-  cls_entry.status = entry.get_status();
-
-  return cls_rgw_lc_rm_entry(*store->getRados()->get_lc_pool_ctx(), oid, cls_entry);
-}
-
-int RadosLifecycle::get_head(const std::string& oid, std::unique_ptr<LCHead>* head)
-{
-  cls_rgw_lc_obj_head cls_head;
-  int ret = cls_rgw_lc_get_head(*store->getRados()->get_lc_pool_ctx(), oid, cls_head);
-  if (ret)
-    return ret;
-
-  LCHead* h;
-  h = new StoreLCHead(cls_head.start_date, cls_head.shard_rollover_date, cls_head.marker);
-  if (!h)
-    return -ENOMEM;
-
-  head->reset(h);
-  return 0;
-}
-
-int RadosLifecycle::put_head(const std::string& oid, LCHead& head)
-{
-  cls_rgw_lc_obj_head cls_head;
-
-  cls_head.marker = head.get_marker();
-  cls_head.start_date = head.get_start_date();
-  cls_head.shard_rollover_date = head.get_shard_rollover_date();
-
-  return cls_rgw_lc_put_head(*store->getRados()->get_lc_pool_ctx(), oid, cls_head);
-}
-
-std::unique_ptr<LCSerializer> RadosLifecycle::get_serializer(const std::string& lock_name,
-                                                            const std::string& oid,
-                                                            const std::string& cookie)
-{
-  return std::make_unique<LCRadosSerializer>(store, oid, lock_name, cookie);
-}
-
-int RadosNotification::publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags)
-{
-  return rgw::notify::publish_reserve(dpp, event_type, res, obj_tags);
-}
-
-int RadosNotification::publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
-                                    const ceph::real_time& mtime, const std::string& etag, const std::string& version)
-{
-  return rgw::notify::publish_commit(obj, size, mtime, etag, version, event_type, res, dpp);
-}
-
-int RadosAtomicWriter::prepare(optional_yield y)
-{
-  return processor.prepare(y);
-}
-
-int RadosAtomicWriter::process(bufferlist&& data, uint64_t offset)
-{
-  return processor.process(std::move(data), offset);
-}
-
-int RadosAtomicWriter::complete(size_t accounted_size, const std::string& etag,
-                       ceph::real_time *mtime, ceph::real_time set_mtime,
-                       std::map<std::string, bufferlist>& attrs,
-                       ceph::real_time delete_at,
-                       const char *if_match, const char *if_nomatch,
-                       const std::string *user_data,
-                       rgw_zone_set *zones_trace, bool *canceled,
-                       optional_yield y)
-{
-  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
-                           if_match, if_nomatch, user_data, zones_trace, canceled, y);
-}
-
-int RadosAppendWriter::prepare(optional_yield y)
-{
-  return processor.prepare(y);
-}
-
-int RadosAppendWriter::process(bufferlist&& data, uint64_t offset)
-{
-  return processor.process(std::move(data), offset);
-}
-
-int RadosAppendWriter::complete(size_t accounted_size, const std::string& etag,
-                       ceph::real_time *mtime, ceph::real_time set_mtime,
-                       std::map<std::string, bufferlist>& attrs,
-                       ceph::real_time delete_at,
-                       const char *if_match, const char *if_nomatch,
-                       const std::string *user_data,
-                       rgw_zone_set *zones_trace, bool *canceled,
-                       optional_yield y)
-{
-  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
-                           if_match, if_nomatch, user_data, zones_trace, canceled, y);
-}
-
-int RadosMultipartWriter::prepare(optional_yield y)
-{
-  return processor.prepare(y);
-}
-
-int RadosMultipartWriter::process(bufferlist&& data, uint64_t offset)
-{
-  return processor.process(std::move(data), offset);
-}
-
-int RadosMultipartWriter::complete(size_t accounted_size, const std::string& etag,
-                       ceph::real_time *mtime, ceph::real_time set_mtime,
-                       std::map<std::string, bufferlist>& attrs,
-                       ceph::real_time delete_at,
-                       const char *if_match, const char *if_nomatch,
-                       const std::string *user_data,
-                       rgw_zone_set *zones_trace, bool *canceled,
-                       optional_yield y)
-{
-  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
-                           if_match, if_nomatch, user_data, zones_trace, canceled, y);
-}
-
-const std::string& RadosZoneGroup::get_endpoint() const
-{
-  if (!group.endpoints.empty()) {
-      return group.endpoints.front();
-  } else {
-    // use zonegroup's master zone endpoints
-    auto z = group.zones.find(group.master_zone);
-    if (z != group.zones.end() && !z->second.endpoints.empty()) {
-      return z->second.endpoints.front();
-    }
-  }
-  return empty;
-}
-
-bool RadosZoneGroup::placement_target_exists(std::string& target) const
-{
-  return !!group.placement_targets.count(target);
-}
-
-int RadosZoneGroup::get_placement_target_names(std::set<std::string>& names) const
-{
-  for (const auto& target : group.placement_targets) {
-    names.emplace(target.second.name);
-  }
-
-  return 0;
-}
-
-int RadosZoneGroup::get_placement_tier(const rgw_placement_rule& rule,
-                                      std::unique_ptr<PlacementTier>* tier)
-{
-  std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
-  titer = group.placement_targets.find(rule.name);
-  if (titer == group.placement_targets.end()) {
-    return -ENOENT;
-  }
-
-  const auto& target_rule = titer->second;
-  std::map<std::string, RGWZoneGroupPlacementTier>::const_iterator ttier;
-  ttier = target_rule.tier_targets.find(rule.storage_class);
-  if (ttier == target_rule.tier_targets.end()) {
-    // not found
-    return -ENOENT;
-  }
-
-  PlacementTier* t;
-  t = new RadosPlacementTier(store, ttier->second);
-  if (!t)
-    return -ENOMEM;
-
-  tier->reset(t);
-  return 0;
-}
-
-int RadosZoneGroup::get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone)
-{
-  RGWZone* rz = store->svc()->zone->find_zone(id);
-  if (!rz)
-    return -ENOENT;
-
-  Zone* z = new RadosZone(store, clone(), *rz);
-  zone->reset(z);
-  return 0;
-}
-
-int RadosZoneGroup::get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone)
-{
-  rgw_zone_id id;
-  int ret = store->svc()->zone->find_zone_id_by_name(name, &id);
-  if (ret < 0)
-    return ret;
-
-  RGWZone* rz = store->svc()->zone->find_zone(id.id);
-  if (!rz)
-    return -ENOENT;
-
-  Zone* z = new RadosZone(store, clone(), *rz);
-  zone->reset(z);
-  return 0;
-}
-
-int RadosZoneGroup::list_zones(std::list<std::string>& zone_ids)
-{
-  for (const auto& entry : group.zones)
-    {
-      zone_ids.push_back(entry.second.id);
-    }
-  return 0;
-}
-
-std::unique_ptr<Zone> RadosZone::clone()
-{
-  if (local_zone)
-    return std::make_unique<RadosZone>(store, group->clone());
-
-  return std::make_unique<RadosZone>(store, group->clone(), rgw_zone);
-}
-
-const std::string& RadosZone::get_id()
-{
-  if (local_zone)
-    return store->svc()->zone->zone_id().id;
-
-  return rgw_zone.id;
-}
-
-const std::string& RadosZone::get_name() const
-{
-  if (local_zone)
-    return store->svc()->zone->zone_name();
-
-  return rgw_zone.name;
-}
-
-bool RadosZone::is_writeable()
-{
-  if (local_zone)
-    return store->svc()->zone->zone_is_writeable();
-
-  return !rgw_zone.read_only;
-}
-
-bool RadosZone::get_redirect_endpoint(std::string* endpoint)
-{
-  if (local_zone)
-    return store->svc()->zone->get_redirect_zone_endpoint(endpoint);
-
-  endpoint = &rgw_zone.redirect_zone;
-  return true;
-}
-
-bool RadosZone::has_zonegroup_api(const std::string& api) const
-{
-  return store->svc()->zone->has_zonegroup_api(api);
-}
-
-const std::string& RadosZone::get_current_period_id()
-{
-  return store->svc()->zone->get_current_period_id();
-}
-
-const RGWAccessKey& RadosZone::get_system_key()
-{
-  return store->svc()->zone->get_zone_params().system_key;
-}
-
-const std::string& RadosZone::get_realm_name()
-{
-  return store->svc()->zone->get_realm().get_name();
-}
-
-const std::string& RadosZone::get_realm_id()
-{
-  return store->svc()->zone->get_realm().get_id();
-}
-
-const std::string_view RadosZone::get_tier_type()
-{
-  if (local_zone)
-    return store->svc()->zone->get_zone().tier_type;
-
-  return rgw_zone.id;
-}
-
-RGWBucketSyncPolicyHandlerRef RadosZone::get_sync_policy_handler()
-{
-  return store->svc()->zone->get_sync_policy_handler(get_id());
-}
-
-RadosLuaManager::RadosLuaManager(RadosStore* _s) : 
-  store(_s),
-  pool((store->svc() && store->svc()->zone) ? store->svc()->zone->get_zone_params().log_pool : rgw_pool())
-{ }
-
-int RadosLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script)
-{
-  if (pool.empty()) {
-    ldpp_dout(dpp, 10) << "WARNING: missing pool when reading lua script " << dendl;
-    return 0;
-  }
-  bufferlist bl;
-
-  int r = rgw_get_system_obj(store->svc()->sysobj, pool, key, bl, nullptr, nullptr, y, dpp);
-  if (r < 0) {
-    return r;
-  }
-
-  auto iter = bl.cbegin();
-  try {
-    ceph::decode(script, iter);
-  } catch (buffer::error& err) {
-    return -EIO;
-  }
-
-  return 0;
-}
-
-int RadosLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script)
-{
-  if (pool.empty()) {
-    ldpp_dout(dpp, 10) << "WARNING: missing pool when writing lua script " << dendl;
-    return 0;
-  }
-  bufferlist bl;
-  ceph::encode(script, bl);
-
-  int r = rgw_put_system_obj(dpp, store->svc()->sysobj, pool, key, bl, false, nullptr, real_time(), y);
-  if (r < 0) {
-    return r;
-  }
-
-  return 0;
-}
-
-int RadosLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key)
-{
-  if (pool.empty()) {
-    ldpp_dout(dpp, 10) << "WARNING: missing pool when deleting lua script " << dendl;
-    return 0;
-  }
-  int r = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, key, nullptr, y);
-  if (r < 0 && r != -ENOENT) {
-    return r;
-  }
-
-  return 0;
-}
-
-const std::string PACKAGE_LIST_OBJECT_NAME = "lua_package_allowlist";
-
-int RadosLuaManager::add_package(const DoutPrefixProvider *dpp, optional_yield y, const std::string& package_name)
-{
-  // add package to list
-  const bufferlist empty_bl;
-  std::map<std::string, bufferlist> new_package{{package_name, empty_bl}};
-  librados::ObjectWriteOperation op;
-  op.omap_set(new_package);
-  auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
-      PACKAGE_LIST_OBJECT_NAME, &op, y);
-
-  if (ret < 0) {
-    return ret;
-  }
-  return 0;
-}
-
-int RadosLuaManager::remove_package(const DoutPrefixProvider *dpp, optional_yield y, const std::string& package_name)
-{
-  librados::ObjectWriteOperation op;
-  size_t pos = package_name.find(" ");
-  if (pos != package_name.npos) {
-    // remove specfic version of the the package
-    op.omap_rm_keys(std::set<std::string>({package_name}));
-    auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
-        PACKAGE_LIST_OBJECT_NAME, &op, y);
-    if (ret < 0) {
-        return ret;
-    }
-    return 0;
-  }
-  // otherwise, remove any existing versions of the package
-  rgw::lua::packages_t packages;
-  auto ret = list_packages(dpp, y, packages);
-  if (ret < 0 && ret != -ENOENT) {
-    return ret;
-  }
-  for(const auto& package : packages) {
-    const std::string package_no_version = package.substr(0, package.find(" "));
-    if (package_no_version.compare(package_name) == 0) {
-        op.omap_rm_keys(std::set<std::string>({package}));
-        ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
-            PACKAGE_LIST_OBJECT_NAME, &op, y);
-        if (ret < 0) {
-            return ret;
-        }
-    }
-  }
-  return 0;
-}
-
-int RadosLuaManager::list_packages(const DoutPrefixProvider *dpp, optional_yield y, rgw::lua::packages_t& packages)
-{
-  constexpr auto max_chunk = 1024U;
-  std::string start_after;
-  bool more = true;
-  int rval;
-  while (more) {
-    librados::ObjectReadOperation op;
-    rgw::lua::packages_t packages_chunk;
-    op.omap_get_keys2(start_after, max_chunk, &packages_chunk, &more, &rval);
-    const auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
-      PACKAGE_LIST_OBJECT_NAME, &op, nullptr, y);
-
-    if (ret < 0) {
-      return ret;
-    }
-
-    packages.merge(packages_chunk);
-  }
-
-  return 0;
-}
-
-int RadosOIDCProvider::store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y)
-{
-  auto sysobj = store->svc()->sysobj;
-  std::string oid = tenant + get_url_oid_prefix() + url;
-
-  bufferlist bl;
-  using ceph::encode;
-  encode(*this, bl);
-  return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().oidc_pool, oid, bl, exclusive, nullptr, real_time(), y);
-}
-
-int RadosOIDCProvider::read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant)
-{
-  auto sysobj = store->svc()->sysobj;
-  auto& pool = store->svc()->zone->get_zone_params().oidc_pool;
-  std::string oid = tenant + get_url_oid_prefix() + url;
-  bufferlist bl;
-
-  int ret = rgw_get_system_obj(sysobj, pool, oid, bl, nullptr, nullptr, null_yield, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  try {
-    using ceph::decode;
-    auto iter = bl.cbegin();
-    decode(*this, iter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode oidc provider info from pool: " << pool.name <<
-                  ": " << url << dendl;
-    return -EIO;
-  }
-
-  return 0;
-}
-
-int RadosOIDCProvider::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  auto& pool = store->svc()->zone->get_zone_params().oidc_pool;
-
-  std::string url, tenant;
-  auto ret = get_tenant_url_from_arn(tenant, url);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to parse arn" << dendl;
-    return -EINVAL;
-  }
-
-  if (this->tenant != tenant) {
-    ldpp_dout(dpp, 0) << "ERROR: tenant in arn doesn't match that of user " << this->tenant << ", "
-                  << tenant << ": " << dendl;
-    return -EINVAL;
-  }
-
-  // Delete url
-  std::string oid = tenant + get_url_oid_prefix() + url;
-  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: deleting oidc url from pool: " << pool.name << ": "
-                  << provider_url << ": " << cpp_strerror(-ret) << dendl;
-  }
-
-  return ret;
-}
-
-int RadosRole::store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
-{
-  using ceph::encode;
-  std::string oid;
-
-  oid = info.id;
-
-  bufferlist bl;
-  encode(this->info, bl);
-
-  if (!this->info.tags.empty()) {
-    bufferlist bl_tags;
-    encode(this->info.tags, bl_tags);
-    map<string, bufferlist> attrs;
-    attrs.emplace("tagging", bl_tags);
-
-    RGWSI_MBSObj_PutParams params(bl, &attrs, info.mtime, exclusive);
-    std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
-    ctx->init(store->svc()->role->get_be_handler());
-    return store->svc()->role->svc.meta_be->put(ctx.get(), oid, params, &info.objv_tracker, y, dpp);
-  } else {
-    RGWSI_MBSObj_PutParams params(bl, nullptr, info.mtime, exclusive);
-    std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
-    ctx->init(store->svc()->role->get_be_handler());
-    return store->svc()->role->svc.meta_be->put(ctx.get(), oid, params, &info.objv_tracker, y, dpp);
-  }
-}
-
-int RadosRole::store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
-{
-  auto sysobj = store->svc()->sysobj;
-  RGWNameToId nameToId;
-  nameToId.obj_id = info.id;
-
-  std::string oid = info.tenant + get_names_oid_prefix() + info.name;
-
-  bufferlist bl;
-  using ceph::encode;
-  encode(nameToId, bl);
-
-  return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, exclusive, &info.objv_tracker, real_time(), y);
-}
-
-int RadosRole::store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
-{
-  auto sysobj = store->svc()->sysobj;
-  std::string oid = info.tenant + get_path_oid_prefix() + info.path + get_info_oid_prefix() + info.id;
-
-  bufferlist bl;
-
-  return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, exclusive, &info.objv_tracker, real_time(), y);
-}
-
-int RadosRole::read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y)
-{
-  auto sysobj = store->svc()->sysobj;
-  std::string oid = info.tenant + get_names_oid_prefix() + role_name;
-  bufferlist bl;
-
-  int ret = rgw_get_system_obj(sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, nullptr, nullptr, null_yield, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  RGWNameToId nameToId;
-  try {
-    auto iter = bl.cbegin();
-    using ceph::decode;
-    decode(nameToId, iter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode role from Role pool: " << role_name << dendl;
-    return -EIO;
-  }
-  role_id = nameToId.obj_id;
-  return 0;
-}
-
-int RadosRole::read_name(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  auto sysobj = store->svc()->sysobj;
-  std::string oid = info.tenant + get_names_oid_prefix() + info.name;
-  bufferlist bl;
-
-  int ret = rgw_get_system_obj(sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, nullptr, nullptr, null_yield, dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed reading role name from Role pool: " << info.name <<
-      ": " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  RGWNameToId nameToId;
-  try {
-    using ceph::decode;
-    auto iter = bl.cbegin();
-    decode(nameToId, iter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode role name from Role pool: " << info.name << dendl;
-    return -EIO;
-  }
-  info.id = nameToId.obj_id;
-  return 0;
-}
-
-int RadosRole::read_info(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  std::string oid;
-
-  oid = info.id;
-  ldpp_dout(dpp, 20) << "INFO: oid in read_info is: " << oid << dendl;
-
-  bufferlist bl;
-
-  RGWSI_MBSObj_GetParams params(&bl, &info.attrs, &info.mtime);
-  std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
-  ctx->init(store->svc()->role->get_be_handler());
-  int ret = store->svc()->role->svc.meta_be->get(ctx.get(), oid, params, &info.objv_tracker, y, dpp, true);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed reading role info from Role pool: " << info.id << ": " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  try {
-    using ceph::decode;
-    auto iter = bl.cbegin();
-    decode(this->info, iter);
-  } catch (buffer::error& err) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to decode role info from Role pool: " << info.id << dendl;
-    return -EIO;
-  }
-
-  auto it = info.attrs.find("tagging");
-  if (it != info.attrs.end()) {
-    bufferlist bl_tags = it->second;
-    try {
-      using ceph::decode;
-      auto iter = bl_tags.cbegin();
-      decode(info.tags, iter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to decode attrs" << info.id << dendl;
-      return -EIO;
-    }
-  }
-
-  return 0;
-}
-
-int RadosRole::create(const DoutPrefixProvider *dpp, bool exclusive, const std::string& role_id, optional_yield y)
-{
-  int ret;
-
-  if (! validate_input(dpp)) {
-    return -EINVAL;
-  }
-
-  if (!role_id.empty()) {
-    info.id = role_id;
-  }
-
-  /* check to see the name is not used */
-  ret = read_id(dpp, info.name, info.tenant, info.id, y);
-  if (exclusive && ret == 0) {
-    ldpp_dout(dpp, 0) << "ERROR: name " << info.name << " already in use for role id "
-                    << info.id << dendl;
-    return -EEXIST;
-  } else if ( ret < 0 && ret != -ENOENT) {
-    ldpp_dout(dpp, 0) << "failed reading role id  " << info.id << ": "
-                  << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  if (info.id.empty()) {
-    /* create unique id */
-    uuid_d new_uuid;
-    char uuid_str[37];
-    new_uuid.generate_random();
-    new_uuid.print(uuid_str);
-    info.id = uuid_str;
-  }
-
-  //arn
-  info.arn = role_arn_prefix + info.tenant + ":role" + info.path + info.name;
-
-  // Creation time
-  real_clock::time_point t = real_clock::now();
-
-  struct timeval tv;
-  real_clock::to_timeval(t, tv);
-
-  char buf[30];
-  struct tm result;
-  gmtime_r(&tv.tv_sec, &result);
-  strftime(buf,30,"%Y-%m-%dT%H:%M:%S", &result);
-  sprintf(buf + strlen(buf),".%dZ",(int)tv.tv_usec/1000);
-  info.creation_date.assign(buf, strlen(buf));
-
-  auto& pool = store->svc()->zone->get_zone_params().roles_pool;
-  ret = store_info(dpp, exclusive, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR:  storing role info in Role pool: "
-                  << info.id << ": " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  ret = store_name(dpp, exclusive, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: storing role name in Role pool: "
-                  << info.name << ": " << cpp_strerror(-ret) << dendl;
-
-    //Delete the role info that was stored in the previous call
-    std::string oid = get_info_oid_prefix() + info.id;
-    int info_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
-    if (info_ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: cleanup of role id from Role pool: "
-                  << info.id << ": " << cpp_strerror(-info_ret) << dendl;
-    }
-    return ret;
-  }
-
-  ret = store_path(dpp, exclusive, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: storing role path in Role pool: "
-                  << info.path << ": " << cpp_strerror(-ret) << dendl;
-    //Delete the role info that was stored in the previous call
-    std::string oid = get_info_oid_prefix() + info.id;
-    int info_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
-    if (info_ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: cleanup of role id from Role pool: "
-                  << info.id << ": " << cpp_strerror(-info_ret) << dendl;
-    }
-    //Delete role name that was stored in previous call
-    oid = info.tenant + get_names_oid_prefix() + info.name;
-    int name_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
-    if (name_ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: cleanup of role name from Role pool: "
-                  << info.name << ": " << cpp_strerror(-name_ret) << dendl;
-    }
-    return ret;
-  }
-  return 0;
-}
-
-int RadosRole::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  auto& pool = store->svc()->zone->get_zone_params().roles_pool;
-
-  int ret = read_name(dpp, y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  ret = read_info(dpp, y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  if (! info.perm_policy_map.empty()) {
-    return -ERR_DELETE_CONFLICT;
-  }
-
-  // Delete id
-  std::string oid = get_info_oid_prefix() + info.id;
-  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: deleting role id from Role pool: "
-                  << info.id << ": " << cpp_strerror(-ret) << dendl;
-  }
-
-  // Delete name
-  oid = info.tenant + get_names_oid_prefix() + info.name;
-  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: deleting role name from Role pool: "
-                  << info.name << ": " << cpp_strerror(-ret) << dendl;
-  }
-
-  // Delete path
-  oid = info.tenant + get_path_oid_prefix() + info.path + get_info_oid_prefix() + info.id;
-  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: deleting role path from Role pool: "
-                  << info.path << ": " << cpp_strerror(-ret) << dendl;
-  }
-  return ret;
-}
-
-} // namespace rgw::sal
-
-extern "C" {
-
-void* newRadosStore(void)
-{
-  rgw::sal::RadosStore* store = new rgw::sal::RadosStore();
-  if (store) {
-    RGWRados* rados = new RGWRados();
-
-    if (!rados) {
-      delete store; store = nullptr;
-    } else {
-      store->setRados(rados);
-      rados->set_store(store);
-    }
-  }
-
-  return store;
-}
-
-}
diff --git a/src/rgw/store/rados/rgw_sal_rados.h b/src/rgw/store/rados/rgw_sal_rados.h
deleted file mode 100644 (file)
index 499e099..0000000
+++ /dev/null
@@ -1,959 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2020 Red Hat, Inc.
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include "rgw_sal_store.h"
-#include "rgw_rados.h"
-#include "rgw_notify.h"
-#include "rgw_oidc_provider.h"
-#include "rgw_role.h"
-#include "rgw_multi.h"
-#include "rgw_putobj_processor.h"
-#include "services/svc_tier_rados.h"
-#include "cls/lock/cls_lock_client.h"
-
-namespace rgw { namespace sal {
-
-class RadosMultipartUpload;
-
-class RadosCompletions : public Completions {
-  public:
-    std::list<librados::AioCompletion*> handles;
-    RadosCompletions() {}
-    ~RadosCompletions() = default;
-    virtual int drain() override;
-};
-
-class RadosPlacementTier: public StorePlacementTier {
-  RadosStore* store;
-  RGWZoneGroupPlacementTier tier;
-public:
-  RadosPlacementTier(RadosStore* _store, const RGWZoneGroupPlacementTier& _tier) : store(_store), tier(_tier) {}
-  virtual ~RadosPlacementTier() = default;
-
-  virtual const std::string& get_tier_type() { return tier.tier_type; }
-  virtual const std::string& get_storage_class() { return tier.storage_class; }
-  virtual bool retain_head_object() { return tier.retain_head_object; }
-  RGWZoneGroupPlacementTier& get_rt() { return tier; }
-};
-
-class RadosZoneGroup : public StoreZoneGroup {
-  RadosStore* store;
-  const RGWZoneGroup group;
-  std::string empty;
-public:
-  RadosZoneGroup(RadosStore* _store, const RGWZoneGroup& _group) : store(_store), group(_group) {}
-  virtual ~RadosZoneGroup() = default;
-
-  virtual const std::string& get_id() const override { return group.get_id(); };
-  virtual const std::string& get_name() const override { return group.get_name(); };
-  virtual int equals(const std::string& other_zonegroup) const override {
-    return group.equals(other_zonegroup);
-  };
-  /** Get the endpoint from zonegroup, or from master zone if not set */
-  virtual const std::string& get_endpoint() const override;
-  virtual bool placement_target_exists(std::string& target) const override;
-  virtual bool is_master_zonegroup() const override {
-    return group.is_master_zonegroup();
-  };
-  virtual const std::string& get_api_name() const override { return group.api_name; };
-  virtual int get_placement_target_names(std::set<std::string>& names) const override;
-  virtual const std::string& get_default_placement_name() const override {
-    return group.default_placement.name; };
-  virtual int get_hostnames(std::list<std::string>& names) const override {
-    names = group.hostnames;
-    return 0;
-  };
-  virtual int get_s3website_hostnames(std::list<std::string>& names) const override {
-    names = group.hostnames_s3website;
-    return 0;
-  };
-  virtual int get_zone_count() const override {
-    return group.zones.size();
-  }
-  virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr<PlacementTier>* tier);
-  virtual int get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone) override;
-  virtual int get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone) override;
-  virtual int list_zones(std::list<std::string>& zone_ids) override;
-  virtual std::unique_ptr<ZoneGroup> clone() override {
-    return std::make_unique<RadosZoneGroup>(store, group);
-  }
-  const RGWZoneGroup& get_group() const { return group; }
-};
-
-class RadosZone : public StoreZone {
-  protected:
-    RadosStore* store;
-    std::unique_ptr<ZoneGroup> group;
-    RGWZone rgw_zone;
-    bool local_zone{false};
-  public:
-    RadosZone(RadosStore* _store, std::unique_ptr<ZoneGroup> _zg) : store(_store), group(std::move(_zg)), local_zone(true) {}
-    RadosZone(RadosStore* _store, std::unique_ptr<ZoneGroup> _zg, RGWZone& z) : store(_store), group(std::move(_zg)), rgw_zone(z) {}
-    ~RadosZone() = default;
-
-    virtual std::unique_ptr<Zone> clone() override;
-    virtual ZoneGroup& get_zonegroup() override { return *(group.get()); }
-    virtual const std::string& get_id() override;
-    virtual const std::string& get_name() const override;
-    virtual bool is_writeable() override;
-    virtual bool get_redirect_endpoint(std::string* endpoint) override;
-    virtual bool has_zonegroup_api(const std::string& api) const override;
-    virtual const std::string& get_current_period_id() override;
-    virtual const RGWAccessKey& get_system_key() override;
-    virtual const std::string& get_realm_name() override;
-    virtual const std::string& get_realm_id() override;
-    virtual const std::string_view get_tier_type() override;
-    virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() override;
-};
-
-class RadosStore : public StoreDriver {
-  private:
-    RGWRados* rados;
-    RGWUserCtl* user_ctl;
-    std::string luarocks_path;
-    std::unique_ptr<RadosZone> zone;
-
-  public:
-    RadosStore()
-      : rados(nullptr) {
-      }
-    ~RadosStore() {
-      delete rados;
-    }
-
-    virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) override;
-    virtual const std::string get_name() const override {
-      return "rados";
-    }
-    virtual std::string get_cluster_id(const DoutPrefixProvider* dpp,  optional_yield y) override;
-    virtual std::unique_ptr<User> get_user(const rgw_user& u) override;
-    virtual int get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user) override;
-    virtual int get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user) override;
-    virtual int get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user) override;
-    virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
-    virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
-    virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket) override;
-    virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string&name, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
-    virtual bool is_meta_master() override;
-    virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
-                                         bufferlist& in_data, JSONParser* jp, req_info& info,
-                                         optional_yield y) override;
-    virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
-                                            bufferlist& in_data,
-                                            RGWXMLDecoder::XMLParser* parser, req_info& info,
-                                            optional_yield y) override;
-    virtual Zone* get_zone() { return zone.get(); }
-    virtual std::string zone_unique_id(uint64_t unique_num) override;
-    virtual std::string zone_unique_trans_id(const uint64_t unique_num) override;
-    virtual int get_zonegroup(const std::string& id, std::unique_ptr<ZoneGroup>* zonegroup) override;
-    virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids) override;
-    virtual int cluster_stat(RGWClusterStat& stats) override;
-    virtual std::unique_ptr<Lifecycle> get_lifecycle(void) override;
-    virtual std::unique_ptr<Completions> get_completions(void) override;
-    virtual std::unique_ptr<Notification> get_notification(rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, rgw::notify::EventType event_type, const std::string* object_name=nullptr) override;
-    virtual std::unique_ptr<Notification> get_notification(
-    const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj, 
-    rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant,
-    std::string& _req_id, optional_yield y) override;
-    virtual RGWLC* get_rgwlc(void) override { return rados->get_lc(); }
-    virtual RGWCoroutinesManagerRegistry* get_cr_registry() override { return rados->get_cr_registry(); }
-
-    virtual int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info) override;
-    virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) override;
-    virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
-                               const std::map<std::string, std::string>& meta) override;
-    virtual void get_quota(RGWQuota& quota) override;
-    virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) override;
-    virtual int set_buckets_enabled(const DoutPrefixProvider* dpp, std::vector<rgw_bucket>& buckets, bool enabled) override;
-    virtual int get_sync_policy_handler(const DoutPrefixProvider* dpp,
-                                       std::optional<rgw_zone_id> zone,
-                                       std::optional<rgw_bucket> bucket,
-                                       RGWBucketSyncPolicyHandlerRef* phandler,
-                                       optional_yield y) override;
-    virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override;
-    virtual void wakeup_meta_sync_shards(std::set<int>& shard_ids) override { rados->wakeup_meta_sync_shards(shard_ids); }
-    virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, boost::container::flat_map<int, boost::container::flat_set<rgw_data_notify_entry>>& shard_ids) override { rados->wakeup_data_sync_shards(dpp, source_zone, shard_ids); }
-    virtual int clear_usage(const DoutPrefixProvider *dpp) override { return rados->clear_usage(dpp); }
-    virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
-                              uint32_t max_entries, bool* is_truncated,
-                              RGWUsageIter& usage_iter,
-                              std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
-    virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
-    virtual int get_config_key_val(std::string name, bufferlist* bl) override;
-    virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) override;
-    virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list<std::string>& keys, bool* truncated) override;
-    virtual void meta_list_keys_complete(void* handle) override;
-    virtual std::string meta_get_marker(void* handle) override;
-    virtual int meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y) override;
-    virtual const RGWSyncModuleInstanceRef& get_sync_module() { return rados->get_sync_module(); }
-    virtual std::string get_host_id() { return rados->host_id; }
-    virtual std::unique_ptr<LuaManager> get_lua_manager() override;
-    virtual std::unique_ptr<RGWRole> get_role(std::string name,
-                                             std::string tenant,
-                                             std::string path="",
-                                             std::string trust_policy="",
-                                             std::string max_session_duration_str="",
-                std::multimap<std::string,std::string> tags={}) override;
-    virtual std::unique_ptr<RGWRole> get_role(std::string id) override;
-    virtual std::unique_ptr<RGWRole> get_role(const RGWRoleInfo& info) override;
-    virtual int get_roles(const DoutPrefixProvider *dpp,
-                         optional_yield y,
-                         const std::string& path_prefix,
-                         const std::string& tenant,
-                         std::vector<std::unique_ptr<RGWRole>>& roles) override;
-    virtual std::unique_ptr<RGWOIDCProvider> get_oidc_provider() override;
-    virtual int get_oidc_providers(const DoutPrefixProvider *dpp,
-                                  const std::string& tenant,
-                                  std::vector<std::unique_ptr<RGWOIDCProvider>>& providers) override;
-    virtual std::unique_ptr<Writer> get_append_writer(const DoutPrefixProvider *dpp,
-                                 optional_yield y,
-                                 std::unique_ptr<rgw::sal::Object> _head_obj,
-                                 const rgw_user& owner,
-                                 const rgw_placement_rule *ptail_placement_rule,
-                                 const std::string& unique_tag,
-                                 uint64_t position,
-                                 uint64_t *cur_accounted_size) override;
-    virtual std::unique_ptr<Writer> get_atomic_writer(const DoutPrefixProvider *dpp,
-                                 optional_yield y,
-                                 std::unique_ptr<rgw::sal::Object> _head_obj,
-                                 const rgw_user& owner,
-                                 const rgw_placement_rule *ptail_placement_rule,
-                                 uint64_t olh_epoch,
-                                 const std::string& unique_tag) override;
-    virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override;
-    virtual bool valid_placement(const rgw_placement_rule& rule) override;
-
-    virtual void finalize(void) override;
-
-    virtual CephContext* ctx(void) override { return rados->ctx(); }
-
-    virtual const std::string& get_luarocks_path() const override {
-      return luarocks_path;
-    }
-
-    virtual void set_luarocks_path(const std::string& path) override {
-      luarocks_path = path;
-    }
-    virtual void register_admin_apis(RGWRESTMgr* mgr) override;
-
-    /* Unique to RadosStore */
-    int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
-                          librados::IoCtx* ioctx);
-    int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj);
-    int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, Completions* aio);
-    void get_raw_obj(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj* raw_obj);
-    int get_raw_chunk_size(const DoutPrefixProvider* dpp, const rgw_raw_obj& obj, uint64_t* chunk_size);
-
-    void setRados(RGWRados * st) { rados = st; }
-    RGWRados* getRados(void) { return rados; }
-
-    RGWServices* svc() { return &rados->svc; }
-    const RGWServices* svc() const { return &rados->svc; }
-    RGWCtl* ctl() { return &rados->ctl; }
-    const RGWCtl* ctl() const { return &rados->ctl; }
-
-    void setUserCtl(RGWUserCtl *_ctl) { user_ctl = _ctl; }
-};
-
-class RadosUser : public StoreUser {
-  private:
-    RadosStore* store;
-
-  public:
-    RadosUser(RadosStore *_st, const rgw_user& _u) : StoreUser(_u), store(_st) { }
-    RadosUser(RadosStore *_st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) { }
-    RadosUser(RadosStore *_st) : store(_st) { }
-    RadosUser(RadosUser& _o) = default;
-
-    virtual std::unique_ptr<User> clone() override {
-      return std::unique_ptr<User>(new RadosUser(*this));
-    }
-    int list_buckets(const DoutPrefixProvider* dpp, const std::string& marker, const std::string& end_marker,
-                    uint64_t max, bool need_stats, BucketList& buckets,
-                    optional_yield y) override;
-    virtual int create_bucket(const DoutPrefixProvider* dpp,
-                            const rgw_bucket& b,
-                            const std::string& zonegroup_id,
-                            rgw_placement_rule& placement_rule,
-                            std::string& swift_ver_location,
-                            const RGWQuotaInfo * pquota_info,
-                            const RGWAccessControlPolicy& policy,
-                           Attrs& attrs,
-                            RGWBucketInfo& info,
-                            obj_version& ep_objv,
-                           bool exclusive,
-                           bool obj_lock_enabled,
-                           bool* existed,
-                           req_info& req_info,
-                           std::unique_ptr<Bucket>* bucket,
-                           optional_yield y) override;
-    virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override;
-    virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) override;
-    virtual int read_stats(const DoutPrefixProvider *dpp,
-                           optional_yield y, RGWStorageStats* stats,
-                          ceph::real_time* last_stats_sync = nullptr,
-                          ceph::real_time* last_stats_update = nullptr) override;
-    virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) override;
-    virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
-    virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
-                          bool* is_truncated, RGWUsageIter& usage_iter,
-                          std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
-    virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
-
-    virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override;
-    virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) override;
-    virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override;
-    virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) override;
-
-    friend class RadosBucket;
-};
-
-class RadosObject : public StoreObject {
-  private:
-    RadosStore* store;
-    RGWAccessControlPolicy acls;
-    RGWObjManifest *manifest{nullptr};
-    RGWObjectCtx* rados_ctx;
-    bool rados_ctx_owned;
-
-  public:
-
-    struct RadosReadOp : public ReadOp {
-    private:
-      RadosObject* source;
-      RGWObjectCtx* rctx;
-      RGWRados::Object op_target;
-      RGWRados::Object::Read parent_op;
-
-    public:
-      RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx);
-
-      virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override;
-
-      /*
-       * Both `read` and `iterate` read up through index `end`
-       * *inclusive*. The number of bytes that could be returned is
-       * `end - ofs + 1`.
-       */
-      virtual int read(int64_t ofs, int64_t end,
-                      bufferlist& bl, optional_yield y,
-                      const DoutPrefixProvider* dpp) override;
-      virtual int iterate(const DoutPrefixProvider* dpp,
-                         int64_t ofs, int64_t end,
-                         RGWGetDataCB* cb, optional_yield y) override;
-
-        virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) override;
-    };
-
-    struct RadosDeleteOp : public DeleteOp {
-    private:
-      RadosObject* source;
-      RGWRados::Object op_target;
-      RGWRados::Object::Delete parent_op;
-
-    public:
-      RadosDeleteOp(RadosObject* _source);
-
-      virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override;
-    };
-
-    RadosObject(RadosStore *_st, const rgw_obj_key& _k)
-      : StoreObject(_k),
-       store(_st),
-        acls(),
-       rados_ctx(new RGWObjectCtx(dynamic_cast<Driver*>(store))),
-       rados_ctx_owned(true) {
-    }
-    RadosObject(RadosStore *_st, const rgw_obj_key& _k, Bucket* _b)
-      : StoreObject(_k, _b),
-       store(_st),
-        acls(),
-       rados_ctx(new RGWObjectCtx(dynamic_cast<Driver*>(store))) ,
-       rados_ctx_owned(true) {
-    }
-    RadosObject(RadosObject& _o) : StoreObject(_o) {
-      store = _o.store;
-      acls = _o.acls;
-      manifest = _o.manifest;
-      rados_ctx = _o.rados_ctx;
-      rados_ctx_owned = false;
-    }
-
-    virtual ~RadosObject();
-
-    virtual void invalidate() override {
-      StoreObject::invalidate();
-      rados_ctx->invalidate(get_obj());
-    }
-    virtual int delete_object(const DoutPrefixProvider* dpp,
-                             optional_yield y, bool prevent_versioning) override;
-    virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* aio,
-                              bool keep_index_consistent, optional_yield y) override;
-    virtual int copy_object(User* user,
-               req_info* info, const rgw_zone_id& source_zone,
-               rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket,
-               rgw::sal::Bucket* src_bucket,
-               const rgw_placement_rule& dest_placement,
-               ceph::real_time* src_mtime, ceph::real_time* mtime,
-               const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr,
-               bool high_precision_time,
-               const char* if_match, const char* if_nomatch,
-               AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs,
-               RGWObjCategory category, uint64_t olh_epoch,
-              boost::optional<ceph::real_time> delete_at,
-               std::string* version_id, std::string* tag, std::string* etag,
-               void (*progress_cb)(off_t, void *), void* progress_data,
-               const DoutPrefixProvider* dpp, optional_yield y) override;
-    virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
-    virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; }
-    virtual void set_atomic() override {
-      rados_ctx->set_atomic(state.obj);
-      StoreObject::set_atomic();
-    }
-    virtual void set_prefetch_data() override {
-      rados_ctx->set_prefetch_data(state.obj);
-      StoreObject::set_prefetch_data();
-    }
-    virtual void set_compressed() override {
-      rados_ctx->set_compressed(state.obj);
-      StoreObject::set_compressed();
-    }
-
-    virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) override;
-    virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) override;
-    virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override;
-    virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override;
-    virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) override;
-    virtual bool is_expired() override;
-    virtual void gen_rand_obj_instance_name() override;
-    void get_raw_obj(rgw_raw_obj* raw_obj);
-    virtual std::unique_ptr<Object> clone() override {
-      return std::unique_ptr<Object>(new RadosObject(*this));
-    }
-    virtual std::unique_ptr<MPSerializer> get_serializer(const DoutPrefixProvider *dpp,
-                                                        const std::string& lock_name) override;
-    virtual int transition(Bucket* bucket,
-                          const rgw_placement_rule& placement_rule,
-                          const real_time& mtime,
-                          uint64_t olh_epoch,
-                          const DoutPrefixProvider* dpp,
-                          optional_yield y) override;
-    virtual int transition_to_cloud(Bucket* bucket,
-                          rgw::sal::PlacementTier* tier,
-                          rgw_bucket_dir_entry& o,
-                          std::set<std::string>& cloud_targets,
-                          CephContext* cct,
-                          bool update_object,
-                          const DoutPrefixProvider* dpp,
-                          optional_yield y) override;
-    virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override;
-    virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override;
-
-    /* Swift versioning */
-    virtual int swift_versioning_restore(bool& restored,
-                                        const DoutPrefixProvider* dpp) override;
-    virtual int swift_versioning_copy(const DoutPrefixProvider* dpp,
-                                     optional_yield y) override;
-
-    /* OPs */
-    virtual std::unique_ptr<ReadOp> get_read_op() override;
-    virtual std::unique_ptr<DeleteOp> get_delete_op() override;
-
-    /* OMAP */
-    virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
-                             std::map<std::string, bufferlist> *m,
-                             bool* pmore, optional_yield y) override;
-    virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
-                            optional_yield y) override;
-    virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
-                             const std::set<std::string>& keys,
-                             Attrs* vals) override;
-    virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
-                                   bool must_exist, optional_yield y) override;
-
-    /* Internal to RadosStore */
-    int get_max_chunk_size(const DoutPrefixProvider* dpp,
-                          rgw_placement_rule placement_rule,
-                          uint64_t* max_chunk_size,
-                          uint64_t* alignment = nullptr);
-    void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t* max_size);
-    void raw_obj_to_obj(const rgw_raw_obj& raw_obj);
-    int write_cloud_tier(const DoutPrefixProvider* dpp,
-                          optional_yield y,
-                          uint64_t olh_epoch,
-                          rgw::sal::PlacementTier* tier,
-                          bool is_multipart_upload,
-                          rgw_placement_rule& target_placement,
-                          Object* head_obj);
-    RGWObjManifest* get_manifest() { return manifest; }
-    RGWObjectCtx& get_ctx() { return *rados_ctx; }
-
-  private:
-    int read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Read &read_op, optional_yield y, rgw_obj* target_obj = nullptr);
-};
-
-class RadosBucket : public StoreBucket {
-  private:
-    RadosStore* store;
-    RGWAccessControlPolicy acls;
-
-  public:
-    RadosBucket(RadosStore *_st)
-      : store(_st),
-        acls() {
-    }
-
-    RadosBucket(RadosStore *_st, User* _u)
-      : StoreBucket(_u),
-       store(_st),
-        acls() {
-    }
-
-    RadosBucket(RadosStore *_st, const rgw_bucket& _b)
-      : StoreBucket(_b),
-       store(_st),
-        acls() {
-    }
-
-    RadosBucket(RadosStore *_st, const RGWBucketEnt& _e)
-      : StoreBucket(_e),
-       store(_st),
-        acls() {
-    }
-
-    RadosBucket(RadosStore *_st, const RGWBucketInfo& _i)
-      : StoreBucket(_i),
-       store(_st),
-        acls() {
-    }
-
-    RadosBucket(RadosStore *_st, const rgw_bucket& _b, User* _u)
-      : StoreBucket(_b, _u),
-       store(_st),
-        acls() {
-    }
-
-    RadosBucket(RadosStore *_st, const RGWBucketEnt& _e, User* _u)
-      : StoreBucket(_e, _u),
-       store(_st),
-        acls() {
-    }
-
-    RadosBucket(RadosStore *_st, const RGWBucketInfo& _i, User* _u)
-      : StoreBucket(_i, _u),
-       store(_st),
-        acls() {
-    }
-
-    virtual ~RadosBucket();
-    virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
-    virtual int list(const DoutPrefixProvider* dpp, ListParams&, int, ListResults&, optional_yield y) override;
-    virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) override;
-    virtual int remove_bucket_bypass_gc(int concurrent_max, bool
-                                       keep_index_consistent,
-                                       optional_yield y, const
-                                       DoutPrefixProvider *dpp) override;
-    virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
-    virtual int set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy& acl, optional_yield y) override;
-    virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats = false) override;
-    virtual int read_stats(const DoutPrefixProvider *dpp,
-                           const bucket_index_layout_generation& idx_layout,
-                           int shard_id, std::string* bucket_ver, std::string* master_ver,
-                           std::map<RGWObjCategory, RGWStorageStats>& stats,
-                           std::string* max_marker = nullptr,
-                           bool* syncstopped = nullptr) override;
-    virtual int read_stats_async(const DoutPrefixProvider *dpp,
-                                 const bucket_index_layout_generation& idx_layout,
-                                 int shard_id, RGWGetBucketStats_CB* ctx) override;
-    virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
-    virtual int update_container_stats(const DoutPrefixProvider* dpp) override;
-    virtual int check_bucket_shards(const DoutPrefixProvider* dpp) override;
-    virtual int chown(const DoutPrefixProvider* dpp, User* new_user, User* old_user, optional_yield y, const std::string* marker = nullptr) override;
-    virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time mtime) override;
-    virtual bool is_owner(User* user) override;
-    virtual int check_empty(const DoutPrefixProvider* dpp, optional_yield y) override;
-    virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) override;
-    virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& attrs, optional_yield y) override;
-    virtual int try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime) override;
-    virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
-                          bool* is_truncated, RGWUsageIter& usage_iter,
-                          std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
-    virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
-    virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink) override;
-    virtual int check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats) override;
-    virtual int rebuild_index(const DoutPrefixProvider *dpp) override;
-    virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override;
-    virtual int purge_instance(const DoutPrefixProvider* dpp) override;
-    virtual std::unique_ptr<Bucket> clone() override {
-      return std::make_unique<RadosBucket>(*this);
-    }
-    virtual std::unique_ptr<MultipartUpload> get_multipart_upload(
-                               const std::string& oid,
-                               std::optional<std::string> upload_id=std::nullopt,
-                               ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) override;
-    virtual int list_multiparts(const DoutPrefixProvider *dpp,
-                               const std::string& prefix,
-                               std::string& marker,
-                               const std::string& delim,
-                               const int& max_uploads,
-                               std::vector<std::unique_ptr<MultipartUpload>>& uploads,
-                               std::map<std::string, bool> *common_prefixes,
-                               bool *is_truncated) override;
-    virtual int abort_multiparts(const DoutPrefixProvider* dpp,
-                                CephContext* cct) override;
-
-  private:
-    int link(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint = true, RGWObjVersionTracker* objv = nullptr);
-    int unlink(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint = true);
-    friend class RadosUser;
-};
-
-class RadosMultipartPart : public StoreMultipartPart {
-protected:
-  RGWUploadPartInfo info;
-
-public:
-  RadosMultipartPart() = default;
-  virtual ~RadosMultipartPart() = default;
-
-  virtual uint32_t get_num() { return info.num; }
-  virtual uint64_t get_size() { return info.accounted_size; }
-  virtual const std::string& get_etag() { return info.etag; }
-  virtual ceph::real_time& get_mtime() { return info.modified; }
-
-  /* For RadosStore code */
-  RGWObjManifest& get_manifest() { return info.manifest; }
-
-  friend class RadosMultipartUpload;
-};
-
-class RadosMultipartUpload : public StoreMultipartUpload {
-  RadosStore* store;
-  RGWMPObj mp_obj;
-  ACLOwner owner;
-  ceph::real_time mtime;
-  rgw_placement_rule placement;
-  RGWObjManifest manifest;
-
-public:
-  RadosMultipartUpload(RadosStore* _store, Bucket* _bucket, const std::string& oid,
-                       std::optional<std::string> upload_id, ACLOwner owner,
-                       ceph::real_time _mtime)
-      : StoreMultipartUpload(_bucket), store(_store), mp_obj(oid, upload_id),
-        owner(owner), mtime(_mtime) {}
-  virtual ~RadosMultipartUpload() = default;
-
-  virtual const std::string& get_meta() const override { return mp_obj.get_meta(); }
-  virtual const std::string& get_key() const override { return mp_obj.get_key(); }
-  virtual const std::string& get_upload_id() const override { return mp_obj.get_upload_id(); }
-  virtual const ACLOwner& get_owner() const override { return owner; }
-  virtual ceph::real_time& get_mtime() override { return mtime; }
-  virtual std::unique_ptr<rgw::sal::Object> get_meta_obj() override;
-  virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override;
-  virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
-                        int num_parts, int marker,
-                        int* next_marker, bool* truncated,
-                        bool assume_unsorted = false) override;
-  virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override;
-  virtual int complete(const DoutPrefixProvider* dpp,
-                      optional_yield y, CephContext* cct,
-                      std::map<int, std::string>& part_etags,
-                      std::list<rgw_obj_index_key>& remove_objs,
-                      uint64_t& accounted_size, bool& compressed,
-                      RGWCompressionInfo& cs_info, off_t& ofs,
-                      std::string& tag, ACLOwner& owner,
-                      uint64_t olh_epoch,
-                      rgw::sal::Object* target_obj) override;
-  virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override;
-  virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
-                         optional_yield y,
-                         std::unique_ptr<rgw::sal::Object> _head_obj,
-                         const rgw_user& owner,
-                         const rgw_placement_rule *ptail_placement_rule,
-                         uint64_t part_num,
-                         const std::string& part_num_str) override;
-};
-
-class MPRadosSerializer : public StoreMPSerializer {
-  librados::IoCtx ioctx;
-  rados::cls::lock::Lock lock;
-  librados::ObjectWriteOperation op;
-
-public:
-  MPRadosSerializer(const DoutPrefixProvider *dpp, RadosStore* store, RadosObject* obj, const std::string& lock_name);
-
-  virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override;
-  virtual int unlock() override {
-    return lock.unlock(&ioctx, oid);
-  }
-};
-
-class LCRadosSerializer : public StoreLCSerializer {
-  librados::IoCtx* ioctx;
-  rados::cls::lock::Lock lock;
-
-public:
-  LCRadosSerializer(RadosStore* store, const std::string& oid, const std::string& lock_name, const std::string& cookie);
-
-  virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override;
-  virtual int unlock() override {
-    return lock.unlock(ioctx, oid);
-  }
-};
-
-class RadosLifecycle : public StoreLifecycle {
-  RadosStore* store;
-
-public:
-  RadosLifecycle(RadosStore* _st) : store(_st) {}
-
-  using StoreLifecycle::get_entry;
-  virtual int get_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) override;
-  virtual int get_next_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) override;
-  virtual int set_entry(const std::string& oid, LCEntry& entry) override;
-  virtual int list_entries(const std::string& oid, const std::string& marker,
-                          uint32_t max_entries,
-                          std::vector<std::unique_ptr<LCEntry>>& entries) override;
-  virtual int rm_entry(const std::string& oid, LCEntry& entry) override;
-  virtual int get_head(const std::string& oid, std::unique_ptr<LCHead>* head) override;
-  virtual int put_head(const std::string& oid, LCHead& head) override;
-  virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
-                                                      const std::string& oid,
-                                                      const std::string& cookie) override;
-};
-
-class RadosNotification : public StoreNotification {
-  RadosStore* store;
-  /* XXX it feels incorrect to me that rgw::notify::reservation_t is
-   * currently RADOS-specific; instead, I think notification types such as
-   * reservation_t should be generally visible, whereas the internal
-   * notification behavior should be made portable (e.g., notification
-   * to non-RADOS message sinks) */
-  rgw::notify::reservation_t res;
-
-  public:
-    RadosNotification(const DoutPrefixProvider* _dpp, RadosStore* _store, Object* _obj, Object* _src_obj, req_state* _s, rgw::notify::EventType _type, const std::string* object_name=nullptr) :
-      StoreNotification(_obj, _src_obj, _type), store(_store), res(_dpp, _store, _s, _obj, _src_obj, object_name) { }
-
-    RadosNotification(const DoutPrefixProvider* _dpp, RadosStore* _store, Object* _obj, Object* _src_obj, rgw::notify::EventType _type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y) :
-      StoreNotification(_obj, _src_obj, _type), store(_store), res(_dpp, _store, _obj, _src_obj, _bucket, _user_id, _user_tenant, _req_id, y) {}
-
-    ~RadosNotification() = default;
-
-    rgw::notify::reservation_t& get_reservation(void) {
-      return res;
-    }
-
-    virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) override;
-    virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
-                              const ceph::real_time& mtime, const std::string& etag, const std::string& version) override;
-};
-
-class RadosAtomicWriter : public StoreWriter {
-protected:
-  rgw::sal::RadosStore* store;
-  std::unique_ptr<Aio> aio;
-  RGWObjectCtx* obj_ctx;
-  rgw::putobj::AtomicObjectProcessor processor;
-
-public:
-  RadosAtomicWriter(const DoutPrefixProvider *dpp,
-                   optional_yield y,
-                   std::unique_ptr<rgw::sal::Object> _head_obj,
-                   RadosStore* _store, std::unique_ptr<Aio> _aio,
-                   const rgw_user& owner,
-                   const rgw_placement_rule *ptail_placement_rule,
-                   uint64_t olh_epoch,
-                   const std::string& unique_tag) :
-                       StoreWriter(dpp, y),
-                       store(_store),
-                       aio(std::move(_aio)),
-                       obj_ctx(&dynamic_cast<RadosObject*>(_head_obj.get())->get_ctx()),
-                       processor(&*aio, store,
-                                 ptail_placement_rule, owner, 
-                                 *obj_ctx,
-                                 std::move(_head_obj), olh_epoch, unique_tag,
-                                 dpp, y)
-  {}
-  ~RadosAtomicWriter() = default;
-
-  // prepare to start processing object data
-  virtual int prepare(optional_yield y) override;
-
-  // Process a bufferlist
-  virtual int process(bufferlist&& data, uint64_t offset) override;
-
-  // complete the operation and make its result visible to clients
-  virtual int complete(size_t accounted_size, const std::string& etag,
-                       ceph::real_time *mtime, ceph::real_time set_mtime,
-                       std::map<std::string, bufferlist>& attrs,
-                       ceph::real_time delete_at,
-                       const char *if_match, const char *if_nomatch,
-                       const std::string *user_data,
-                       rgw_zone_set *zones_trace, bool *canceled,
-                       optional_yield y) override;
-};
-
-class RadosAppendWriter : public StoreWriter {
-protected:
-  rgw::sal::RadosStore* store;
-  std::unique_ptr<Aio> aio;
-  RGWObjectCtx* obj_ctx;
-  rgw::putobj::AppendObjectProcessor processor;
-
-public:
-  RadosAppendWriter(const DoutPrefixProvider *dpp,
-                   optional_yield y,
-                   std::unique_ptr<rgw::sal::Object> _head_obj,
-                   RadosStore* _store, std::unique_ptr<Aio> _aio,
-                   const rgw_user& owner,
-                   const rgw_placement_rule *ptail_placement_rule,
-                   const std::string& unique_tag,
-                   uint64_t position,
-                   uint64_t *cur_accounted_size) :
-                       StoreWriter(dpp, y),
-                       store(_store),
-                       aio(std::move(_aio)),
-                       obj_ctx(&dynamic_cast<RadosObject*>(_head_obj.get())->get_ctx()),
-                       processor(&*aio, store,
-                                 ptail_placement_rule, owner,
-                                 *obj_ctx,
-                                 std::move(_head_obj), unique_tag, position,
-                                 cur_accounted_size, dpp, y)
-  {}
-  ~RadosAppendWriter() = default;
-
-  // prepare to start processing object data
-  virtual int prepare(optional_yield y) override;
-
-  // Process a bufferlist
-  virtual int process(bufferlist&& data, uint64_t offset) override;
-
-  // complete the operation and make its result visible to clients
-  virtual int complete(size_t accounted_size, const std::string& etag,
-                       ceph::real_time *mtime, ceph::real_time set_mtime,
-                       std::map<std::string, bufferlist>& attrs,
-                       ceph::real_time delete_at,
-                       const char *if_match, const char *if_nomatch,
-                       const std::string *user_data,
-                       rgw_zone_set *zones_trace, bool *canceled,
-                       optional_yield y) override;
-};
-
-class RadosMultipartWriter : public StoreWriter {
-protected:
-  rgw::sal::RadosStore* store;
-  std::unique_ptr<Aio> aio;
-  RGWObjectCtx* obj_ctx;
-  rgw::putobj::MultipartObjectProcessor processor;
-
-public:
-  RadosMultipartWriter(const DoutPrefixProvider *dpp,
-                      optional_yield y, MultipartUpload* upload,
-                      std::unique_ptr<rgw::sal::Object> _head_obj,
-                      RadosStore* _store, std::unique_ptr<Aio> _aio,
-                      const rgw_user& owner,
-                      const rgw_placement_rule *ptail_placement_rule,
-                      uint64_t part_num, const std::string& part_num_str) :
-                       StoreWriter(dpp, y),
-                       store(_store),
-                       aio(std::move(_aio)),
-                       obj_ctx(&dynamic_cast<RadosObject*>(_head_obj.get())->get_ctx()),
-                       processor(&*aio, store,
-                                 ptail_placement_rule, owner,
-                                 *obj_ctx,
-                                 std::move(_head_obj), upload->get_upload_id(),
-                                 part_num, part_num_str, dpp, y)
-  {}
-  ~RadosMultipartWriter() = default;
-
-  // prepare to start processing object data
-  virtual int prepare(optional_yield y) override;
-
-  // Process a bufferlist
-  virtual int process(bufferlist&& data, uint64_t offset) override;
-
-  // complete the operation and make its result visible to clients
-  virtual int complete(size_t accounted_size, const std::string& etag,
-                       ceph::real_time *mtime, ceph::real_time set_mtime,
-                       std::map<std::string, bufferlist>& attrs,
-                       ceph::real_time delete_at,
-                       const char *if_match, const char *if_nomatch,
-                       const std::string *user_data,
-                       rgw_zone_set *zones_trace, bool *canceled,
-                       optional_yield y) override;
-};
-
-class RadosLuaManager : public StoreLuaManager {
-  RadosStore* const store;
-  rgw_pool pool;
-
-public:
-  RadosLuaManager(RadosStore* _s);
-  virtual ~RadosLuaManager() = default;
-
-  virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script);
-  virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script);
-  virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key);
-  virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name);
-  virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name);
-  virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages);
-};
-
-class RadosOIDCProvider : public RGWOIDCProvider {
-  RadosStore* store;
-public:
-  RadosOIDCProvider(RadosStore* _store) : store(_store) {}
-  ~RadosOIDCProvider() = default;
-
-  virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) override;
-  virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) override;
-  virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override;
-  void encode(bufferlist& bl) const {
-    RGWOIDCProvider::encode(bl);
-  }
-  void decode(bufferlist::const_iterator& bl) {
-    RGWOIDCProvider::decode(bl);
-  }
-};
-
-class RadosRole : public RGWRole {
-  RadosStore* store;
-public:
-  RadosRole(RadosStore* _store, std::string name,
-          std::string tenant,
-          std::string path,
-          std::string trust_policy,
-          std::string max_session_duration,
-          std::multimap<std::string,std::string> tags) : RGWRole(name, tenant, path, trust_policy, max_session_duration, tags), store(_store) {}
-  RadosRole(RadosStore* _store, std::string id) : RGWRole(id), store(_store) {}
-  RadosRole(RadosStore* _store, const RGWRoleInfo& info) : RGWRole(info), store(_store) {}
-  RadosRole(RadosStore* _store) : store(_store) {}
-  ~RadosRole() = default;
-
-  virtual int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
-  virtual int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
-  virtual int store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
-  virtual int read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y) override;
-  virtual int read_name(const DoutPrefixProvider *dpp, optional_yield y) override;
-  virtual int read_info(const DoutPrefixProvider *dpp, optional_yield y) override;
-  virtual int create(const DoutPrefixProvider *dpp, bool exclusive, const std::string& role_id, optional_yield y) override;
-  virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override;
-};
-}} // namespace rgw::sal
-
-WRITE_CLASS_ENCODER(rgw::sal::RadosOIDCProvider)
diff --git a/src/rgw/store/rados/rgw_service.cc b/src/rgw/store/rados/rgw_service.cc
deleted file mode 100644 (file)
index 4fcb1eb..0000000
+++ /dev/null
@@ -1,476 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_service.h"
-
-#include "services/svc_finisher.h"
-#include "services/svc_bi_rados.h"
-#include "services/svc_bilog_rados.h"
-#include "services/svc_bucket_sobj.h"
-#include "services/svc_bucket_sync_sobj.h"
-#include "services/svc_cls.h"
-#include "services/svc_config_key_rados.h"
-#include "services/svc_mdlog.h"
-#include "services/svc_meta.h"
-#include "services/svc_meta_be.h"
-#include "services/svc_meta_be_sobj.h"
-#include "services/svc_meta_be_otp.h"
-#include "services/svc_notify.h"
-#include "services/svc_otp.h"
-#include "services/svc_rados.h"
-#include "services/svc_zone.h"
-#include "services/svc_zone_utils.h"
-#include "services/svc_quota.h"
-#include "services/svc_sync_modules.h"
-#include "services/svc_sys_obj.h"
-#include "services/svc_sys_obj_cache.h"
-#include "services/svc_sys_obj_core.h"
-#include "services/svc_user_rados.h"
-#include "services/svc_role_rados.h"
-
-#include "common/errno.h"
-
-#include "rgw_bucket.h"
-#include "rgw_datalog.h"
-#include "rgw_metadata.h"
-#include "rgw_otp.h"
-#include "rgw_user.h"
-#include "rgw_role.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-RGWServices_Def::RGWServices_Def() = default;
-RGWServices_Def::~RGWServices_Def()
-{
-  shutdown();
-}
-
-int RGWServices_Def::init(CephContext *cct,
-                         bool have_cache,
-                          bool raw,
-                         bool run_sync,
-                         optional_yield y,
-                          const DoutPrefixProvider *dpp)
-{
-  finisher = std::make_unique<RGWSI_Finisher>(cct);
-  bucket_sobj = std::make_unique<RGWSI_Bucket_SObj>(cct);
-  bucket_sync_sobj = std::make_unique<RGWSI_Bucket_Sync_SObj>(cct);
-  bi_rados = std::make_unique<RGWSI_BucketIndex_RADOS>(cct);
-  bilog_rados = std::make_unique<RGWSI_BILog_RADOS>(cct);
-  cls = std::make_unique<RGWSI_Cls>(cct);
-  config_key_rados = std::make_unique<RGWSI_ConfigKey_RADOS>(cct);
-  datalog_rados = std::make_unique<RGWDataChangesLog>(cct);
-  mdlog = std::make_unique<RGWSI_MDLog>(cct, run_sync);
-  meta = std::make_unique<RGWSI_Meta>(cct);
-  meta_be_sobj = std::make_unique<RGWSI_MetaBackend_SObj>(cct);
-  meta_be_otp = std::make_unique<RGWSI_MetaBackend_OTP>(cct);
-  notify = std::make_unique<RGWSI_Notify>(cct);
-  otp = std::make_unique<RGWSI_OTP>(cct);
-  rados = std::make_unique<RGWSI_RADOS>(cct);
-  zone = std::make_unique<RGWSI_Zone>(cct);
-  zone_utils = std::make_unique<RGWSI_ZoneUtils>(cct);
-  quota = std::make_unique<RGWSI_Quota>(cct);
-  sync_modules = std::make_unique<RGWSI_SyncModules>(cct);
-  sysobj = std::make_unique<RGWSI_SysObj>(cct);
-  sysobj_core = std::make_unique<RGWSI_SysObj_Core>(cct);
-  user_rados = std::make_unique<RGWSI_User_RADOS>(cct);
-  role_rados = std::make_unique<RGWSI_Role_RADOS>(cct);
-
-  if (have_cache) {
-    sysobj_cache = std::make_unique<RGWSI_SysObj_Cache>(dpp, cct);
-  }
-
-  vector<RGWSI_MetaBackend *> meta_bes{meta_be_sobj.get(), meta_be_otp.get()};
-
-  finisher->init();
-  bi_rados->init(zone.get(), rados.get(), bilog_rados.get(), datalog_rados.get());
-  bilog_rados->init(bi_rados.get());
-  bucket_sobj->init(zone.get(), sysobj.get(), sysobj_cache.get(),
-                    bi_rados.get(), meta.get(), meta_be_sobj.get(),
-                    sync_modules.get(), bucket_sync_sobj.get());
-  bucket_sync_sobj->init(zone.get(),
-                         sysobj.get(),
-                         sysobj_cache.get(),
-                         bucket_sobj.get());
-  cls->init(zone.get(), rados.get());
-  config_key_rados->init(rados.get());
-  mdlog->init(rados.get(), zone.get(), sysobj.get(), cls.get());
-  meta->init(sysobj.get(), mdlog.get(), meta_bes);
-  meta_be_sobj->init(sysobj.get(), mdlog.get());
-  meta_be_otp->init(sysobj.get(), mdlog.get(), cls.get());
-  notify->init(zone.get(), rados.get(), finisher.get());
-  otp->init(zone.get(), meta.get(), meta_be_otp.get());
-  rados->init();
-  zone->init(sysobj.get(), rados.get(), sync_modules.get(), bucket_sync_sobj.get());
-  zone_utils->init(rados.get(), zone.get());
-  quota->init(zone.get());
-  sync_modules->init(zone.get());
-  sysobj_core->core_init(rados.get(), zone.get());
-  if (have_cache) {
-    sysobj_cache->init(rados.get(), zone.get(), notify.get());
-    sysobj->init(rados.get(), sysobj_cache.get());
-  } else {
-    sysobj->init(rados.get(), sysobj_core.get());
-  }
-  user_rados->init(rados.get(), zone.get(), sysobj.get(), sysobj_cache.get(),
-                   meta.get(), meta_be_sobj.get(), sync_modules.get());
-  role_rados->init(zone.get(), meta.get(), meta_be_sobj.get(), sysobj.get());
-
-  can_shutdown = true;
-
-  int r = finisher->start(y, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to start finisher service (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  if (!raw) {
-    r = notify->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start notify service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-  }
-
-  r = rados->start(y, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to start rados service (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  if (!raw) {
-    r = zone->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start zone service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-    r = datalog_rados->start(dpp, &zone->get_zone(),
-                            zone->get_zone_params(),
-                            rados->get_rados_handle());
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start datalog_rados service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-    r = mdlog->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start mdlog service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-    r = sync_modules->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start sync modules service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-  }
-
-  r = cls->start(y, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to start cls service (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  r = config_key_rados->start(y, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to start config_key service (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  r = zone_utils->start(y, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to start zone_utils service (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  r = quota->start(y, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to start quota service (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  r = sysobj_core->start(y, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj_core service (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  if (have_cache) {
-    r = sysobj_cache->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj_cache service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-  }
-
-  r = sysobj->start(y, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj service (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  if (!raw) {
-    r = meta_be_sobj->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start meta_be_sobj service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-    r = meta->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start meta service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-    r = bucket_sobj->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start bucket service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-    r = bucket_sync_sobj->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start bucket_sync service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-    r = user_rados->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start user_rados service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-    r = otp->start(y, dpp);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to start otp service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-    r = role_rados->start(y, dpp);
-    if (r < 0) {
-      ldout(cct, 0) << "ERROR: failed to start role_rados service (" << cpp_strerror(-r) << dendl;
-      return r;
-    }
-
-  }
-
-  /* cache or core services will be started by sysobj */
-
-  return  0;
-}
-
-void RGWServices_Def::shutdown()
-{
-  if (!can_shutdown) {
-    return;
-  }
-
-  if (has_shutdown) {
-    return;
-  }
-
-  role_rados->shutdown();
-  datalog_rados.reset();
-  user_rados->shutdown();
-  sync_modules->shutdown();
-  otp->shutdown();
-  notify->shutdown();
-  meta_be_otp->shutdown();
-  meta_be_sobj->shutdown();
-  meta->shutdown();
-  mdlog->shutdown();
-  config_key_rados->shutdown();
-  cls->shutdown();
-  bilog_rados->shutdown();
-  bi_rados->shutdown();
-  bucket_sync_sobj->shutdown();
-  bucket_sobj->shutdown();
-  finisher->shutdown();
-
-  sysobj->shutdown();
-  sysobj_core->shutdown();
-  notify->shutdown();
-  if (sysobj_cache) {
-    sysobj_cache->shutdown();
-  }
-  quota->shutdown();
-  zone_utils->shutdown();
-  zone->shutdown();
-  rados->shutdown();
-
-  has_shutdown = true;
-
-}
-
-
-int RGWServices::do_init(CephContext *_cct, bool have_cache, bool raw, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp)
-{
-  cct = _cct;
-
-  int r = _svc.init(cct, have_cache, raw, run_sync, y, dpp);
-  if (r < 0) {
-    return r;
-  }
-
-  finisher = _svc.finisher.get();
-  bi_rados = _svc.bi_rados.get();
-  bi = bi_rados;
-  bilog_rados = _svc.bilog_rados.get();
-  bucket_sobj = _svc.bucket_sobj.get();
-  bucket = bucket_sobj;
-  bucket_sync_sobj = _svc.bucket_sync_sobj.get();
-  bucket_sync = bucket_sync_sobj;
-  cls = _svc.cls.get();
-  config_key_rados = _svc.config_key_rados.get();
-  config_key = config_key_rados;
-  datalog_rados = _svc.datalog_rados.get();
-  mdlog = _svc.mdlog.get();
-  meta = _svc.meta.get();
-  meta_be_sobj = _svc.meta_be_sobj.get();
-  meta_be_otp = _svc.meta_be_otp.get();
-  notify = _svc.notify.get();
-  otp = _svc.otp.get();
-  rados = _svc.rados.get();
-  zone = _svc.zone.get();
-  zone_utils = _svc.zone_utils.get();
-  quota = _svc.quota.get();
-  sync_modules = _svc.sync_modules.get();
-  sysobj = _svc.sysobj.get();
-  cache = _svc.sysobj_cache.get();
-  core = _svc.sysobj_core.get();
-  user = _svc.user_rados.get();
-  role = _svc.role_rados.get();
-
-  return 0;
-}
-
-RGWServiceInstance::~RGWServiceInstance() {}
-
-int RGWServiceInstance::start(optional_yield y, const DoutPrefixProvider *dpp)
-{
-  if (start_state != StateInit) {
-    return 0;
-  }
-
-  start_state = StateStarting;; /* setting started prior to do_start() on purpose so that circular
-                                   references can call start() on each other */
-
-  int r = do_start(y, dpp);
-  if (r < 0) {
-    return r;
-  }
-
-  start_state = StateStarted;
-
-  return 0;
-}
-
-RGWCtlDef::RGWCtlDef() {}
-RGWCtlDef::~RGWCtlDef() {}
-RGWCtlDef::_meta::_meta() {}
-RGWCtlDef::_meta::~_meta() {}
-
-
-int RGWCtlDef::init(RGWServices& svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp)
-{
-  meta.mgr.reset(new RGWMetadataManager(svc.meta));
-
-  meta.user.reset(RGWUserMetaHandlerAllocator::alloc(svc.user));
-
-  auto sync_module = svc.sync_modules->get_sync_module();
-  if (sync_module) {
-    meta.bucket.reset(sync_module->alloc_bucket_meta_handler());
-    meta.bucket_instance.reset(sync_module->alloc_bucket_instance_meta_handler(driver));
-  } else {
-    meta.bucket.reset(RGWBucketMetaHandlerAllocator::alloc());
-    meta.bucket_instance.reset(RGWBucketInstanceMetaHandlerAllocator::alloc(driver));
-  }
-
-  meta.otp.reset(RGWOTPMetaHandlerAllocator::alloc());
-  meta.role = std::make_unique<rgw::sal::RGWRoleMetadataHandler>(driver, svc.role);
-
-  user.reset(new RGWUserCtl(svc.zone, svc.user, (RGWUserMetadataHandler *)meta.user.get()));
-  bucket.reset(new RGWBucketCtl(svc.zone,
-                                svc.bucket,
-                                svc.bucket_sync,
-                                svc.bi, svc.user));
-  otp.reset(new RGWOTPCtl(svc.zone, svc.otp));
-
-  RGWBucketMetadataHandlerBase *bucket_meta_handler = static_cast<RGWBucketMetadataHandlerBase *>(meta.bucket.get());
-  RGWBucketInstanceMetadataHandlerBase *bi_meta_handler = static_cast<RGWBucketInstanceMetadataHandlerBase *>(meta.bucket_instance.get());
-
-  bucket_meta_handler->init(svc.bucket, bucket.get());
-  bi_meta_handler->init(svc.zone, svc.bucket, svc.bi);
-
-  RGWOTPMetadataHandlerBase *otp_handler = static_cast<RGWOTPMetadataHandlerBase *>(meta.otp.get());
-  otp_handler->init(svc.zone, svc.meta_be_otp, svc.otp);
-
-  user->init(bucket.get());
-  bucket->init(user.get(),
-               (RGWBucketMetadataHandler *)bucket_meta_handler,
-               (RGWBucketInstanceMetadataHandler *)bi_meta_handler,
-              svc.datalog_rados,
-               dpp);
-
-  otp->init((RGWOTPMetadataHandler *)meta.otp.get());
-
-  return 0;
-}
-
-int RGWCtl::init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp)
-{
-  svc = _svc;
-  cct = svc->cct;
-
-  int r = _ctl.init(*svc, driver, dpp);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to start init ctls (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  meta.mgr = _ctl.meta.mgr.get();
-  meta.user = _ctl.meta.user.get();
-  meta.bucket = _ctl.meta.bucket.get();
-  meta.bucket_instance = _ctl.meta.bucket_instance.get();
-  meta.otp = _ctl.meta.otp.get();
-  meta.role = _ctl.meta.role.get();
-
-  user = _ctl.user.get();
-  bucket = _ctl.bucket.get();
-  otp = _ctl.otp.get();
-
-  r = meta.user->attach(meta.mgr);
-  if (r < 0) {
-    ldout(cct, 0) << "ERROR: failed to start init meta.user ctl (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  r = meta.bucket->attach(meta.mgr);
-  if (r < 0) {
-    ldout(cct, 0) << "ERROR: failed to start init meta.bucket ctl (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  r = meta.bucket_instance->attach(meta.mgr);
-  if (r < 0) {
-    ldout(cct, 0) << "ERROR: failed to start init meta.bucket_instance ctl (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  r = meta.otp->attach(meta.mgr);
-  if (r < 0) {
-    ldout(cct, 0) << "ERROR: failed to start init otp ctl (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  r = meta.role->attach(meta.mgr);
-  if (r < 0) {
-    ldout(cct, 0) << "ERROR: failed to start init otp ctl (" << cpp_strerror(-r) << dendl;
-    return r;
-  }
-  return 0;
-}
-
diff --git a/src/rgw/store/rados/rgw_service.h b/src/rgw/store/rados/rgw_service.h
deleted file mode 100644 (file)
index dc49913..0000000
+++ /dev/null
@@ -1,219 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_SERVICE_H
-#define CEPH_RGW_SERVICE_H
-
-
-#include <string>
-#include <vector>
-#include <memory>
-
-#include "common/async/yield_context.h"
-
-#include "rgw_common.h"
-
-struct RGWServices_Def;
-
-class RGWServiceInstance
-{
-  friend struct RGWServices_Def;
-
-protected:
-  CephContext *cct;
-
-  enum StartState {
-    StateInit = 0,
-    StateStarting = 1,
-    StateStarted = 2,
-  } start_state{StateInit};
-
-  virtual void shutdown() {}
-  virtual int do_start(optional_yield, const DoutPrefixProvider *dpp) {
-    return 0;
-  }
-public:
-  RGWServiceInstance(CephContext *_cct) : cct(_cct) {}
-  virtual ~RGWServiceInstance();
-
-  int start(optional_yield y, const DoutPrefixProvider *dpp);
-  bool is_started() {
-    return (start_state == StateStarted);
-  }
-
-  CephContext *ctx() {
-    return cct;
-  }
-};
-
-class RGWSI_Finisher;
-class RGWSI_Bucket;
-class RGWSI_Bucket_SObj;
-class RGWSI_Bucket_Sync;
-class RGWSI_Bucket_Sync_SObj;
-class RGWSI_BucketIndex;
-class RGWSI_BucketIndex_RADOS;
-class RGWSI_BILog_RADOS;
-class RGWSI_Cls;
-class RGWSI_ConfigKey;
-class RGWSI_ConfigKey_RADOS;
-class RGWSI_MDLog;
-class RGWSI_Meta;
-class RGWSI_MetaBackend;
-class RGWSI_MetaBackend_SObj;
-class RGWSI_MetaBackend_OTP;
-class RGWSI_Notify;
-class RGWSI_OTP;
-class RGWSI_RADOS;
-class RGWSI_Zone;
-class RGWSI_ZoneUtils;
-class RGWSI_Quota;
-class RGWSI_SyncModules;
-class RGWSI_SysObj;
-class RGWSI_SysObj_Core;
-class RGWSI_SysObj_Cache;
-class RGWSI_User;
-class RGWSI_User_RADOS;
-class RGWDataChangesLog;
-class RGWSI_Role_RADOS;
-
-struct RGWServices_Def
-{
-  bool can_shutdown{false};
-  bool has_shutdown{false};
-
-  std::unique_ptr<RGWSI_Finisher> finisher;
-  std::unique_ptr<RGWSI_Bucket_SObj> bucket_sobj;
-  std::unique_ptr<RGWSI_Bucket_Sync_SObj> bucket_sync_sobj;
-  std::unique_ptr<RGWSI_BucketIndex_RADOS> bi_rados;
-  std::unique_ptr<RGWSI_BILog_RADOS> bilog_rados;
-  std::unique_ptr<RGWSI_Cls> cls;
-  std::unique_ptr<RGWSI_ConfigKey_RADOS> config_key_rados;
-  std::unique_ptr<RGWSI_MDLog> mdlog;
-  std::unique_ptr<RGWSI_Meta> meta;
-  std::unique_ptr<RGWSI_MetaBackend_SObj> meta_be_sobj;
-  std::unique_ptr<RGWSI_MetaBackend_OTP> meta_be_otp;
-  std::unique_ptr<RGWSI_Notify> notify;
-  std::unique_ptr<RGWSI_OTP> otp;
-  std::unique_ptr<RGWSI_RADOS> rados;
-  std::unique_ptr<RGWSI_Zone> zone;
-  std::unique_ptr<RGWSI_ZoneUtils> zone_utils;
-  std::unique_ptr<RGWSI_Quota> quota;
-  std::unique_ptr<RGWSI_SyncModules> sync_modules;
-  std::unique_ptr<RGWSI_SysObj> sysobj;
-  std::unique_ptr<RGWSI_SysObj_Core> sysobj_core;
-  std::unique_ptr<RGWSI_SysObj_Cache> sysobj_cache;
-  std::unique_ptr<RGWSI_User_RADOS> user_rados;
-  std::unique_ptr<RGWDataChangesLog> datalog_rados;
-  std::unique_ptr<RGWSI_Role_RADOS> role_rados;
-
-  RGWServices_Def();
-  ~RGWServices_Def();
-
-  int init(CephContext *cct, bool have_cache, bool raw_storage, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp);
-  void shutdown();
-};
-
-
-struct RGWServices
-{
-  RGWServices_Def _svc;
-
-  CephContext *cct;
-
-  RGWSI_Finisher *finisher{nullptr};
-  RGWSI_Bucket *bucket{nullptr};
-  RGWSI_Bucket_SObj *bucket_sobj{nullptr};
-  RGWSI_Bucket_Sync *bucket_sync{nullptr};
-  RGWSI_Bucket_Sync_SObj *bucket_sync_sobj{nullptr};
-  RGWSI_BucketIndex *bi{nullptr};
-  RGWSI_BucketIndex_RADOS *bi_rados{nullptr};
-  RGWSI_BILog_RADOS *bilog_rados{nullptr};
-  RGWSI_Cls *cls{nullptr};
-  RGWSI_ConfigKey_RADOS *config_key_rados{nullptr};
-  RGWSI_ConfigKey *config_key{nullptr};
-  RGWDataChangesLog *datalog_rados{nullptr};
-  RGWSI_MDLog *mdlog{nullptr};
-  RGWSI_Meta *meta{nullptr};
-  RGWSI_MetaBackend *meta_be_sobj{nullptr};
-  RGWSI_MetaBackend *meta_be_otp{nullptr};
-  RGWSI_Notify *notify{nullptr};
-  RGWSI_OTP *otp{nullptr};
-  RGWSI_RADOS *rados{nullptr};
-  RGWSI_Zone *zone{nullptr};
-  RGWSI_ZoneUtils *zone_utils{nullptr};
-  RGWSI_Quota *quota{nullptr};
-  RGWSI_SyncModules *sync_modules{nullptr};
-  RGWSI_SysObj *sysobj{nullptr};
-  RGWSI_SysObj_Cache *cache{nullptr};
-  RGWSI_SysObj_Core *core{nullptr};
-  RGWSI_User *user{nullptr};
-  RGWSI_Role_RADOS *role{nullptr};
-
-  int do_init(CephContext *cct, bool have_cache, bool raw_storage, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp);
-
-  int init(CephContext *cct, bool have_cache, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp) {
-    return do_init(cct, have_cache, false, run_sync, y, dpp);
-  }
-
-  int init_raw(CephContext *cct, bool have_cache, optional_yield y, const DoutPrefixProvider *dpp) {
-    return do_init(cct, have_cache, true, false, y, dpp);
-  }
-  void shutdown() {
-    _svc.shutdown();
-  }
-};
-
-class RGWMetadataManager;
-class RGWMetadataHandler;
-class RGWUserCtl;
-class RGWBucketCtl;
-class RGWOTPCtl;
-
-struct RGWCtlDef {
-  struct _meta {
-    std::unique_ptr<RGWMetadataManager> mgr;
-    std::unique_ptr<RGWMetadataHandler> bucket;
-    std::unique_ptr<RGWMetadataHandler> bucket_instance;
-    std::unique_ptr<RGWMetadataHandler> user;
-    std::unique_ptr<RGWMetadataHandler> otp;
-    std::unique_ptr<RGWMetadataHandler> role;
-
-    _meta();
-    ~_meta();
-  } meta;
-
-  std::unique_ptr<RGWUserCtl> user;
-  std::unique_ptr<RGWBucketCtl> bucket;
-  std::unique_ptr<RGWOTPCtl> otp;
-
-  RGWCtlDef();
-  ~RGWCtlDef();
-
-  int init(RGWServices& svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp);
-};
-
-struct RGWCtl {
-  CephContext *cct{nullptr};
-  RGWServices *svc{nullptr};
-
-  RGWCtlDef _ctl;
-
-  struct _meta {
-    RGWMetadataManager *mgr{nullptr};
-
-    RGWMetadataHandler *bucket{nullptr};
-    RGWMetadataHandler *bucket_instance{nullptr};
-    RGWMetadataHandler *user{nullptr};
-    RGWMetadataHandler *otp{nullptr};
-    RGWMetadataHandler *role{nullptr};
-  } meta;
-
-  RGWUserCtl *user{nullptr};
-  RGWBucketCtl *bucket{nullptr};
-  RGWOTPCtl *otp{nullptr};
-
-  int init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp);
-};
-
-#endif
diff --git a/src/rgw/store/rados/rgw_sync.cc b/src/rgw/store/rados/rgw_sync.cc
deleted file mode 100644 (file)
index 065d209..0000000
+++ /dev/null
@@ -1,2567 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_sync.h"
-#include "rgw_rest_conn.h"
-#include "rgw_cr_rados.h"
-#include "rgw_cr_rest.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_mdlog.h"
-#include "services/svc_cls.h"
-
-#include <boost/asio/yield.hpp>
-
-#define dout_subsys ceph_subsys_rgw
-
-#undef dout_prefix
-#define dout_prefix (*_dout << "meta sync: ")
-
-using namespace std;
-
-static string mdlog_sync_status_oid = "mdlog.sync-status";
-static string mdlog_sync_status_shard_prefix = "mdlog.sync-status.shard";
-static string mdlog_sync_full_sync_index_prefix = "meta.full-sync.index";
-
-RGWContinuousLeaseCR::~RGWContinuousLeaseCR() {}
-
-RGWSyncErrorLogger::RGWSyncErrorLogger(rgw::sal::RadosStore* _store, const string &oid_prefix, int _num_shards) : store(_store), num_shards(_num_shards) {
-  for (int i = 0; i < num_shards; i++) {
-    oids.push_back(get_shard_oid(oid_prefix, i));
-  }
-}
-string RGWSyncErrorLogger::get_shard_oid(const string& oid_prefix, int shard_id) {
-  char buf[oid_prefix.size() + 16];
-  snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), shard_id);
-  return string(buf);
-}
-
-RGWCoroutine *RGWSyncErrorLogger::log_error_cr(const DoutPrefixProvider *dpp, const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message) {
-  cls_log_entry entry;
-
-  rgw_sync_error_info info(source_zone, error_code, message);
-  bufferlist bl;
-  encode(info, bl);
-  store->svc()->cls->timelog.prepare_entry(entry, real_clock::now(), section, name, bl);
-
-  uint32_t shard_id = ++counter % num_shards;
-
-
-  return new RGWRadosTimelogAddCR(dpp, store, oids[shard_id], entry);
-}
-
-void RGWSyncBackoff::update_wait_time()
-{
-  if (cur_wait == 0) {
-    cur_wait = 1;
-  } else {
-    cur_wait = (cur_wait << 1);
-  }
-  if (cur_wait >= max_secs) {
-    cur_wait = max_secs;
-  }
-}
-
-void RGWSyncBackoff::backoff_sleep()
-{
-  update_wait_time();
-  sleep(cur_wait);
-}
-
-void RGWSyncBackoff::backoff(RGWCoroutine *op)
-{
-  update_wait_time();
-  op->wait(utime_t(cur_wait, 0));
-}
-
-int RGWBackoffControlCR::operate(const DoutPrefixProvider *dpp) {
-  reenter(this) {
-    // retry the operation until it succeeds
-    while (true) {
-      yield {
-       std::lock_guard l{lock};
-        cr = alloc_cr();
-        cr->get();
-        call(cr);
-      }
-      {
-       std::lock_guard l{lock};
-        cr->put();
-        cr = NULL;
-      }
-      if (retcode >= 0) {
-        break;
-      }
-      if (retcode != -EBUSY && retcode != -EAGAIN) {
-        ldout(cct, 0) << "ERROR: RGWBackoffControlCR called coroutine returned " << retcode << dendl;
-        if (exit_on_error) {
-          return set_cr_error(retcode);
-        }
-      }
-      if (reset_backoff) {
-        backoff.reset();
-      }
-      yield backoff.backoff(this);
-    }
-
-    // run an optional finisher
-    yield call(alloc_finisher_cr());
-    if (retcode < 0) {
-      ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl;
-      return set_cr_error(retcode);
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
-
-void rgw_mdlog_info::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("num_objects", num_shards, obj);
-  JSONDecoder::decode_json("period", period, obj);
-  JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
-}
-
-void rgw_mdlog_entry::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("id", id, obj);
-  JSONDecoder::decode_json("section", section, obj);
-  JSONDecoder::decode_json("name", name, obj);
-  utime_t ut;
-  JSONDecoder::decode_json("timestamp", ut, obj);
-  timestamp = ut.to_real_time();
-  JSONDecoder::decode_json("data", log_data, obj);
-}
-
-void rgw_mdlog_shard_data::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("marker", marker, obj);
-  JSONDecoder::decode_json("truncated", truncated, obj);
-  JSONDecoder::decode_json("entries", entries, obj);
-};
-
-int RGWShardCollectCR::operate(const DoutPrefixProvider *dpp) {
-  reenter(this) {
-    while (spawn_next()) {
-      current_running++;
-
-      if (current_running >= max_concurrent) {
-        int child_ret;
-        yield wait_for_child();
-        if (collect_next(&child_ret)) {
-          current_running--;
-          child_ret = handle_result(child_ret);
-          if (child_ret < 0) {
-            status = child_ret;
-          }
-        }
-      }
-    }
-    while (current_running > 0) {
-      int child_ret;
-      yield wait_for_child();
-      if (collect_next(&child_ret)) {
-        current_running--;
-        child_ret = handle_result(child_ret);
-        if (child_ret < 0) {
-          status = child_ret;
-        }
-      }
-    }
-    if (status < 0) {
-      return set_cr_error(status);
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
-
-class RGWReadRemoteMDLogInfoCR : public RGWShardCollectCR {
-  RGWMetaSyncEnv *sync_env;
-
-  const std::string& period;
-  int num_shards;
-  map<int, RGWMetadataLogInfo> *mdlog_info;
-
-  int shard_id;
-#define READ_MDLOG_MAX_CONCURRENT 10
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to fetch mdlog status: " << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
-public:
-  RGWReadRemoteMDLogInfoCR(RGWMetaSyncEnv *_sync_env,
-                     const std::string& period, int _num_shards,
-                     map<int, RGWMetadataLogInfo> *_mdlog_info) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
-                                                                 sync_env(_sync_env),
-                                                                 period(period), num_shards(_num_shards),
-                                                                 mdlog_info(_mdlog_info), shard_id(0) {}
-  bool spawn_next() override;
-};
-
-class RGWListRemoteMDLogCR : public RGWShardCollectCR {
-  RGWMetaSyncEnv *sync_env;
-
-  const std::string& period;
-  map<int, string> shards;
-  int max_entries_per_shard;
-  map<int, rgw_mdlog_shard_data> *result;
-
-  map<int, string>::iterator iter;
-#define READ_MDLOG_MAX_CONCURRENT 10
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to list remote mdlog shard: " << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
-public:
-  RGWListRemoteMDLogCR(RGWMetaSyncEnv *_sync_env,
-                     const std::string& period, map<int, string>& _shards,
-                     int _max_entries_per_shard,
-                     map<int, rgw_mdlog_shard_data> *_result) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
-                                                                 sync_env(_sync_env), period(period),
-                                                                 max_entries_per_shard(_max_entries_per_shard),
-                                                                 result(_result) {
-    shards.swap(_shards);
-    iter = shards.begin();
-  }
-  bool spawn_next() override;
-};
-
-int RGWRemoteMetaLog::read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info)
-{
-  rgw_http_param_pair pairs[] = { { "type", "metadata" },
-                                  { NULL, NULL } };
-
-  int ret = conn->get_json_resource(dpp, "/admin/log", pairs, null_yield, *log_info);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog info" << dendl;
-    return ret;
-  }
-
-  ldpp_dout(dpp, 20) << "remote mdlog, num_shards=" << log_info->num_shards << dendl;
-
-  return 0;
-}
-
-int RGWRemoteMetaLog::read_master_log_shards_info(const DoutPrefixProvider *dpp, const string &master_period, map<int, RGWMetadataLogInfo> *shards_info)
-{
-  if (store->svc()->zone->is_meta_master()) {
-    return 0;
-  }
-
-  rgw_mdlog_info log_info;
-  int ret = read_log_info(dpp, &log_info);
-  if (ret < 0) {
-    return ret;
-  }
-
-  return run(dpp, new RGWReadRemoteMDLogInfoCR(&sync_env, master_period, log_info.num_shards, shards_info));
-}
-
-int RGWRemoteMetaLog::read_master_log_shards_next(const DoutPrefixProvider *dpp, const string& period, map<int, string> shard_markers, map<int, rgw_mdlog_shard_data> *result)
-{
-  if (store->svc()->zone->is_meta_master()) {
-    return 0;
-  }
-
-  return run(dpp, new RGWListRemoteMDLogCR(&sync_env, period, shard_markers, 1, result));
-}
-
-int RGWRemoteMetaLog::init()
-{
-  conn = store->svc()->zone->get_master_conn();
-
-  int ret = http_manager.start();
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
-    return ret;
-  }
-
-  error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);
-
-  init_sync_env(&sync_env);
-
-  tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "meta");
-
-  return 0;
-}
-
-#define CLONE_MAX_ENTRIES 100
-
-int RGWMetaSyncStatusManager::init(const DoutPrefixProvider *dpp)
-{
-  if (store->svc()->zone->is_meta_master()) {
-    return 0;
-  }
-
-  if (!store->svc()->zone->get_master_conn()) {
-    ldpp_dout(dpp, -1) << "no REST connection to master zone" << dendl;
-    return -EIO;
-  }
-
-  int r = rgw_init_ioctx(dpp, store->getRados()->get_rados_handle(), store->svc()->zone->get_zone_params().log_pool, ioctx, true);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to open log pool (" << store->svc()->zone->get_zone_params().log_pool << " ret=" << r << dendl;
-    return r;
-  }
-
-  r = master_log.init();
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to init remote log, r=" << r << dendl;
-    return r;
-  }
-
-  RGWMetaSyncEnv& sync_env = master_log.get_sync_env();
-
-  rgw_meta_sync_status sync_status;
-  r = read_sync_status(dpp, &sync_status);
-  if (r < 0 && r != -ENOENT) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to read sync status, r=" << r << dendl;
-    return r;
-  }
-
-  int num_shards = sync_status.sync_info.num_shards;
-
-  for (int i = 0; i < num_shards; i++) {
-    shard_objs[i] = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env.shard_obj_name(i));
-  }
-
-  std::unique_lock wl{ts_to_shard_lock};
-  for (int i = 0; i < num_shards; i++) {
-    clone_markers.push_back(string());
-    utime_shard ut;
-    ut.shard_id = i;
-    ts_to_shard[ut] = i;
-  }
-
-  return 0;
-}
-
-void RGWMetaSyncEnv::init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _store, RGWRESTConn *_conn,
-                          RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
-                          RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer) {
-  dpp = _dpp;
-  cct = _cct;
-  store = _store;
-  conn = _conn;
-  async_rados = _async_rados;
-  http_manager = _http_manager;
-  error_logger = _error_logger;
-  sync_tracer = _sync_tracer;
-}
-
-string RGWMetaSyncEnv::status_oid()
-{
-  return mdlog_sync_status_oid;
-}
-
-string RGWMetaSyncEnv::shard_obj_name(int shard_id)
-{
-  char buf[mdlog_sync_status_shard_prefix.size() + 16];
-  snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_status_shard_prefix.c_str(), shard_id);
-
-  return string(buf);
-}
-
-class RGWAsyncReadMDLogEntries : public RGWAsyncRadosRequest {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* store;
-  RGWMetadataLog *mdlog;
-  int shard_id;
-  int max_entries;
-
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override {
-    real_time from_time;
-    real_time end_time;
-
-    void *handle;
-
-    mdlog->init_list_entries(shard_id, from_time, end_time, marker, &handle);
-
-    int ret = mdlog->list_entries(dpp, handle, max_entries, entries, &marker, &truncated);
-
-    mdlog->complete_list_entries(handle);
-
-    return ret;
-  }
-public:
-  string marker;
-  list<cls_log_entry> entries;
-  bool truncated;
-
-  RGWAsyncReadMDLogEntries(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
-                           RGWMetadataLog* mdlog, int _shard_id,
-                           std::string _marker, int _max_entries)
-    : RGWAsyncRadosRequest(caller, cn), dpp(dpp), store(_store), mdlog(mdlog),
-      shard_id(_shard_id), max_entries(_max_entries), marker(std::move(_marker)) {}
-};
-
-class RGWReadMDLogEntriesCR : public RGWSimpleCoroutine {
-  RGWMetaSyncEnv *sync_env;
-  RGWMetadataLog *const mdlog;
-  int shard_id;
-  string marker;
-  string *pmarker;
-  int max_entries;
-  list<cls_log_entry> *entries;
-  bool *truncated;
-
-  RGWAsyncReadMDLogEntries *req{nullptr};
-
-public:
-  RGWReadMDLogEntriesCR(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
-                        int _shard_id, string*_marker, int _max_entries,
-                        list<cls_log_entry> *_entries, bool *_truncated)
-    : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
-      shard_id(_shard_id), pmarker(_marker), max_entries(_max_entries),
-      entries(_entries), truncated(_truncated) {}
-
-  ~RGWReadMDLogEntriesCR() override {
-    if (req) {
-      req->finish();
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    marker = *pmarker;
-    req = new RGWAsyncReadMDLogEntries(dpp, this, stack->create_completion_notifier(),
-                                       sync_env->store, mdlog, shard_id, marker,
-                                       max_entries);
-    sync_env->async_rados->queue(req);
-    return 0;
-  }
-
-  int request_complete() override {
-    *pmarker = std::move(req->marker);
-    *entries = std::move(req->entries);
-    *truncated = req->truncated;
-    return req->get_ret_status();
-  }
-};
-
-
-class RGWReadRemoteMDLogShardInfoCR : public RGWCoroutine {
-  RGWMetaSyncEnv *env;
-  RGWRESTReadResource *http_op;
-
-  const std::string& period;
-  int shard_id;
-  RGWMetadataLogInfo *shard_info;
-
-public:
-  RGWReadRemoteMDLogShardInfoCR(RGWMetaSyncEnv *env, const std::string& period,
-                                int _shard_id, RGWMetadataLogInfo *_shard_info)
-    : RGWCoroutine(env->store->ctx()), env(env), http_op(NULL),
-      period(period), shard_id(_shard_id), shard_info(_shard_info) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    auto store = env->store;
-    RGWRESTConn *conn = store->svc()->zone->get_master_conn();
-    reenter(this) {
-      yield {
-       char buf[16];
-       snprintf(buf, sizeof(buf), "%d", shard_id);
-        rgw_http_param_pair pairs[] = { { "type" , "metadata" },
-                                       { "id", buf },
-                                       { "period", period.c_str() },
-                                       { "info" , NULL },
-                                       { NULL, NULL } };
-
-        string p = "/admin/log/";
-
-        http_op = new RGWRESTReadResource(conn, p, pairs, NULL,
-                                          env->http_manager);
-
-        init_new_io(http_op);
-
-        int ret = http_op->aio_read(dpp);
-        if (ret < 0) {
-          ldpp_dout(env->dpp, 0) << "ERROR: failed to read from " << p << dendl;
-          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
-          http_op->put();
-          return set_cr_error(ret);
-        }
-
-        return io_block(0);
-      }
-      yield {
-        int ret = http_op->wait(shard_info, null_yield);
-        http_op->put();
-        if (ret < 0) {
-          return set_cr_error(ret);
-        }
-        return set_cr_done();
-      }
-    }
-    return 0;
-  }
-};
-
-RGWCoroutine* create_read_remote_mdlog_shard_info_cr(RGWMetaSyncEnv *env,
-                                                     const std::string& period,
-                                                     int shard_id,
-                                                     RGWMetadataLogInfo* info)
-{
-  return new RGWReadRemoteMDLogShardInfoCR(env, period, shard_id, info);
-}
-
-class RGWListRemoteMDLogShardCR : public RGWSimpleCoroutine {
-  RGWMetaSyncEnv *sync_env;
-  RGWRESTReadResource *http_op;
-
-  const std::string& period;
-  int shard_id;
-  string marker;
-  uint32_t max_entries;
-  rgw_mdlog_shard_data *result;
-
-public:
-  RGWListRemoteMDLogShardCR(RGWMetaSyncEnv *env, const std::string& period,
-                            int _shard_id, const string& _marker, uint32_t _max_entries,
-                            rgw_mdlog_shard_data *_result)
-    : RGWSimpleCoroutine(env->store->ctx()), sync_env(env), http_op(NULL),
-      period(period), shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    RGWRESTConn *conn = sync_env->conn;
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%d", shard_id);
-
-    char max_entries_buf[32];
-    snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
-
-    const char *marker_key = (marker.empty() ? "" : "marker");
-
-    rgw_http_param_pair pairs[] = { { "type", "metadata" },
-      { "id", buf },
-      { "period", period.c_str() },
-      { "max-entries", max_entries_buf },
-      { marker_key, marker.c_str() },
-      { NULL, NULL } };
-
-    string p = "/admin/log/";
-
-    http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
-    init_new_io(http_op);
-
-    int ret = http_op->aio_read(dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
-      log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
-      http_op->put();
-      return ret;
-    }
-
-    return 0;
-  }
-
-  int request_complete() override {
-    int ret = http_op->wait(result, null_yield);
-    http_op->put();
-    if (ret < 0 && ret != -ENOENT) {
-      ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote mdlog shard, ret=" << ret << dendl;
-      return ret;
-    }
-    return 0;
-  }
-};
-
-RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env,
-                                                const std::string& period,
-                                                int shard_id,
-                                                const std::string& marker,
-                                                uint32_t max_entries,
-                                                rgw_mdlog_shard_data *result)
-{
-  return new RGWListRemoteMDLogShardCR(env, period, shard_id, marker,
-                                       max_entries, result);
-}
-
-bool RGWReadRemoteMDLogInfoCR::spawn_next() {
-  if (shard_id >= num_shards) {
-    return false;
-  }
-  spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, period, shard_id, &(*mdlog_info)[shard_id]), false);
-  shard_id++;
-  return true;
-}
-
-bool RGWListRemoteMDLogCR::spawn_next() {
-  if (iter == shards.end()) {
-    return false;
-  }
-
-  spawn(new RGWListRemoteMDLogShardCR(sync_env, period, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false);
-  ++iter;
-  return true;
-}
-
-class RGWInitSyncStatusCoroutine : public RGWCoroutine {
-  RGWMetaSyncEnv *sync_env;
-
-  rgw_meta_sync_info status;
-  vector<RGWMetadataLogInfo> shards_info;
-  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
-  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
-public:
-  RGWInitSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
-                             const rgw_meta_sync_info &status)
-    : RGWCoroutine(_sync_env->store->ctx()), sync_env(_sync_env),
-      status(status), shards_info(status.num_shards),
-      lease_cr(nullptr), lease_stack(nullptr) {}
-
-  ~RGWInitSyncStatusCoroutine() override {
-    if (lease_cr) {
-      lease_cr->abort();
-    }
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    int ret;
-    reenter(this) {
-      yield {
-        set_status("acquiring sync lock");
-       uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
-        string lock_name = "sync_lock";
-       rgw::sal::RadosStore* store = sync_env->store;
-        lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
-                                                rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
-                                                lock_name, lock_duration, this));
-        lease_stack.reset(spawn(lease_cr.get(), false));
-      }
-      while (!lease_cr->is_locked()) {
-        if (lease_cr->is_done()) {
-          ldpp_dout(dpp, 5) << "failed to take lease" << dendl;
-          set_status("lease lock failed, early abort");
-          return set_cr_error(lease_cr->get_ret_status());
-        }
-        set_sleeping(true);
-        yield;
-      }
-      yield {
-        set_status("writing sync status");
-       rgw::sal::RadosStore* store = sync_env->store;
-        call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, sync_env->async_rados, store->svc()->sysobj,
-                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
-                                                           status));
-      }
-
-      if (retcode < 0) {
-        set_status("failed to write sync status");
-        ldpp_dout(dpp, 0) << "ERROR: failed to write sync status, retcode=" << retcode << dendl;
-        yield lease_cr->go_down();
-        return set_cr_error(retcode);
-      }
-      /* fetch current position in logs */
-      set_status("fetching remote log position");
-      yield {
-        for (int i = 0; i < (int)status.num_shards; i++) {
-          spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, status.period, i,
-                                                  &shards_info[i]), false);
-       }
-      }
-
-      drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
-
-      yield {
-        set_status("updating sync status");
-        for (int i = 0; i < (int)status.num_shards; i++) {
-         rgw_meta_sync_marker marker;
-          RGWMetadataLogInfo& info = shards_info[i];
-         marker.next_step_marker = info.marker;
-         marker.timestamp = info.last_update;
-         rgw::sal::RadosStore* store = sync_env->store;
-          spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(dpp,
-                                                                sync_env->async_rados,
-                                                                store->svc()->sysobj,
-                                                                rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->shard_obj_name(i)),
-                                                                marker), true);
-        }
-      }
-      yield {
-        set_status("changing sync state: build full sync maps");
-       status.state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
-       rgw::sal::RadosStore* store = sync_env->store;
-        call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, sync_env->async_rados, store->svc()->sysobj,
-                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
-                                                           status));
-      }
-      set_status("drop lock lease");
-      yield lease_cr->go_down();
-      while (collect(&ret, NULL)) {
-       if (ret < 0) {
-         return set_cr_error(ret);
-       }
-        yield;
-      }
-      drain_all();
-      return set_cr_done();
-    }
-    return 0;
-  }
-};
-
-class RGWReadSyncStatusMarkersCR : public RGWShardCollectCR {
-  static constexpr int MAX_CONCURRENT_SHARDS = 16;
-
-  RGWMetaSyncEnv *env;
-  const int num_shards;
-  int shard_id{0};
-  map<uint32_t, rgw_meta_sync_marker>& markers;
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to read metadata sync markers: "
-          << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  RGWReadSyncStatusMarkersCR(RGWMetaSyncEnv *env, int num_shards,
-                             map<uint32_t, rgw_meta_sync_marker>& markers)
-    : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS),
-      env(env), num_shards(num_shards), markers(markers)
-  {}
-  bool spawn_next() override;
-};
-
-bool RGWReadSyncStatusMarkersCR::spawn_next()
-{
-  if (shard_id >= num_shards) {
-    return false;
-  }
-  using CR = RGWSimpleRadosReadCR<rgw_meta_sync_marker>;
-  rgw_raw_obj obj{env->store->svc()->zone->get_zone_params().log_pool,
-                  env->shard_obj_name(shard_id)};
-  spawn(new CR(env->dpp, env->async_rados, env->store->svc()->sysobj, obj, &markers[shard_id]), false);
-  shard_id++;
-  return true;
-}
-
-class RGWReadSyncStatusCoroutine : public RGWCoroutine {
-  RGWMetaSyncEnv *sync_env;
-  rgw_meta_sync_status *sync_status;
-
-public:
-  RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
-                             rgw_meta_sync_status *_status)
-    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status)
-  {}
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int RGWReadSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    // read sync info
-    using ReadInfoCR = RGWSimpleRadosReadCR<rgw_meta_sync_info>;
-    yield {
-      bool empty_on_enoent = false; // fail on ENOENT
-      rgw_raw_obj obj{sync_env->store->svc()->zone->get_zone_params().log_pool,
-                      sync_env->status_oid()};
-      call(new ReadInfoCR(dpp, sync_env->async_rados, sync_env->store->svc()->sysobj, obj,
-                          &sync_status->sync_info, empty_on_enoent));
-    }
-    if (retcode < 0) {
-      ldpp_dout(dpp, 4) << "failed to read sync status info with "
-          << cpp_strerror(retcode) << dendl;
-      return set_cr_error(retcode);
-    }
-    // read shard markers
-    using ReadMarkersCR = RGWReadSyncStatusMarkersCR;
-    yield call(new ReadMarkersCR(sync_env, sync_status->sync_info.num_shards,
-                                 sync_status->sync_markers));
-    if (retcode < 0) {
-      ldpp_dout(dpp, 4) << "failed to read sync status markers with "
-          << cpp_strerror(retcode) << dendl;
-      return set_cr_error(retcode);
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
-
-class RGWFetchAllMetaCR : public RGWCoroutine {
-  RGWMetaSyncEnv *sync_env;
-
-  int num_shards;
-
-
-  int ret_status;
-
-  list<string> sections;
-  list<string>::iterator sections_iter;
-
-  struct meta_list_result {
-    list<string> keys;
-    string marker;
-    uint64_t count{0};
-    bool truncated{false};
-
-    void decode_json(JSONObj *obj) {
-      JSONDecoder::decode_json("keys", keys, obj);
-      JSONDecoder::decode_json("marker", marker, obj);
-      JSONDecoder::decode_json("count", count, obj);
-      JSONDecoder::decode_json("truncated", truncated, obj);
-    }
-  } result;
-  list<string>::iterator iter;
-
-  std::unique_ptr<RGWShardedOmapCRManager> entries_index;
-
-  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
-  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
-  bool lost_lock;
-  bool failed;
-
-  string marker;
-
-  map<uint32_t, rgw_meta_sync_marker>& markers;
-
-  RGWSyncTraceNodeRef tn;
-
-public:
-  RGWFetchAllMetaCR(RGWMetaSyncEnv *_sync_env, int _num_shards,
-                    map<uint32_t, rgw_meta_sync_marker>& _markers,
-                    RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
-                                                     num_shards(_num_shards),
-                                                     ret_status(0), lease_cr(nullptr), lease_stack(nullptr),
-                                                      lost_lock(false), failed(false), markers(_markers) {
-    tn = sync_env->sync_tracer->add_node(_tn_parent, "fetch_all_meta");
-  }
-
-  ~RGWFetchAllMetaCR() override {
-  }
-
-  void append_section_from_set(set<string>& all_sections, const string& name) {
-    set<string>::iterator iter = all_sections.find(name);
-    if (iter != all_sections.end()) {
-      sections.emplace_back(std::move(*iter));
-      all_sections.erase(iter);
-    }
-  }
-  /*
-   * meta sync should go in the following order: user, bucket.instance, bucket
-   * then whatever other sections exist (if any)
-   */
-  void rearrange_sections() {
-    set<string> all_sections;
-    std::move(sections.begin(), sections.end(),
-              std::inserter(all_sections, all_sections.end()));
-    sections.clear();
-
-    append_section_from_set(all_sections, "user");
-    append_section_from_set(all_sections, "bucket.instance");
-    append_section_from_set(all_sections, "bucket");
-    append_section_from_set(all_sections, "roles");
-
-    std::move(all_sections.begin(), all_sections.end(),
-              std::back_inserter(sections));
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    RGWRESTConn *conn = sync_env->conn;
-
-    reenter(this) {
-      yield {
-        set_status(string("acquiring lock (") + sync_env->status_oid() + ")");
-       uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
-        string lock_name = "sync_lock";
-        lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados,
-                                                sync_env->store,
-                                                rgw_raw_obj(sync_env->store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
-                                                lock_name, lock_duration, this));
-        lease_stack.reset(spawn(lease_cr.get(), false));
-      }
-      while (!lease_cr->is_locked()) {
-        if (lease_cr->is_done()) {
-          ldpp_dout(dpp, 5) << "failed to take lease" << dendl;
-          set_status("lease lock failed, early abort");
-          return set_cr_error(lease_cr->get_ret_status());
-        }
-        set_sleeping(true);
-        yield;
-      }
-      entries_index.reset(new RGWShardedOmapCRManager(sync_env->async_rados, sync_env->store, this, num_shards,
-                                                      sync_env->store->svc()->zone->get_zone_params().log_pool,
-                                                      mdlog_sync_full_sync_index_prefix));
-      yield {
-       call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
-                                      "/admin/metadata", NULL, &sections));
-      }
-      if (get_ret_status() < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to fetch metadata sections" << dendl;
-        yield entries_index->finish();
-        yield lease_cr->go_down();
-        drain_all();
-       return set_cr_error(get_ret_status());
-      }
-      rearrange_sections();
-      sections_iter = sections.begin();
-      for (; sections_iter != sections.end(); ++sections_iter) {
-        do {
-          yield {
-#define META_FULL_SYNC_CHUNK_SIZE "1000"
-            string entrypoint = string("/admin/metadata/") + *sections_iter;
-            rgw_http_param_pair pairs[] = { { "max-entries", META_FULL_SYNC_CHUNK_SIZE },
-              { "marker", result.marker.c_str() },
-              { NULL, NULL } };
-            result.keys.clear();
-            call(new RGWReadRESTResourceCR<meta_list_result >(cct, conn, sync_env->http_manager,
-                                                              entrypoint, pairs, &result));
-          }
-          ret_status = get_ret_status();
-          if (ret_status == -ENOENT) {
-            set_retcode(0); /* reset coroutine status so that we don't return it */
-            ret_status = 0;
-          }
-          if (ret_status < 0) {
-            tn->log(0, SSTR("ERROR: failed to fetch metadata section: " << *sections_iter));
-            yield entries_index->finish();
-            yield lease_cr->go_down();
-            drain_all();
-            return set_cr_error(ret_status);
-          }
-          iter = result.keys.begin();
-          for (; iter != result.keys.end(); ++iter) {
-            if (!lease_cr->is_locked()) {
-              lost_lock = true;
-              tn->log(1, "lease is lost, abort");
-              break;
-            }
-            yield; // allow entries_index consumer to make progress
-
-            tn->log(20, SSTR("list metadata: section=" << *sections_iter << " key=" << *iter));
-            string s = *sections_iter + ":" + *iter;
-            int shard_id;
-           rgw::sal::RadosStore* store = sync_env->store;
-            int ret = store->ctl()->meta.mgr->get_shard_id(*sections_iter, *iter, &shard_id);
-            if (ret < 0) {
-              tn->log(0, SSTR("ERROR: could not determine shard id for " << *sections_iter << ":" << *iter));
-              ret_status = ret;
-              break;
-            }
-            if (!entries_index->append(s, shard_id)) {
-              break;
-            }
-          }
-        } while (result.truncated);
-      }
-      yield {
-        if (!entries_index->finish()) {
-          failed = true;
-        }
-      }
-      if (!failed) {
-        for (map<uint32_t, rgw_meta_sync_marker>::iterator iter = markers.begin(); iter != markers.end(); ++iter) {
-          int shard_id = (int)iter->first;
-          rgw_meta_sync_marker& marker = iter->second;
-          marker.total_entries = entries_index->get_total_entries(shard_id);
-          spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(dpp, sync_env->async_rados, sync_env->store->svc()->sysobj,
-                                                                rgw_raw_obj(sync_env->store->svc()->zone->get_zone_params().log_pool, sync_env->shard_obj_name(shard_id)),
-                                                                marker), true);
-        }
-      }
-
-      drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
-
-      yield lease_cr->go_down();
-
-      int ret;
-      while (collect(&ret, NULL)) {
-       if (ret < 0) {
-         return set_cr_error(ret);
-       }
-        yield;
-      }
-      drain_all();
-      if (failed) {
-        yield return set_cr_error(-EIO);
-      }
-      if (lost_lock) {
-        yield return set_cr_error(-EBUSY);
-      }
-
-      if (ret_status < 0) {
-        yield return set_cr_error(ret_status);
-      }
-
-      yield return set_cr_done();
-    }
-    return 0;
-  }
-};
-
-static string full_sync_index_shard_oid(int shard_id)
-{
-  char buf[mdlog_sync_full_sync_index_prefix.size() + 16];
-  snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_full_sync_index_prefix.c_str(), shard_id);
-  return string(buf);
-}
-
-class RGWReadRemoteMetadataCR : public RGWCoroutine {
-  RGWMetaSyncEnv *sync_env;
-
-  RGWRESTReadResource *http_op;
-
-  string section;
-  string key;
-
-  bufferlist *pbl;
-
-  RGWSyncTraceNodeRef tn;
-
-public:
-  RGWReadRemoteMetadataCR(RGWMetaSyncEnv *_sync_env,
-                                                      const string& _section, const string& _key, bufferlist *_pbl,
-                                                      const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
-                                                      http_op(NULL),
-                                                      section(_section),
-                                                      key(_key),
-                                                     pbl(_pbl) {
-    tn = sync_env->sync_tracer->add_node(_tn_parent, "read_remote_meta",
-                                         section + ":" + key);
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    RGWRESTConn *conn = sync_env->conn;
-    reenter(this) {
-      yield {
-        string key_encode;
-        url_encode(key, key_encode);
-        rgw_http_param_pair pairs[] = { { "key" , key.c_str()},
-                                       { NULL, NULL } };
-
-        string p = string("/admin/metadata/") + section + "/" + key_encode;
-
-        http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
-
-        init_new_io(http_op);
-
-        int ret = http_op->aio_read(dpp);
-        if (ret < 0) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl;
-          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
-          http_op->put();
-          return set_cr_error(ret);
-        }
-
-        return io_block(0);
-      }
-      yield {
-        int ret = http_op->wait(pbl, null_yield);
-        http_op->put();
-        if (ret < 0) {
-          return set_cr_error(ret);
-        }
-        return set_cr_done();
-      }
-    }
-    return 0;
-  }
-};
-
-class RGWAsyncMetaStoreEntry : public RGWAsyncRadosRequest {
-  rgw::sal::RadosStore* store;
-  string raw_key;
-  bufferlist bl;
-  const DoutPrefixProvider *dpp;
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override {
-    int ret = store->ctl()->meta.mgr->put(raw_key, bl, null_yield, dpp, RGWMDLogSyncType::APPLY_ALWAYS, true);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: can't store key: " << raw_key << " ret=" << ret << dendl;
-      return ret;
-    }
-    return 0;
-  }
-public:
-  RGWAsyncMetaStoreEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
-                       const string& _raw_key,
-                       bufferlist& _bl,
-                       const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn), store(_store),
-                                          raw_key(_raw_key), bl(_bl), dpp(dpp) {}
-};
-
-
-class RGWMetaStoreEntryCR : public RGWSimpleCoroutine {
-  RGWMetaSyncEnv *sync_env;
-  string raw_key;
-  bufferlist bl;
-
-  RGWAsyncMetaStoreEntry *req;
-
-public:
-  RGWMetaStoreEntryCR(RGWMetaSyncEnv *_sync_env,
-                       const string& _raw_key,
-                       bufferlist& _bl) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
-                                          raw_key(_raw_key), bl(_bl), req(NULL) {
-  }
-
-  ~RGWMetaStoreEntryCR() override {
-    if (req) {
-      req->finish();
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new RGWAsyncMetaStoreEntry(this, stack->create_completion_notifier(),
-                                  sync_env->store, raw_key, bl, dpp);
-    sync_env->async_rados->queue(req);
-    return 0;
-  }
-
-  int request_complete() override {
-    return req->get_ret_status();
-  }
-};
-
-class RGWAsyncMetaRemoveEntry : public RGWAsyncRadosRequest {
-  rgw::sal::RadosStore* store;
-  string raw_key;
-  const DoutPrefixProvider *dpp;
-protected:
-  int _send_request(const DoutPrefixProvider *dpp) override {
-    int ret = store->ctl()->meta.mgr->remove(raw_key, null_yield, dpp);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: can't remove key: " << raw_key << " ret=" << ret << dendl;
-      return ret;
-    }
-    return 0;
-  }
-public:
-  RGWAsyncMetaRemoveEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
-                       const string& _raw_key, const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn), store(_store),
-                                          raw_key(_raw_key), dpp(dpp) {}
-};
-
-
-class RGWMetaRemoveEntryCR : public RGWSimpleCoroutine {
-  RGWMetaSyncEnv *sync_env;
-  string raw_key;
-
-  RGWAsyncMetaRemoveEntry *req;
-
-public:
-  RGWMetaRemoveEntryCR(RGWMetaSyncEnv *_sync_env,
-                       const string& _raw_key) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
-                                          raw_key(_raw_key), req(NULL) {
-  }
-
-  ~RGWMetaRemoveEntryCR() override {
-    if (req) {
-      req->finish();
-    }
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new RGWAsyncMetaRemoveEntry(this, stack->create_completion_notifier(),
-                                  sync_env->store, raw_key, dpp);
-    sync_env->async_rados->queue(req);
-    return 0;
-  }
-
-  int request_complete() override {
-    int r = req->get_ret_status();
-    if (r == -ENOENT) {
-      r = 0;
-    }
-    return r;
-  }
-};
-
-#define META_SYNC_UPDATE_MARKER_WINDOW 10
-
-
-int RGWLastCallerWinsCR::operate(const DoutPrefixProvider *dpp) {
-  RGWCoroutine *call_cr;
-  reenter(this) {
-    while (cr) {
-      call_cr = cr;
-      cr = nullptr;
-      yield call(call_cr);
-      /* cr might have been modified at this point */
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
-
-class RGWMetaSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
-  RGWMetaSyncEnv *sync_env;
-
-  string marker_oid;
-  rgw_meta_sync_marker sync_marker;
-
-  RGWSyncTraceNodeRef tn;
-
-public:
-  RGWMetaSyncShardMarkerTrack(RGWMetaSyncEnv *_sync_env,
-                         const string& _marker_oid,
-                         const rgw_meta_sync_marker& _marker,
-                         RGWSyncTraceNodeRef& _tn) : RGWSyncShardMarkerTrack(META_SYNC_UPDATE_MARKER_WINDOW),
-                                                                sync_env(_sync_env),
-                                                                marker_oid(_marker_oid),
-                                                                sync_marker(_marker),
-                                                                tn(_tn){}
-
-  RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
-    sync_marker.marker = new_marker;
-    if (index_pos > 0) {
-      sync_marker.pos = index_pos;
-    }
-
-    if (!real_clock::is_zero(timestamp)) {
-      sync_marker.timestamp = timestamp;
-    }
-
-    ldpp_dout(sync_env->dpp, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << " realm_epoch=" << sync_marker.realm_epoch << dendl;
-    tn->log(20, SSTR("new marker=" << new_marker));
-    rgw::sal::RadosStore* store = sync_env->store;
-    return new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->dpp, sync_env->async_rados,
-                                                           store->svc()->sysobj,
-                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, marker_oid),
-                                                           sync_marker);
-  }
-
-  RGWOrderCallCR *allocate_order_control_cr() override {
-    return new RGWLastCallerWinsCR(sync_env->cct);
-  }
-};
-
-RGWMetaSyncSingleEntryCR::RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env,
-                          const string& _raw_key, const string& _entry_marker,
-                           const RGWMDLogStatus& _op_status,
-                           RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct),
-                                                      sync_env(_sync_env),
-                                                     raw_key(_raw_key), entry_marker(_entry_marker),
-                                                      op_status(_op_status),
-                                                      pos(0), sync_status(0),
-                                                      marker_tracker(_marker_tracker), tries(0) {
-  error_injection = (sync_env->cct->_conf->rgw_sync_meta_inject_err_probability > 0);
-  tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", raw_key);
-}
-
-int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) {
-  reenter(this) {
-#define NUM_TRANSIENT_ERROR_RETRIES 10
-
-    if (error_injection &&
-        rand() % 10000 < cct->_conf->rgw_sync_meta_inject_err_probability * 10000.0) {
-      return set_cr_error(-EIO);
-    }
-
-    if (op_status != MDLOG_STATUS_COMPLETE) {
-      tn->log(20, "skipping pending operation");
-      yield call(marker_tracker->finish(entry_marker));
-      if (retcode < 0) {
-        return set_cr_error(retcode);
-      }
-      return set_cr_done();
-    }
-    tn->set_flag(RGW_SNS_FLAG_ACTIVE);
-    for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
-      yield {
-        pos = raw_key.find(':');
-        section = raw_key.substr(0, pos);
-        key = raw_key.substr(pos + 1);
-        tn->log(10, SSTR("fetching remote metadata entry" << (tries == 0 ? "" : " (retry)")));
-        call(new RGWReadRemoteMetadataCR(sync_env, section, key, &md_bl, tn));
-      }
-
-      sync_status = retcode;
-
-      if (sync_status == -ENOENT) {
-        break;
-      }
-
-      if (sync_status < 0) {
-        if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) {
-          ldpp_dout(dpp, 20) << *this << ": failed to fetch remote metadata entry: " << section << ":" << key << ", will retry" << dendl;
-          continue;
-        }
-
-        tn->log(10, SSTR("failed to read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status));
-        log_error() << "failed to read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << std::endl;
-        yield call(sync_env->error_logger->log_error_cr(dpp, sync_env->conn->get_remote_id(), section, key, -sync_status,
-                                                        string("failed to read remote metadata entry: ") + cpp_strerror(-sync_status)));
-        return set_cr_error(sync_status);
-      }
-
-      break;
-    }
-
-    retcode = 0;
-    for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
-      if (sync_status != -ENOENT) {
-        tn->log(10, SSTR("storing local metadata entry: " << section << ":" << key));
-        yield call(new RGWMetaStoreEntryCR(sync_env, raw_key, md_bl));
-      } else {
-        tn->log(10, SSTR("removing local metadata entry:" << section << ":" << key));
-        yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key));
-        if (retcode == -ENOENT) {
-          retcode = 0;
-          break;
-        }
-      }
-      if ((retcode < 0) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
-        ldpp_dout(dpp, 20) << *this << ": failed to store metadata entry: " << section << ":" << key << ", got retcode=" << retcode << ", will retry" << dendl;
-        continue;
-      }
-      break;
-    }
-
-    sync_status = retcode;
-
-    if (sync_status == 0 && marker_tracker) {
-      /* update marker */
-      yield call(marker_tracker->finish(entry_marker));
-      sync_status = retcode;
-    }
-    if (sync_status < 0) {
-      tn->log(10, SSTR("failed, status=" << sync_status));
-      return set_cr_error(sync_status);
-    }
-    tn->log(10, "success");
-    return set_cr_done();
-  }
-  return 0;
-}
-
-class RGWCloneMetaLogCoroutine : public RGWCoroutine {
-  RGWMetaSyncEnv *sync_env;
-  RGWMetadataLog *mdlog;
-
-  const std::string& period;
-  int shard_id;
-  string marker;
-  bool truncated = false;
-  string *new_marker;
-
-  int max_entries = CLONE_MAX_ENTRIES;
-
-  RGWRESTReadResource *http_op = nullptr;
-  boost::intrusive_ptr<RGWMetadataLogInfoCompletion> completion;
-
-  RGWMetadataLogInfo shard_info;
-  rgw_mdlog_shard_data data;
-
-public:
-  RGWCloneMetaLogCoroutine(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
-                           const std::string& period, int _id,
-                           const string& _marker, string *_new_marker)
-    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
-      period(period), shard_id(_id), marker(_marker), new_marker(_new_marker) {
-    if (new_marker) {
-      *new_marker = marker;
-    }
-  }
-  ~RGWCloneMetaLogCoroutine() override {
-    if (http_op) {
-      http_op->put();
-    }
-    if (completion) {
-      completion->cancel();
-    }
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-
-  int state_init();
-  int state_read_shard_status();
-  int state_read_shard_status_complete();
-  int state_send_rest_request(const DoutPrefixProvider *dpp);
-  int state_receive_rest_response();
-  int state_store_mdlog_entries();
-  int state_store_mdlog_entries_complete();
-};
-
-class RGWMetaSyncShardCR : public RGWCoroutine {
-  RGWMetaSyncEnv *sync_env;
-
-  const rgw_pool& pool;
-  const std::string& period; //< currently syncing period id
-  const epoch_t realm_epoch; //< realm_epoch of period
-  RGWMetadataLog* mdlog; //< log of syncing period
-  uint32_t shard_id;
-  rgw_meta_sync_marker& sync_marker;
-  boost::optional<rgw_meta_sync_marker> temp_marker; //< for pending updates
-  string marker;
-  string max_marker;
-  const std::string& period_marker; //< max marker stored in next period
-
-  RGWRadosGetOmapKeysCR::ResultPtr omapkeys;
-  std::set<std::string> entries;
-  std::set<std::string>::iterator iter;
-
-  string oid;
-
-  RGWMetaSyncShardMarkerTrack *marker_tracker = nullptr;
-
-  list<cls_log_entry> log_entries;
-  list<cls_log_entry>::iterator log_iter;
-  bool truncated = false;
-
-  string mdlog_marker;
-  string raw_key;
-  rgw_mdlog_entry mdlog_entry;
-
-  ceph::mutex inc_lock = ceph::make_mutex("RGWMetaSyncShardCR::inc_lock");
-  ceph::condition_variable inc_cond;
-
-  boost::asio::coroutine incremental_cr;
-  boost::asio::coroutine full_cr;
-
-  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
-  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
-
-  bool lost_lock = false;
-
-  bool *reset_backoff;
-
-  // hold a reference to the cr stack while it's in the map
-  using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
-  map<StackRef, string> stack_to_pos;
-  map<string, string> pos_to_prev;
-
-  bool can_adjust_marker = false;
-  bool done_with_period = false;
-
-  int total_entries = 0;
-
-  RGWSyncTraceNodeRef tn;
-public:
-  RGWMetaSyncShardCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
-                     const std::string& period, epoch_t realm_epoch,
-                     RGWMetadataLog* mdlog, uint32_t _shard_id,
-                     rgw_meta_sync_marker& _marker,
-                     const std::string& period_marker, bool *_reset_backoff,
-                     RGWSyncTraceNodeRef& _tn)
-    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool),
-      period(period), realm_epoch(realm_epoch), mdlog(mdlog),
-      shard_id(_shard_id), sync_marker(_marker),
-      period_marker(period_marker),
-      reset_backoff(_reset_backoff), tn(_tn) {
-    *reset_backoff = false;
-  }
-
-  ~RGWMetaSyncShardCR() override {
-    delete marker_tracker;
-    if (lease_cr) {
-      lease_cr->abort();
-    }
-  }
-
-  void set_marker_tracker(RGWMetaSyncShardMarkerTrack *mt) {
-    delete marker_tracker;
-    marker_tracker = mt;
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    int r;
-    while (true) {
-      switch (sync_marker.state) {
-      case rgw_meta_sync_marker::FullSync:
-        r  = full_sync();
-        if (r < 0) {
-          ldpp_dout(dpp, 10) << "sync: full_sync: shard_id=" << shard_id << " r=" << r << dendl;
-          return set_cr_error(r);
-        }
-        return 0;
-      case rgw_meta_sync_marker::IncrementalSync:
-        r  = incremental_sync();
-        if (r < 0) {
-          ldpp_dout(dpp, 10) << "sync: incremental_sync: shard_id=" << shard_id << " r=" << r << dendl;
-          return set_cr_error(r);
-        }
-        return 0;
-      }
-    }
-    /* unreachable */
-    return 0;
-  }
-
-  void collect_children()
-  {
-    int child_ret;
-    RGWCoroutinesStack *child;
-    while (collect_next(&child_ret, &child)) {
-      auto iter = stack_to_pos.find(child);
-      if (iter == stack_to_pos.end()) {
-        /* some other stack that we don't care about */
-        continue;
-      }
-
-      string& pos = iter->second;
-
-      if (child_ret < 0) {
-        ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl;
-        // on any error code from RGWMetaSyncSingleEntryCR, we do not advance
-        // the sync status marker past this entry, and set
-        // can_adjust_marker=false to exit out of RGWMetaSyncShardCR.
-        // RGWMetaSyncShardControlCR will rerun RGWMetaSyncShardCR from the
-        // previous marker and retry
-        can_adjust_marker = false;
-      }
-
-      map<string, string>::iterator prev_iter = pos_to_prev.find(pos);
-      ceph_assert(prev_iter != pos_to_prev.end());
-
-      if (pos_to_prev.size() == 1) {
-        if (can_adjust_marker) {
-          sync_marker.marker = pos;
-        }
-        pos_to_prev.erase(prev_iter);
-      } else {
-        ceph_assert(pos_to_prev.size() > 1);
-        pos_to_prev.erase(prev_iter);
-        prev_iter = pos_to_prev.begin();
-        if (can_adjust_marker) {
-          sync_marker.marker = prev_iter->second;
-        }
-      }
-
-      ldpp_dout(sync_env->dpp, 4) << *this << ": adjusting marker pos=" << sync_marker.marker << dendl;
-      stack_to_pos.erase(iter);
-    }
-  }
-
-  int full_sync() {
-#define OMAP_GET_MAX_ENTRIES 100
-    int max_entries = OMAP_GET_MAX_ENTRIES;
-    reenter(&full_cr) {
-      set_status("full_sync");
-      tn->log(10, "start full sync");
-      oid = full_sync_index_shard_oid(shard_id);
-      can_adjust_marker = true;
-      /* grab lock */
-      yield {
-       uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
-        string lock_name = "sync_lock";
-       rgw::sal::RadosStore* store = sync_env->store;
-        lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
-                                                rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
-                                                lock_name, lock_duration, this));
-        lease_stack.reset(spawn(lease_cr.get(), false));
-        lost_lock = false;
-      }
-      while (!lease_cr->is_locked()) {
-        if (lease_cr->is_done()) {
-          drain_all();
-          tn->log(5, "failed to take lease");
-          return lease_cr->get_ret_status();
-        }
-        set_sleeping(true);
-        yield;
-      }
-      tn->log(10, "took lease");
-
-      /* lock succeeded, a retry now should avoid previous backoff status */
-      *reset_backoff = true;
-
-      /* prepare marker tracker */
-      set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
-                                                         sync_env->shard_obj_name(shard_id),
-                                                         sync_marker, tn));
-
-      marker = sync_marker.marker;
-
-      total_entries = sync_marker.pos;
-
-      /* sync! */
-      do {
-        if (!lease_cr->is_locked()) {
-          tn->log(1, "lease is lost, abort");
-          lost_lock = true;
-          break;
-        }
-        omapkeys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
-        yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, oid),
-                                             marker, max_entries, omapkeys));
-        if (retcode < 0) {
-          ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): RGWRadosGetOmapKeysCR() returned ret=" << retcode << dendl;
-          tn->log(0, SSTR("ERROR: failed to list omap keys, status=" << retcode));
-          yield lease_cr->go_down();
-          drain_all();
-          return retcode;
-        }
-        entries = std::move(omapkeys->entries);
-        tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync"));
-        if (entries.size() > 0) {
-          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
-        }
-        iter = entries.begin();
-        for (; iter != entries.end(); ++iter) {
-          marker = *iter;
-          tn->log(20, SSTR("full sync: " << marker));
-          total_entries++;
-          if (!marker_tracker->start(marker, total_entries, real_time())) {
-            tn->log(0, SSTR("ERROR: cannot start syncing " << marker << ". Duplicate entry?"));
-          } else {
-            // fetch remote and write locally
-            yield {
-              RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, marker, marker, MDLOG_STATUS_COMPLETE, marker_tracker, tn), false);
-              // stack_to_pos holds a reference to the stack
-              stack_to_pos[stack] = marker;
-              pos_to_prev[marker] = marker;
-            }
-            // limit spawn window
-            while (num_spawned() > static_cast<size_t>(cct->_conf->rgw_meta_sync_spawn_window)) {
-              yield wait_for_child();
-              collect_children();
-            }
-          }
-        }
-        collect_children();
-      } while (omapkeys->more && can_adjust_marker);
-
-      tn->unset_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
-
-      while (num_spawned() > 1) {
-        yield wait_for_child();
-        collect_children();
-      }
-
-      if (!lost_lock) {
-        /* update marker to reflect we're done with full sync */
-        if (can_adjust_marker) {
-          // apply updates to a temporary marker, or operate() will send us
-          // to incremental_sync() after we yield
-          temp_marker = sync_marker;
-         temp_marker->state = rgw_meta_sync_marker::IncrementalSync;
-         temp_marker->marker = std::move(temp_marker->next_step_marker);
-         temp_marker->next_step_marker.clear();
-         temp_marker->realm_epoch = realm_epoch;
-         ldpp_dout(sync_env->dpp, 4) << *this << ": saving marker pos=" << temp_marker->marker << " realm_epoch=" << realm_epoch << dendl;
-
-         using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_meta_sync_marker>;
-         yield call(new WriteMarkerCR(sync_env->dpp, sync_env->async_rados, sync_env->store->svc()->sysobj,
-                                      rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
-                                      *temp_marker));
-        }
-
-        if (retcode < 0) {
-          ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to set sync marker: retcode=" << retcode << dendl;
-          yield lease_cr->go_down();
-          drain_all();
-          return retcode;
-        }
-        // clean up full sync index
-        yield {
-          auto oid = full_sync_index_shard_oid(shard_id);
-          call(new RGWRadosRemoveCR(sync_env->store, {pool, oid}));
-        }
-      }
-
-      /* 
-       * if we reached here, it means that lost_lock is true, otherwise the state
-       * change in the previous block will prevent us from reaching here
-       */
-
-      yield lease_cr->go_down();
-
-      lease_cr.reset();
-
-      drain_all();
-
-      if (!can_adjust_marker) {
-        return -EAGAIN;
-      }
-
-      if (lost_lock) {
-        return -EBUSY;
-      }
-
-      tn->log(10, "full sync complete");
-
-      // apply the sync marker update
-      ceph_assert(temp_marker);
-      sync_marker = std::move(*temp_marker);
-      temp_marker = boost::none;
-      // must not yield after this point!
-    }
-    return 0;
-  }
-    
-
-  int incremental_sync() {
-    reenter(&incremental_cr) {
-      set_status("incremental_sync");
-      tn->log(10, "start incremental sync");
-      can_adjust_marker = true;
-      /* grab lock */
-      if (!lease_cr) { /* could have had  a lease_cr lock from previous state */
-        yield {
-          uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
-          string lock_name = "sync_lock";
-         rgw::sal::RadosStore* store = sync_env->store;
-          lease_cr.reset( new RGWContinuousLeaseCR(sync_env->async_rados, store,
-                                                   rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
-                                                   lock_name, lock_duration, this));
-          lease_stack.reset(spawn(lease_cr.get(), false));
-          lost_lock = false;
-        }
-        while (!lease_cr->is_locked()) {
-          if (lease_cr->is_done()) {
-            drain_all();
-            tn->log(5, "failed to take lease");
-            return lease_cr->get_ret_status();
-          }
-          set_sleeping(true);
-          yield;
-        }
-      }
-      tn->log(10, "took lease");
-      // if the period has advanced, we can't use the existing marker
-      if (sync_marker.realm_epoch < realm_epoch) {
-        ldpp_dout(sync_env->dpp, 4) << "clearing marker=" << sync_marker.marker
-            << " from old realm_epoch=" << sync_marker.realm_epoch
-            << " (now " << realm_epoch << ')' << dendl;
-        sync_marker.realm_epoch = realm_epoch;
-        sync_marker.marker.clear();
-      }
-      mdlog_marker = sync_marker.marker;
-      set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
-                                                         sync_env->shard_obj_name(shard_id),
-                                                         sync_marker, tn));
-
-      /*
-       * mdlog_marker: the remote sync marker positiion
-       * sync_marker: the local sync marker position
-       * max_marker: the max mdlog position that we fetched
-       * marker: the current position we try to sync
-       * period_marker: the last marker before the next period begins (optional)
-       */
-      marker = max_marker = sync_marker.marker;
-      /* inc sync */
-      do {
-        if (!lease_cr->is_locked()) {
-          lost_lock = true;
-          tn->log(1, "lease is lost, abort");
-          break;
-        }
-#define INCREMENTAL_MAX_ENTRIES 100
-        ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << " truncated=" << truncated << dendl;
-        if (!period_marker.empty() && period_marker <= mdlog_marker) {
-          tn->log(10, SSTR("finished syncing current period: mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker << " period_marker=" << period_marker));
-          done_with_period = true;
-          break;
-        }
-       if (mdlog_marker <= max_marker || !truncated) {
-         /* we're at the tip, try to bring more entries */
-          ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " syncing mdlog for shard_id=" << shard_id << dendl;
-          yield call(new RGWCloneMetaLogCoroutine(sync_env, mdlog,
-                                                  period, shard_id,
-                                                  mdlog_marker, &mdlog_marker));
-       }
-        if (retcode < 0) {
-          tn->log(10, SSTR(*this << ": failed to fetch more log entries, retcode=" << retcode));
-          yield lease_cr->go_down();
-          drain_all();
-          *reset_backoff = false; // back off and try again later
-          return retcode;
-        }
-        truncated = true;
-        *reset_backoff = true; /* if we got to this point, all systems function */
-       if (mdlog_marker > max_marker) {
-          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
-          tn->log(20, SSTR("mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker));
-          marker = max_marker;
-          yield call(new RGWReadMDLogEntriesCR(sync_env, mdlog, shard_id,
-                                               &max_marker, INCREMENTAL_MAX_ENTRIES,
-                                               &log_entries, &truncated));
-          if (retcode < 0) {
-            tn->log(10, SSTR("failed to list mdlog entries, retcode=" << retcode));
-            yield lease_cr->go_down();
-            drain_all();
-            *reset_backoff = false; // back off and try again later
-            return retcode;
-          }
-          for (log_iter = log_entries.begin(); log_iter != log_entries.end() && !done_with_period; ++log_iter) {
-            if (!period_marker.empty() && period_marker <= log_iter->id) {
-              done_with_period = true;
-              if (period_marker < log_iter->id) {
-                tn->log(10, SSTR("found key=" << log_iter->id
-                    << " past period_marker=" << period_marker));
-                break;
-              }
-              ldpp_dout(sync_env->dpp, 10) << "found key at period_marker=" << period_marker << dendl;
-              // sync this entry, then return control to RGWMetaSyncCR
-            }
-            if (!mdlog_entry.convert_from(*log_iter)) {
-              tn->log(0, SSTR("ERROR: failed to convert mdlog entry, shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << " ... skipping entry"));
-              continue;
-            }
-            tn->log(20, SSTR("log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp));
-            if (!marker_tracker->start(log_iter->id, 0, log_iter->timestamp.to_real_time())) {
-              ldpp_dout(sync_env->dpp, 0) << "ERROR: cannot start syncing " << log_iter->id << ". Duplicate entry?" << dendl;
-            } else {
-              raw_key = log_iter->section + ":" + log_iter->name;
-              yield {
-                RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, raw_key, log_iter->id, mdlog_entry.log_data.status, marker_tracker, tn), false);
-                ceph_assert(stack);
-                // stack_to_pos holds a reference to the stack
-                stack_to_pos[stack] = log_iter->id;
-                pos_to_prev[log_iter->id] = marker;
-              }
-              // limit spawn window
-              while (num_spawned() > static_cast<size_t>(cct->_conf->rgw_meta_sync_spawn_window)) {
-                yield wait_for_child();
-                collect_children();
-              }
-            }
-            marker = log_iter->id;
-          }
-        }
-        collect_children();
-       ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " max_marker=" << max_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
-        if (done_with_period) {
-          // return control to RGWMetaSyncCR and advance to the next period
-          tn->log(10, SSTR(*this << ": done with period"));
-          break;
-        }
-       if (mdlog_marker == max_marker && can_adjust_marker) {
-          tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
-         yield wait(utime_t(cct->_conf->rgw_meta_sync_poll_interval, 0));
-       }
-      } while (can_adjust_marker);
-
-      tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
-
-      while (num_spawned() > 1) {
-        yield wait_for_child();
-        collect_children();
-      }
-
-      yield lease_cr->go_down();
-
-      drain_all();
-
-      if (lost_lock) {
-        return -EBUSY;
-      }
-
-      if (!can_adjust_marker) {
-        return -EAGAIN;
-      }
-
-      return set_cr_done();
-    }
-    /* TODO */
-    return 0;
-  }
-};
-
-class RGWMetaSyncShardControlCR : public RGWBackoffControlCR
-{
-  RGWMetaSyncEnv *sync_env;
-
-  const rgw_pool& pool;
-  const std::string& period;
-  epoch_t realm_epoch;
-  RGWMetadataLog* mdlog;
-  uint32_t shard_id;
-  rgw_meta_sync_marker sync_marker;
-  const std::string period_marker;
-
-  RGWSyncTraceNodeRef tn;
-
-  static constexpr bool exit_on_error = false; // retry on all errors
-public:
-  RGWMetaSyncShardControlCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
-                            const std::string& period, epoch_t realm_epoch,
-                            RGWMetadataLog* mdlog, uint32_t _shard_id,
-                            const rgw_meta_sync_marker& _marker,
-                            std::string&& period_marker,
-                            RGWSyncTraceNodeRef& _tn_parent)
-    : RGWBackoffControlCR(_sync_env->cct, exit_on_error), sync_env(_sync_env),
-      pool(_pool), period(period), realm_epoch(realm_epoch), mdlog(mdlog),
-      shard_id(_shard_id), sync_marker(_marker),
-      period_marker(std::move(period_marker)) {
-    tn = sync_env->sync_tracer->add_node(_tn_parent, "shard",
-                                         std::to_string(shard_id));
-  }
-
-  RGWCoroutine *alloc_cr() override {
-    return new RGWMetaSyncShardCR(sync_env, pool, period, realm_epoch, mdlog,
-                                  shard_id, sync_marker, period_marker, backoff_ptr(), tn);
-  }
-
-  RGWCoroutine *alloc_finisher_cr() override {
-    rgw::sal::RadosStore* store = sync_env->store;
-    return new RGWSimpleRadosReadCR<rgw_meta_sync_marker>(sync_env->dpp, sync_env->async_rados, store->svc()->sysobj,
-                                                          rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
-                                                          &sync_marker);
-  }
-};
-
-class RGWMetaSyncCR : public RGWCoroutine {
-  RGWMetaSyncEnv *sync_env;
-  const rgw_pool& pool;
-  RGWPeriodHistory::Cursor cursor; //< sync position in period history
-  RGWPeriodHistory::Cursor next; //< next period in history
-  rgw_meta_sync_status sync_status;
-  RGWSyncTraceNodeRef tn;
-
-  std::mutex mutex; //< protect access to shard_crs
-
-  // TODO: it should be enough to hold a reference on the stack only, as calling
-  // RGWCoroutinesStack::wakeup() doesn't refer to the RGWCoroutine if it has
-  // already completed
-  using ControlCRRef = boost::intrusive_ptr<RGWMetaSyncShardControlCR>;
-  using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
-  using RefPair = std::pair<ControlCRRef, StackRef>;
-  map<int, RefPair> shard_crs;
-  int ret{0};
-
-public:
-  RGWMetaSyncCR(RGWMetaSyncEnv *_sync_env, const RGWPeriodHistory::Cursor &cursor,
-                const rgw_meta_sync_status& _sync_status, RGWSyncTraceNodeRef& _tn)
-    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
-      pool(sync_env->store->svc()->zone->get_zone_params().log_pool),
-      cursor(cursor), sync_status(_sync_status), tn(_tn) {}
-
-  ~RGWMetaSyncCR() {
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      // loop through one period at a time
-      tn->log(1, "start");
-      for (;;) {
-        if (cursor == sync_env->store->svc()->mdlog->get_period_history()->get_current()) {
-          next = RGWPeriodHistory::Cursor{};
-          if (cursor) {
-            ldpp_dout(dpp, 10) << "RGWMetaSyncCR on current period="
-                << cursor.get_period().get_id() << dendl;
-          } else {
-            ldpp_dout(dpp, 10) << "RGWMetaSyncCR with no period" << dendl;
-          }
-        } else {
-          next = cursor;
-          next.next();
-          ldpp_dout(dpp, 10) << "RGWMetaSyncCR on period="
-              << cursor.get_period().get_id() << ", next="
-              << next.get_period().get_id() << dendl;
-        }
-
-        yield {
-          // get the mdlog for the current period (may be empty)
-          auto& period_id = sync_status.sync_info.period;
-          auto realm_epoch = sync_status.sync_info.realm_epoch;
-          auto mdlog = sync_env->store->svc()->mdlog->get_log(period_id);
-
-          tn->log(1, SSTR("realm epoch=" << realm_epoch << " period id=" << period_id));
-
-          // prevent wakeup() from accessing shard_crs while we're spawning them
-          std::lock_guard<std::mutex> lock(mutex);
-
-          // sync this period on each shard
-          for (const auto& m : sync_status.sync_markers) {
-            uint32_t shard_id = m.first;
-            auto& marker = m.second;
-
-            std::string period_marker;
-            if (next) {
-              // read the maximum marker from the next period's sync status
-              period_marker = next.get_period().get_sync_status()[shard_id];
-              if (period_marker.empty()) {
-                // no metadata changes have occurred on this shard, skip it
-                ldpp_dout(dpp, 10) << "RGWMetaSyncCR: skipping shard " << shard_id
-                    << " with empty period marker" << dendl;
-                continue;
-              }
-            }
-
-            using ShardCR = RGWMetaSyncShardControlCR;
-            auto cr = new ShardCR(sync_env, pool, period_id, realm_epoch,
-                                  mdlog, shard_id, marker,
-                                  std::move(period_marker), tn);
-            auto stack = spawn(cr, false);
-            shard_crs[shard_id] = RefPair{cr, stack};
-          }
-        }
-        // wait for each shard to complete
-        while (ret == 0 && num_spawned() > 0) {
-          yield wait_for_child();
-          collect(&ret, nullptr);
-        }
-        drain_all();
-        {
-          // drop shard cr refs under lock
-          std::lock_guard<std::mutex> lock(mutex);
-          shard_crs.clear();
-        }
-        if (ret < 0) {
-          return set_cr_error(ret);
-        }
-        // advance to the next period
-        ceph_assert(next);
-        cursor = next;
-
-        // write the updated sync info
-        sync_status.sync_info.period = cursor.get_period().get_id();
-        sync_status.sync_info.realm_epoch = cursor.get_epoch();
-        yield call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, sync_env->async_rados,
-                                                                 sync_env->store->svc()->sysobj,
-                                                                 rgw_raw_obj(pool, sync_env->status_oid()),
-                                                                 sync_status.sync_info));
-      }
-    }
-    return 0;
-  }
-
-  void wakeup(int shard_id) {
-    std::lock_guard<std::mutex> lock(mutex);
-    auto iter = shard_crs.find(shard_id);
-    if (iter == shard_crs.end()) {
-      return;
-    }
-    iter->second.first->wakeup();
-  }
-};
-
-void RGWRemoteMetaLog::init_sync_env(RGWMetaSyncEnv *env) {
-  env->dpp = dpp;
-  env->cct = store->ctx();
-  env->store = store;
-  env->conn = conn;
-  env->async_rados = async_rados;
-  env->http_manager = &http_manager;
-  env->error_logger = error_logger;
-  env->sync_tracer = store->getRados()->get_sync_tracer();
-}
-
-int RGWRemoteMetaLog::read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status)
-{
-  if (store->svc()->zone->is_meta_master()) {
-    return 0;
-  }
-  // cannot run concurrently with run_sync(), so run in a separate manager
-  RGWCoroutinesManager crs(store->ctx(), store->getRados()->get_cr_registry());
-  RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
-  int ret = http_manager.start();
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
-    return ret;
-  }
-  RGWMetaSyncEnv sync_env_local = sync_env;
-  sync_env_local.http_manager = &http_manager;
-  tn->log(20, "read sync status");
-  ret = crs.run(dpp, new RGWReadSyncStatusCoroutine(&sync_env_local, sync_status));
-  http_manager.stop();
-  return ret;
-}
-
-int RGWRemoteMetaLog::init_sync_status(const DoutPrefixProvider *dpp)
-{
-  if (store->svc()->zone->is_meta_master()) {
-    return 0;
-  }
-
-  rgw_mdlog_info mdlog_info;
-  int r = read_log_info(dpp, &mdlog_info);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
-    return r;
-  }
-
-  rgw_meta_sync_info sync_info;
-  sync_info.num_shards = mdlog_info.num_shards;
-  auto cursor = store->svc()->mdlog->get_period_history()->get_current();
-  if (cursor) {
-    sync_info.period = cursor.get_period().get_id();
-    sync_info.realm_epoch = cursor.get_epoch();
-  }
-
-  return run(dpp, new RGWInitSyncStatusCoroutine(&sync_env, sync_info));
-}
-
-int RGWRemoteMetaLog::store_sync_info(const DoutPrefixProvider *dpp, const rgw_meta_sync_info& sync_info)
-{
-  tn->log(20, "store sync info");
-  return run(dpp, new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, async_rados, store->svc()->sysobj,
-                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env.status_oid()),
-                                                           sync_info));
-}
-
-// return a cursor to the period at our sync position
-static RGWPeriodHistory::Cursor get_period_at(const DoutPrefixProvider *dpp,
-                                              rgw::sal::RadosStore* store,
-                                              const rgw_meta_sync_info& info,
-                                             optional_yield y)
-{
-  if (info.period.empty()) {
-    // return an empty cursor with error=0
-    return RGWPeriodHistory::Cursor{};
-  }
-
-  // look for an existing period in our history
-  auto cursor = store->svc()->mdlog->get_period_history()->lookup(info.realm_epoch);
-  if (cursor) {
-    // verify that the period ids match
-    auto& existing = cursor.get_period().get_id();
-    if (existing != info.period) {
-      ldpp_dout(dpp, -1) << "ERROR: sync status period=" << info.period
-          << " does not match period=" << existing
-          << " in history at realm epoch=" << info.realm_epoch << dendl;
-      return RGWPeriodHistory::Cursor{-EEXIST};
-    }
-    return cursor;
-  }
-
-  // read the period from rados or pull it from the master
-  RGWPeriod period;
-  int r = store->svc()->mdlog->pull_period(dpp, info.period, period, y);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << "ERROR: failed to read period id "
-        << info.period << ": " << cpp_strerror(r) << dendl;
-    return RGWPeriodHistory::Cursor{r};
-  }
-  // attach the period to our history
-  cursor = store->svc()->mdlog->get_period_history()->attach(dpp, std::move(period), y);
-  if (!cursor) {
-    r = cursor.get_error();
-    ldpp_dout(dpp, -1) << "ERROR: failed to read period history back to "
-        << info.period << ": " << cpp_strerror(r) << dendl;
-  }
-  return cursor;
-}
-
-int RGWRemoteMetaLog::run_sync(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  if (store->svc()->zone->is_meta_master()) {
-    return 0;
-  }
-
-  int r = 0;
-
-  // get shard count and oldest log period from master
-  rgw_mdlog_info mdlog_info;
-  for (;;) {
-    if (going_down) {
-      ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl;
-      return 0;
-    }
-    r = read_log_info(dpp, &mdlog_info);
-    if (r == -EIO || r == -ENOENT) {
-      // keep retrying if master isn't alive or hasn't initialized the log
-      ldpp_dout(dpp, 10) << __func__ << "(): waiting for master.." << dendl;
-      backoff.backoff_sleep();
-      continue;
-    }
-    backoff.reset();
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
-      return r;
-    }
-    break;
-  }
-
-  rgw_meta_sync_status sync_status;
-  do {
-    if (going_down) {
-      ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl;
-      return 0;
-    }
-    r = run(dpp, new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
-    if (r < 0 && r != -ENOENT) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to fetch sync status r=" << r << dendl;
-      return r;
-    }
-
-    if (!mdlog_info.period.empty()) {
-      // restart sync if the remote has a period, but:
-      // a) our status does not, or
-      // b) our sync period comes before the remote's oldest log period
-      if (sync_status.sync_info.period.empty() ||
-          sync_status.sync_info.realm_epoch < mdlog_info.realm_epoch) {
-        sync_status.sync_info.state = rgw_meta_sync_info::StateInit;
-        string reason;
-        if (sync_status.sync_info.period.empty()) {
-          reason = "period is empty";
-        } else {
-          reason = SSTR("sync_info realm epoch is behind: " << sync_status.sync_info.realm_epoch << " < " << mdlog_info.realm_epoch);
-        }
-        tn->log(1, "initialize sync (reason: " + reason + ")");
-        ldpp_dout(dpp, 1) << "epoch=" << sync_status.sync_info.realm_epoch
-           << " in sync status comes before remote's oldest mdlog epoch="
-           << mdlog_info.realm_epoch << ", restarting sync" << dendl;
-      }
-    }
-
-    if (sync_status.sync_info.state == rgw_meta_sync_info::StateInit) {
-      ldpp_dout(dpp, 20) << __func__ << "(): init" << dendl;
-      sync_status.sync_info.num_shards = mdlog_info.num_shards;
-      auto cursor = store->svc()->mdlog->get_period_history()->get_current();
-      if (cursor) {
-        // run full sync, then start incremental from the current period/epoch
-        sync_status.sync_info.period = cursor.get_period().get_id();
-        sync_status.sync_info.realm_epoch = cursor.get_epoch();
-      }
-      r = run(dpp, new RGWInitSyncStatusCoroutine(&sync_env, sync_status.sync_info));
-      if (r == -EBUSY) {
-        backoff.backoff_sleep();
-        continue;
-      }
-      backoff.reset();
-      if (r < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to init sync status r=" << r << dendl;
-        return r;
-      }
-    }
-  } while (sync_status.sync_info.state == rgw_meta_sync_info::StateInit);
-
-  auto num_shards = sync_status.sync_info.num_shards;
-  if (num_shards != mdlog_info.num_shards) {
-    ldpp_dout(dpp, -1) << "ERROR: can't sync, mismatch between num shards, master num_shards=" << mdlog_info.num_shards << " local num_shards=" << num_shards << dendl;
-    return -EINVAL;
-  }
-
-  RGWPeriodHistory::Cursor cursor;
-  do {
-    r = run(dpp, new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
-    if (r < 0 && r != -ENOENT) {
-      tn->log(0, SSTR("ERROR: failed to fetch sync status r=" << r));
-      return r;
-    }
-
-    switch ((rgw_meta_sync_info::SyncState)sync_status.sync_info.state) {
-      case rgw_meta_sync_info::StateBuildingFullSyncMaps:
-        tn->log(20, "building full sync maps");
-        r = run(dpp, new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn));
-        if (r == -EBUSY || r == -EIO) {
-          backoff.backoff_sleep();
-          continue;
-        }
-        backoff.reset();
-        if (r < 0) {
-          tn->log(0, SSTR("ERROR: failed to fetch all metadata keys (r=" << r << ")"));
-          return r;
-        }
-
-        sync_status.sync_info.state = rgw_meta_sync_info::StateSync;
-        r = store_sync_info(dpp, sync_status.sync_info);
-        if (r < 0) {
-          tn->log(0, SSTR("ERROR: failed to update sync status (r=" << r << ")"));
-          return r;
-        }
-        /* fall through */
-      case rgw_meta_sync_info::StateSync:
-        tn->log(20, "sync");
-        // find our position in the period history (if any)
-        cursor = get_period_at(dpp, store, sync_status.sync_info, y);
-        r = cursor.get_error();
-        if (r < 0) {
-          return r;
-        }
-        meta_sync_cr = new RGWMetaSyncCR(&sync_env, cursor, sync_status, tn);
-        r = run(dpp, meta_sync_cr);
-        if (r < 0) {
-          tn->log(0, "ERROR: failed to fetch all metadata keys");
-          return r;
-        }
-        break;
-      default:
-        tn->log(0, "ERROR: bad sync state!");
-        return -EIO;
-    }
-  } while (!going_down);
-
-  return 0;
-}
-
-void RGWRemoteMetaLog::wakeup(int shard_id)
-{
-  if (!meta_sync_cr) {
-    return;
-  }
-  meta_sync_cr->wakeup(shard_id);
-}
-
-int RGWCloneMetaLogCoroutine::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    do {
-      yield {
-        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": init request" << dendl;
-        return state_init();
-      }
-      yield {
-        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status" << dendl;
-        return state_read_shard_status();
-      }
-      yield {
-        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status complete" << dendl;
-        return state_read_shard_status_complete();
-      }
-      yield {
-        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": sending rest request" << dendl;
-        return state_send_rest_request(dpp);
-      }
-      yield {
-        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": receiving rest response" << dendl;
-        return state_receive_rest_response();
-      }
-      yield {
-        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries" << dendl;
-        return state_store_mdlog_entries();
-      }
-    } while (truncated);
-    yield {
-      ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries complete" << dendl;
-      return state_store_mdlog_entries_complete();
-    }
-  }
-
-  return 0;
-}
-
-int RGWCloneMetaLogCoroutine::state_init()
-{
-  data = rgw_mdlog_shard_data();
-
-  return 0;
-}
-
-int RGWCloneMetaLogCoroutine::state_read_shard_status()
-{
-  const bool add_ref = false; // default constructs with refs=1
-
-  completion.reset(new RGWMetadataLogInfoCompletion(
-    [this](int ret, const cls_log_header& header) {
-      if (ret < 0) {
-        if (ret != -ENOENT) {
-          ldpp_dout(sync_env->dpp, 1) << "ERROR: failed to read mdlog info with "
-                                      << cpp_strerror(ret) << dendl;
-        }
-      } else {
-        shard_info.marker = header.max_marker;
-        shard_info.last_update = header.max_time.to_real_time();
-      }
-      // wake up parent stack
-      io_complete();
-    }), add_ref);
-
-  int ret = mdlog->get_info_async(sync_env->dpp, shard_id, completion.get());
-  if (ret < 0) {
-    ldpp_dout(sync_env->dpp, 0) << "ERROR: mdlog->get_info_async() returned ret=" << ret << dendl;
-    return set_cr_error(ret);
-  }
-
-  return io_block(0);
-}
-
-int RGWCloneMetaLogCoroutine::state_read_shard_status_complete()
-{
-  completion.reset();
-
-  ldpp_dout(sync_env->dpp, 20) << "shard_id=" << shard_id << " marker=" << shard_info.marker << " last_update=" << shard_info.last_update << dendl;
-
-  marker = shard_info.marker;
-
-  return 0;
-}
-
-int RGWCloneMetaLogCoroutine::state_send_rest_request(const DoutPrefixProvider *dpp)
-{
-  RGWRESTConn *conn = sync_env->conn;
-
-  char buf[32];
-  snprintf(buf, sizeof(buf), "%d", shard_id);
-
-  char max_entries_buf[32];
-  snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", max_entries);
-
-  const char *marker_key = (marker.empty() ? "" : "marker");
-
-  rgw_http_param_pair pairs[] = { { "type", "metadata" },
-                                  { "id", buf },
-                                  { "period", period.c_str() },
-                                  { "max-entries", max_entries_buf },
-                                  { marker_key, marker.c_str() },
-                                  { NULL, NULL } };
-
-  http_op = new RGWRESTReadResource(conn, "/admin/log", pairs, NULL, sync_env->http_manager);
-
-  init_new_io(http_op);
-
-  int ret = http_op->aio_read(dpp);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl;
-    log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
-    http_op->put();
-    http_op = NULL;
-    return set_cr_error(ret);
-  }
-
-  return io_block(0);
-}
-
-int RGWCloneMetaLogCoroutine::state_receive_rest_response()
-{
-  int ret = http_op->wait(&data, null_yield);
-  if (ret < 0) {
-    error_stream << "http operation failed: " << http_op->to_str() << " status=" << http_op->get_http_status() << std::endl;
-    ldpp_dout(sync_env->dpp, 5) << "failed to wait for op, ret=" << ret << dendl;
-    http_op->put();
-    http_op = NULL;
-    return set_cr_error(ret);
-  }
-  http_op->put();
-  http_op = NULL;
-
-  ldpp_dout(sync_env->dpp, 20) << "remote mdlog, shard_id=" << shard_id << " num of shard entries: " << data.entries.size() << dendl;
-
-  truncated = ((int)data.entries.size() == max_entries);
-
-  if (data.entries.empty()) {
-    if (new_marker) {
-      *new_marker = marker;
-    }
-    return set_cr_done();
-  }
-
-  if (new_marker) {
-    *new_marker = data.entries.back().id;
-  }
-
-  return 0;
-}
-
-
-int RGWCloneMetaLogCoroutine::state_store_mdlog_entries()
-{
-  list<cls_log_entry> dest_entries;
-
-  vector<rgw_mdlog_entry>::iterator iter;
-  for (iter = data.entries.begin(); iter != data.entries.end(); ++iter) {
-    rgw_mdlog_entry& entry = *iter;
-    ldpp_dout(sync_env->dpp, 20) << "entry: name=" << entry.name << dendl;
-
-    cls_log_entry dest_entry;
-    dest_entry.id = entry.id;
-    dest_entry.section = entry.section;
-    dest_entry.name = entry.name;
-    dest_entry.timestamp = utime_t(entry.timestamp);
-  
-    encode(entry.log_data, dest_entry.data);
-
-    dest_entries.push_back(dest_entry);
-
-    marker = entry.id;
-  }
-
-  RGWAioCompletionNotifier *cn = stack->create_completion_notifier();
-
-  int ret = mdlog->store_entries_in_shard(sync_env->dpp, dest_entries, shard_id, cn->completion());
-  if (ret < 0) {
-    cn->put();
-    ldpp_dout(sync_env->dpp, 10) << "failed to store md log entries shard_id=" << shard_id << " ret=" << ret << dendl;
-    return set_cr_error(ret);
-  }
-  return io_block(0);
-}
-
-int RGWCloneMetaLogCoroutine::state_store_mdlog_entries_complete()
-{
-  return set_cr_done();
-}
-
-void rgw_meta_sync_info::decode_json(JSONObj *obj)
-{
-  string s;
-  JSONDecoder::decode_json("status", s, obj);
-  if (s == "init") {
-    state = StateInit;
-  } else if (s == "building-full-sync-maps") {
-    state = StateBuildingFullSyncMaps;
-  } else if (s == "sync") {
-    state = StateSync;
-  }
-  JSONDecoder::decode_json("num_shards", num_shards, obj);
-  JSONDecoder::decode_json("period", period, obj);
-  JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
-}
-
-void rgw_meta_sync_info::dump(Formatter *f) const
-{
-  string s;
-  switch ((SyncState)state) {
-  case StateInit:
-    s = "init";
-    break;
-  case StateBuildingFullSyncMaps:
-    s = "building-full-sync-maps";
-    break;
-  case StateSync:
-    s = "sync";
-    break;
-  default:
-    s = "unknown";
-    break;
-  }
-  encode_json("status", s, f);
-  encode_json("num_shards", num_shards, f);
-  encode_json("period", period, f);
-  encode_json("realm_epoch", realm_epoch, f);
-}
-
-
-void rgw_meta_sync_marker::decode_json(JSONObj *obj)
-{
-  int s;
-  JSONDecoder::decode_json("state", s, obj);
-  state = s;
-  JSONDecoder::decode_json("marker", marker, obj);
-  JSONDecoder::decode_json("next_step_marker", next_step_marker, obj);
-  JSONDecoder::decode_json("total_entries", total_entries, obj);
-  JSONDecoder::decode_json("pos", pos, obj);
-  utime_t ut;
-  JSONDecoder::decode_json("timestamp", ut, obj);
-  timestamp = ut.to_real_time();
-  JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
-}
-
-void rgw_meta_sync_marker::dump(Formatter *f) const
-{
-  encode_json("state", (int)state, f);
-  encode_json("marker", marker, f);
-  encode_json("next_step_marker", next_step_marker, f);
-  encode_json("total_entries", total_entries, f);
-  encode_json("pos", pos, f);
-  encode_json("timestamp", utime_t(timestamp), f);
-  encode_json("realm_epoch", realm_epoch, f);
-}
-
-void rgw_meta_sync_status::decode_json(JSONObj *obj)
-{
-  JSONDecoder::decode_json("info", sync_info, obj);
-  JSONDecoder::decode_json("markers", sync_markers, obj);
-}
-
-void rgw_meta_sync_status::dump(Formatter *f) const {
-  encode_json("info", sync_info, f);
-  encode_json("markers", sync_markers, f);
-}
-
-void rgw_sync_error_info::dump(Formatter *f) const {
-  encode_json("source_zone", source_zone, f);
-  encode_json("error_code", error_code, f);
-  encode_json("message", message, f);
-}
-
diff --git a/src/rgw/store/rados/rgw_sync.h b/src/rgw/store/rados/rgw_sync.h
deleted file mode 100644 (file)
index 8c4e511..0000000
+++ /dev/null
@@ -1,549 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_SYNC_H
-#define CEPH_RGW_SYNC_H
-
-#include <atomic>
-
-#include "include/stringify.h"
-
-#include "rgw_coroutine.h"
-#include "rgw_http_client.h"
-#include "rgw_metadata.h"
-#include "rgw_meta_sync_status.h"
-#include "rgw_sal.h"
-#include "rgw_sal_rados.h"
-#include "rgw_sync_trace.h"
-#include "rgw_mdlog.h"
-
-#define ERROR_LOGGER_SHARDS 32
-#define RGW_SYNC_ERROR_LOG_SHARD_PREFIX "sync.error-log"
-
-struct rgw_mdlog_info {
-  uint32_t num_shards;
-  std::string period; //< period id of the master's oldest metadata log
-  epoch_t realm_epoch; //< realm epoch of oldest metadata log
-
-  rgw_mdlog_info() : num_shards(0), realm_epoch(0) {}
-
-  void decode_json(JSONObj *obj);
-};
-
-
-struct rgw_mdlog_entry {
-  std::string id;
-  std::string section;
-  std::string name;
-  ceph::real_time timestamp;
-  RGWMetadataLogData log_data;
-
-  void decode_json(JSONObj *obj);
-
-  bool convert_from(cls_log_entry& le) {
-    id = le.id;
-    section = le.section;
-    name = le.name;
-    timestamp = le.timestamp.to_real_time();
-    try {
-      auto iter = le.data.cbegin();
-      decode(log_data, iter);
-    } catch (buffer::error& err) {
-      return false;
-    }
-    return true;
-  }
-};
-
-struct rgw_mdlog_shard_data {
-  std::string marker;
-  bool truncated;
-  std::vector<rgw_mdlog_entry> entries;
-
-  void decode_json(JSONObj *obj);
-};
-
-class RGWAsyncRadosProcessor;
-class RGWMetaSyncStatusManager;
-class RGWMetaSyncCR;
-class RGWRESTConn;
-class RGWSyncTraceManager;
-
-class RGWSyncErrorLogger {
-  rgw::sal::RadosStore* store;
-
-  std::vector<std::string> oids;
-  int num_shards;
-
-  std::atomic<int64_t> counter = { 0 };
-public:
-  RGWSyncErrorLogger(rgw::sal::RadosStore* _store, const std::string &oid_prefix, int _num_shards);
-  RGWCoroutine *log_error_cr(const DoutPrefixProvider *dpp, const std::string& source_zone, const std::string& section, const std::string& name, uint32_t error_code, const std::string& message);
-
-  static std::string get_shard_oid(const std::string& oid_prefix, int shard_id);
-};
-
-struct rgw_sync_error_info {
-  std::string source_zone;
-  uint32_t error_code;
-  std::string message;
-
-  rgw_sync_error_info() : error_code(0) {}
-  rgw_sync_error_info(const std::string& _source_zone, uint32_t _error_code, const std::string& _message) : source_zone(_source_zone), error_code(_error_code), message(_message) {}
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(source_zone, bl);
-    encode(error_code, bl);
-    encode(message, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(source_zone, bl);
-    decode(error_code, bl);
-    decode(message, bl);
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-};
-WRITE_CLASS_ENCODER(rgw_sync_error_info)
-
-#define DEFAULT_BACKOFF_MAX 30
-
-class RGWSyncBackoff {
-  int cur_wait;
-  int max_secs;
-
-  void update_wait_time();
-public:
-  explicit RGWSyncBackoff(int _max_secs = DEFAULT_BACKOFF_MAX) : cur_wait(0), max_secs(_max_secs) {}
-
-  void backoff_sleep();
-  void reset() {
-    cur_wait = 0;
-  }
-
-  void backoff(RGWCoroutine *op);
-};
-
-class RGWBackoffControlCR : public RGWCoroutine
-{
-  RGWCoroutine *cr;
-  ceph::mutex lock;
-
-  RGWSyncBackoff backoff;
-  bool reset_backoff;
-
-  bool exit_on_error;
-
-protected:
-  bool *backoff_ptr() {
-    return &reset_backoff;
-  }
-
-  ceph::mutex& cr_lock() {
-    return lock;
-  }
-
-  RGWCoroutine *get_cr() {
-    return cr;
-  }
-
-public:
-  RGWBackoffControlCR(CephContext *_cct, bool _exit_on_error)
-    : RGWCoroutine(_cct),
-      cr(nullptr),
-      lock(ceph::make_mutex("RGWBackoffControlCR::lock:" + stringify(this))),
-      reset_backoff(false), exit_on_error(_exit_on_error) {
-  }
-
-  ~RGWBackoffControlCR() override {
-    if (cr) {
-      cr->put();
-    }
-  }
-
-  virtual RGWCoroutine *alloc_cr() = 0;
-  virtual RGWCoroutine *alloc_finisher_cr() { return NULL; }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-struct RGWMetaSyncEnv {
-  const DoutPrefixProvider *dpp;
-  CephContext *cct{nullptr};
-  rgw::sal::RadosStore* store{nullptr};
-  RGWRESTConn *conn{nullptr};
-  RGWAsyncRadosProcessor *async_rados{nullptr};
-  RGWHTTPManager *http_manager{nullptr};
-  RGWSyncErrorLogger *error_logger{nullptr};
-  RGWSyncTraceManager *sync_tracer{nullptr};
-
-  RGWMetaSyncEnv() {}
-
-  void init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _store, RGWRESTConn *_conn,
-            RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
-            RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer);
-
-  std::string shard_obj_name(int shard_id);
-  std::string status_oid();
-};
-
-class RGWRemoteMetaLog : public RGWCoroutinesManager {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* store;
-  RGWRESTConn *conn;
-  RGWAsyncRadosProcessor *async_rados;
-
-  RGWHTTPManager http_manager;
-  RGWMetaSyncStatusManager *status_manager;
-  RGWSyncErrorLogger *error_logger{nullptr};
-  RGWSyncTraceManager *sync_tracer{nullptr};
-
-  RGWMetaSyncCR *meta_sync_cr{nullptr};
-
-  RGWSyncBackoff backoff;
-
-  RGWMetaSyncEnv sync_env;
-
-  void init_sync_env(RGWMetaSyncEnv *env);
-  int store_sync_info(const DoutPrefixProvider *dpp, const rgw_meta_sync_info& sync_info);
-
-  std::atomic<bool> going_down = { false };
-
-  RGWSyncTraceNodeRef tn;
-
-public:
-  RGWRemoteMetaLog(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* _store,
-                   RGWAsyncRadosProcessor *async_rados,
-                   RGWMetaSyncStatusManager *_sm)
-    : RGWCoroutinesManager(_store->ctx(), _store->getRados()->get_cr_registry()),
-      dpp(dpp), store(_store), conn(NULL), async_rados(async_rados),
-      http_manager(store->ctx(), completion_mgr),
-      status_manager(_sm) {}
-
-  virtual ~RGWRemoteMetaLog() override;
-
-  int init();
-  void finish();
-
-  int read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info);
-  int read_master_log_shards_info(const DoutPrefixProvider *dpp, const std::string& master_period, std::map<int, RGWMetadataLogInfo> *shards_info);
-  int read_master_log_shards_next(const DoutPrefixProvider *dpp, const std::string& period, std::map<int, std::string> shard_markers, std::map<int, rgw_mdlog_shard_data> *result);
-  int read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status);
-  int init_sync_status(const DoutPrefixProvider *dpp);
-  int run_sync(const DoutPrefixProvider *dpp, optional_yield y);
-
-  void wakeup(int shard_id);
-
-  RGWMetaSyncEnv& get_sync_env() {
-    return sync_env;
-  }
-};
-
-class RGWMetaSyncStatusManager : public DoutPrefixProvider {
-  rgw::sal::RadosStore* store;
-  librados::IoCtx ioctx;
-
-  RGWRemoteMetaLog master_log;
-
-  std::map<int, rgw_raw_obj> shard_objs;
-
-  struct utime_shard {
-    real_time ts;
-    int shard_id;
-
-    utime_shard() : shard_id(-1) {}
-
-    bool operator<(const utime_shard& rhs) const {
-      if (ts == rhs.ts) {
-       return shard_id < rhs.shard_id;
-      }
-      return ts < rhs.ts;
-    }
-  };
-
-  ceph::shared_mutex ts_to_shard_lock = ceph::make_shared_mutex("ts_to_shard_lock");
-  std::map<utime_shard, int> ts_to_shard;
-  std::vector<std::string> clone_markers;
-
-public:
-  RGWMetaSyncStatusManager(rgw::sal::RadosStore* _store, RGWAsyncRadosProcessor *async_rados)
-    : store(_store), master_log(this, store, async_rados, this)
-  {}
-
-  virtual ~RGWMetaSyncStatusManager() override;
-
-  int init(const DoutPrefixProvider *dpp);
-
-  int read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status) {
-    return master_log.read_sync_status(dpp, sync_status);
-  }
-  int init_sync_status(const DoutPrefixProvider *dpp) { return master_log.init_sync_status(dpp); }
-  int read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info) {
-    return master_log.read_log_info(dpp, log_info);
-  }
-  int read_master_log_shards_info(const DoutPrefixProvider *dpp, const std::string& master_period, std::map<int, RGWMetadataLogInfo> *shards_info) {
-    return master_log.read_master_log_shards_info(dpp, master_period, shards_info);
-  }
-  int read_master_log_shards_next(const DoutPrefixProvider *dpp, const std::string& period, std::map<int, std::string> shard_markers, std::map<int, rgw_mdlog_shard_data> *result) {
-    return master_log.read_master_log_shards_next(dpp, period, shard_markers, result);
-  }
-
-  int run(const DoutPrefixProvider *dpp, optional_yield y) { return master_log.run_sync(dpp, y); }
-
-
-  // implements DoutPrefixProvider
-  CephContext *get_cct() const override { return store->ctx(); }
-  unsigned get_subsys() const override;
-  std::ostream& gen_prefix(std::ostream& out) const override;
-
-  void wakeup(int shard_id) { return master_log.wakeup(shard_id); }
-  void stop() {
-    master_log.finish();
-  }
-};
-
-class RGWOrderCallCR : public RGWCoroutine
-{
-public:
-  RGWOrderCallCR(CephContext *cct) : RGWCoroutine(cct) {}
-
-  virtual void call_cr(RGWCoroutine *_cr) = 0;
-};
-
-class RGWLastCallerWinsCR : public RGWOrderCallCR
-{
-  RGWCoroutine *cr{nullptr};
-
-public:
-  explicit RGWLastCallerWinsCR(CephContext *cct) : RGWOrderCallCR(cct) {}
-  ~RGWLastCallerWinsCR() {
-    if (cr) {
-      cr->put();
-    }
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-
-  void call_cr(RGWCoroutine *_cr) override {
-    if (cr) {
-      cr->put();
-    }
-    cr = _cr;
-  }
-};
-
-template <class T, class K>
-class RGWSyncShardMarkerTrack {
-  struct marker_entry {
-    uint64_t pos;
-    real_time timestamp;
-
-    marker_entry() : pos(0) {}
-    marker_entry(uint64_t _p, const real_time& _ts) : pos(_p), timestamp(_ts) {}
-  };
-  typename std::map<T, marker_entry> pending;
-
-  std::map<T, marker_entry> finish_markers;
-
-  int window_size;
-  int updates_since_flush;
-
-  RGWOrderCallCR *order_cr{nullptr};
-
-protected:
-  typename std::set<K> need_retry_set;
-
-  virtual RGWCoroutine *store_marker(const T& new_marker, uint64_t index_pos, const real_time& timestamp) = 0;
-  virtual RGWOrderCallCR *allocate_order_control_cr() = 0;
-  virtual void handle_finish(const T& marker) { }
-
-public:
-  RGWSyncShardMarkerTrack(int _window_size) : window_size(_window_size), updates_since_flush(0) {}
-  virtual ~RGWSyncShardMarkerTrack() {
-    if (order_cr) {
-      order_cr->put();
-    }
-  }
-
-  bool start(const T& pos, int index_pos, const real_time& timestamp) {
-    if (pending.find(pos) != pending.end()) {
-      return false;
-    }
-    pending[pos] = marker_entry(index_pos, timestamp);
-    return true;
-  }
-
-  void try_update_high_marker(const T& pos, int index_pos, const real_time& timestamp) {
-    finish_markers[pos] = marker_entry(index_pos, timestamp);
-  }
-
-  RGWCoroutine *finish(const T& pos) {
-    if (pending.empty()) {
-      /* can happen, due to a bug that ended up with multiple objects with the same name and version
-       * -- which can happen when versioning is enabled an the version is 'null'.
-       */
-      return NULL;
-    }
-
-    typename std::map<T, marker_entry>::iterator iter = pending.begin();
-
-    bool is_first = (pos == iter->first);
-
-    typename std::map<T, marker_entry>::iterator pos_iter = pending.find(pos);
-    if (pos_iter == pending.end()) {
-      /* see pending.empty() comment */
-      return NULL;
-    }
-
-    finish_markers[pos] = pos_iter->second;
-
-    pending.erase(pos);
-
-    handle_finish(pos);
-
-    updates_since_flush++;
-
-    if (is_first && (updates_since_flush >= window_size || pending.empty())) {
-      return flush();
-    }
-    return NULL;
-  }
-
-  RGWCoroutine *flush() {
-    if (finish_markers.empty()) {
-      return NULL;
-    }
-
-    typename std::map<T, marker_entry>::iterator i;
-
-    if (pending.empty()) {
-      i = finish_markers.end();
-    } else {
-      i = finish_markers.lower_bound(pending.begin()->first);
-    }
-    if (i == finish_markers.begin()) {
-      return NULL;
-    }
-    updates_since_flush = 0;
-
-    auto last = i;
-    --i;
-    const T& high_marker = i->first;
-    marker_entry& high_entry = i->second;
-    RGWCoroutine *cr = order(store_marker(high_marker, high_entry.pos, high_entry.timestamp));
-    finish_markers.erase(finish_markers.begin(), last);
-    return cr;
-  }
-
-  /*
-   * a key needs retry if it was processing when another marker that points
-   * to the same bucket shards arrives. Instead of processing it, we mark
-   * it as need_retry so that when we finish processing the original, we
-   * retry the processing on the same bucket shard, in case there are more
-   * entries to process. This closes a race that can happen.
-   */
-  bool need_retry(const K& key) {
-    return (need_retry_set.find(key) != need_retry_set.end());
-  }
-
-  void set_need_retry(const K& key) {
-    need_retry_set.insert(key);
-  }
-
-  void reset_need_retry(const K& key) {
-    need_retry_set.erase(key);
-  }
-
-  RGWCoroutine *order(RGWCoroutine *cr) {
-    /* either returns a new RGWLastWriteWinsCR, or update existing one, in which case it returns
-     * nothing and the existing one will call the cr
-     */
-    if (order_cr && order_cr->is_done()) {
-      order_cr->put();
-      order_cr = nullptr;
-    }
-    if (!order_cr) {
-      order_cr = allocate_order_control_cr();
-      order_cr->get();
-      order_cr->call_cr(cr);
-      return order_cr;
-    }
-    order_cr->call_cr(cr);
-    return nullptr; /* don't call it a second time */
-  }
-};
-
-class RGWMetaSyncShardMarkerTrack;
-
-class RGWMetaSyncSingleEntryCR : public RGWCoroutine {
-  RGWMetaSyncEnv *sync_env;
-
-  std::string raw_key;
-  std::string entry_marker;
-  RGWMDLogStatus op_status;
-
-  ssize_t pos;
-  std::string section;
-  std::string key;
-
-  int sync_status;
-
-  bufferlist md_bl;
-
-  RGWMetaSyncShardMarkerTrack *marker_tracker;
-
-  int tries;
-
-  bool error_injection;
-
-  RGWSyncTraceNodeRef tn;
-
-public:
-  RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env,
-                           const std::string& _raw_key, const std::string& _entry_marker,
-                           const RGWMDLogStatus& _op_status,
-                           RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent);
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-class RGWShardCollectCR : public RGWCoroutine {
-  int current_running = 0;
- protected:
-  int max_concurrent;
-  int status = 0;
-
-  // called with the result of each child. error codes can be ignored by
-  // returning 0. if handle_result() returns a negative value, it's
-  // treated as an error and stored in 'status'. the last such error is
-  // reported to the caller with set_cr_error()
-  virtual int handle_result(int r) = 0;
- public:
-  RGWShardCollectCR(CephContext *_cct, int _max_concurrent)
-    : RGWCoroutine(_cct), max_concurrent(_max_concurrent)
-  {}
-
-  virtual bool spawn_next() = 0;
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-// factory functions for meta sync coroutines needed in mdlog trimming
-
-RGWCoroutine* create_read_remote_mdlog_shard_info_cr(RGWMetaSyncEnv *env,
-                                                     const std::string& period,
-                                                     int shard_id,
-                                                     RGWMetadataLogInfo* info);
-
-RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env,
-                                                const std::string& period,
-                                                int shard_id,
-                                                const std::string& marker,
-                                                uint32_t max_entries,
-                                                rgw_mdlog_shard_data *result);
-
-#endif
diff --git a/src/rgw/store/rados/rgw_sync_counters.cc b/src/rgw/store/rados/rgw_sync_counters.cc
deleted file mode 100644 (file)
index 1d23d58..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "common/ceph_context.h"
-#include "rgw_sync_counters.h"
-
-namespace sync_counters {
-
-PerfCountersRef build(CephContext *cct, const std::string& name)
-{
-  PerfCountersBuilder b(cct, name, l_first, l_last);
-
-  // share these counters with ceph-mgr
-  b.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
-
-  b.add_u64_avg(l_fetch, "fetch_bytes", "Number of object bytes replicated");
-  b.add_u64_counter(l_fetch_not_modified, "fetch_not_modified", "Number of objects already replicated");
-  b.add_u64_counter(l_fetch_err, "fetch_errors", "Number of object replication errors");
-
-  b.add_time_avg(l_poll, "poll_latency", "Average latency of replication log requests");
-  b.add_u64_counter(l_poll_err, "poll_errors", "Number of replication log request errors");
-
-  auto logger = PerfCountersRef{ b.create_perf_counters(), cct };
-  cct->get_perfcounters_collection()->add(logger.get());
-  return logger;
-}
-
-} // namespace sync_counters
diff --git a/src/rgw/store/rados/rgw_sync_counters.h b/src/rgw/store/rados/rgw_sync_counters.h
deleted file mode 100644 (file)
index df3acc6..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-#include "common/perf_counters_collection.h"
-
-namespace sync_counters {
-
-enum {
-  l_first = 805000,
-
-  l_fetch,
-  l_fetch_not_modified,
-  l_fetch_err,
-
-  l_poll,
-  l_poll_err,
-
-  l_last,
-};
-
-PerfCountersRef build(CephContext *cct, const std::string& name);
-
-} // namespace sync_counters
diff --git a/src/rgw/store/rados/rgw_sync_error_repo.cc b/src/rgw/store/rados/rgw_sync_error_repo.cc
deleted file mode 100644 (file)
index 44305b6..0000000
+++ /dev/null
@@ -1,205 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2020 Red Hat, Inc
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- */
-
-#include "rgw_sync_error_repo.h"
-#include "rgw_coroutine.h"
-#include "rgw_sal.h"
-#include "services/svc_rados.h"
-#include "cls/cmpomap/client.h"
-
-namespace rgw::error_repo {
-
-// prefix for the binary encoding of keys. this particular value is not
-// valid as the first byte of a utf8 code point, so we use this to
-// differentiate the binary encoding from existing string keys for
-// backward-compatibility
-constexpr uint8_t binary_key_prefix = 0x80;
-
-struct key_type {
-  rgw_bucket_shard bs;
-  std::optional<uint64_t> gen;
-};
-
-void encode(const key_type& k, bufferlist& bl, uint64_t f=0)
-{
-  ENCODE_START(1, 1, bl);
-  encode(k.bs, bl);
-  encode(k.gen, bl);
-  ENCODE_FINISH(bl);
-}
-
-void decode(key_type& k, bufferlist::const_iterator& bl)
-{
-  DECODE_START(1, bl);
-  decode(k.bs, bl);
-  decode(k.gen, bl);
-  DECODE_FINISH(bl);
-}
-
-std::string encode_key(const rgw_bucket_shard& bs,
-                       std::optional<uint64_t> gen)
-{
-  using ceph::encode;
-  const auto key = key_type{bs, gen};
-  bufferlist bl;
-  encode(binary_key_prefix, bl);
-  encode(key, bl);
-  return bl.to_str();
-}
-
-int decode_key(std::string encoded,
-               rgw_bucket_shard& bs,
-               std::optional<uint64_t>& gen)
-{
-  using ceph::decode;
-  key_type key;
-  const auto bl = bufferlist::static_from_string(encoded);
-  auto p = bl.cbegin();
-  try {
-    uint8_t prefix;
-    decode(prefix, p);
-    if (prefix != binary_key_prefix) {
-      return -EINVAL;
-    }
-    decode(key, p);
-  } catch (const buffer::error&) {
-    return -EIO;
-  }
-  if (!p.end()) {
-    return -EIO; // buffer contained unexpected bytes
-  }
-  bs = std::move(key.bs);
-  gen = key.gen;
-  return 0;
-}
-
-ceph::real_time decode_value(const bufferlist& bl)
-{
-  uint64_t value;
-  try {
-    using ceph::decode;
-    decode(value, bl);
-  } catch (const buffer::error&) {
-    value = 0; // empty buffer = 0
-  }
-  return ceph::real_clock::zero() + ceph::timespan(value);
-}
-
-int write(librados::ObjectWriteOperation& op,
-          const std::string& key,
-          ceph::real_time timestamp)
-{
-  // overwrite the existing timestamp if value is greater
-  const uint64_t value = timestamp.time_since_epoch().count();
-  using namespace ::cls::cmpomap;
-  const bufferlist zero = u64_buffer(0); // compare against 0 for missing keys
-  return cmp_set_vals(op, Mode::U64, Op::GT, {{key, u64_buffer(value)}}, zero);
-}
-
-int remove(librados::ObjectWriteOperation& op,
-           const std::string& key,
-           ceph::real_time timestamp)
-{
-  // remove the omap key if value >= existing
-  const uint64_t value = timestamp.time_since_epoch().count();
-  using namespace ::cls::cmpomap;
-  return cmp_rm_keys(op, Mode::U64, Op::GTE, {{key, u64_buffer(value)}});
-}
-
-class RGWErrorRepoWriteCR : public RGWSimpleCoroutine {
-  RGWSI_RADOS::Obj obj;
-  std::string key;
-  ceph::real_time timestamp;
-
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
- public:
-  RGWErrorRepoWriteCR(RGWSI_RADOS* rados, const rgw_raw_obj& raw_obj,
-                      const std::string& key, ceph::real_time timestamp)
-    : RGWSimpleCoroutine(rados->ctx()),
-      obj(rados->obj(raw_obj)),
-      key(key), timestamp(timestamp)
-  {}
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    librados::ObjectWriteOperation op;
-    int r = write(op, key, timestamp);
-    if (r < 0) {
-      return r;
-    }
-    r = obj.open(dpp);
-    if (r < 0) {
-      return r;
-    }
-
-    cn = stack->create_completion_notifier();
-    return obj.aio_operate(cn->completion(), &op);
-  }
-
-  int request_complete() override {
-    return cn->completion()->get_return_value();
-  }
-};
-
-RGWCoroutine* write_cr(RGWSI_RADOS* rados,
-                       const rgw_raw_obj& obj,
-                       const std::string& key,
-                       ceph::real_time timestamp)
-{
-  return new RGWErrorRepoWriteCR(rados, obj, key, timestamp);
-}
-
-
-class RGWErrorRepoRemoveCR : public RGWSimpleCoroutine {
-  RGWSI_RADOS::Obj obj;
-  std::string key;
-  ceph::real_time timestamp;
-
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
- public:
-  RGWErrorRepoRemoveCR(RGWSI_RADOS* rados, const rgw_raw_obj& raw_obj,
-                       const std::string& key, ceph::real_time timestamp)
-    : RGWSimpleCoroutine(rados->ctx()),
-      obj(rados->obj(raw_obj)),
-      key(key), timestamp(timestamp)
-  {}
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    librados::ObjectWriteOperation op;
-    int r = remove(op, key, timestamp);
-    if (r < 0) {
-      return r;
-    }
-    r = obj.open(dpp);
-    if (r < 0) {
-      return r;
-    }
-
-    cn = stack->create_completion_notifier();
-    return obj.aio_operate(cn->completion(), &op);
-  }
-
-  int request_complete() override {
-    return cn->completion()->get_return_value();
-  }
-};
-
-RGWCoroutine* remove_cr(RGWSI_RADOS* rados,
-                        const rgw_raw_obj& obj,
-                        const std::string& key,
-                        ceph::real_time timestamp)
-{
-  return new RGWErrorRepoRemoveCR(rados, obj, key, timestamp);
-}
-
-} // namespace rgw::error_repo
diff --git a/src/rgw/store/rados/rgw_sync_error_repo.h b/src/rgw/store/rados/rgw_sync_error_repo.h
deleted file mode 100644 (file)
index 60525d2..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2020 Red Hat, Inc
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- */
-
-#pragma once
-
-#include <optional>
-#include "include/rados/librados_fwd.hpp"
-#include "include/buffer_fwd.h"
-#include "common/ceph_time.h"
-
-class RGWSI_RADOS;
-class RGWCoroutine;
-struct rgw_raw_obj;
-struct rgw_bucket_shard;
-
-namespace rgw::error_repo {
-
-// binary-encode a bucket/shard/gen and return it as a string
-std::string encode_key(const rgw_bucket_shard& bs,
-                       std::optional<uint64_t> gen);
-
-// try to decode a key. returns -EINVAL if not in binary format
-int decode_key(std::string encoded,
-               rgw_bucket_shard& bs,
-               std::optional<uint64_t>& gen);
-
-// decode a timestamp as a uint64_t for CMPXATTR_MODE_U64
-ceph::real_time decode_value(const ceph::bufferlist& bl);
-
-// write an omap key iff the given timestamp is newer
-int write(librados::ObjectWriteOperation& op,
-          const std::string& key,
-          ceph::real_time timestamp);
-RGWCoroutine* write_cr(RGWSI_RADOS* rados,
-                       const rgw_raw_obj& obj,
-                       const std::string& key,
-                       ceph::real_time timestamp);
-
-// remove an omap key iff there isn't a newer timestamp
-int remove(librados::ObjectWriteOperation& op,
-           const std::string& key,
-           ceph::real_time timestamp);
-RGWCoroutine* remove_cr(RGWSI_RADOS* rados,
-                        const rgw_raw_obj& obj,
-                        const std::string& key,
-                        ceph::real_time timestamp);
-
-} // namespace rgw::error_repo
diff --git a/src/rgw/store/rados/rgw_sync_module.cc b/src/rgw/store/rados/rgw_sync_module.cc
deleted file mode 100644 (file)
index 5a1e70b..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_common.h"
-#include "rgw_coroutine.h"
-#include "rgw_cr_rados.h"
-#include "rgw_sync_module.h"
-#include "rgw_data_sync.h"
-#include "rgw_bucket.h"
-
-#include "rgw_sync_module_log.h"
-#include "rgw_sync_module_es.h"
-#include "rgw_sync_module_aws.h"
-
-#include <boost/asio/yield.hpp>
-
-#define dout_subsys ceph_subsys_rgw
-
-RGWMetadataHandler *RGWSyncModuleInstance::alloc_bucket_meta_handler()
-{
-  return RGWBucketMetaHandlerAllocator::alloc();
-}
-
-RGWBucketInstanceMetadataHandlerBase* RGWSyncModuleInstance::alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver)
-{
-  return RGWBucketInstanceMetaHandlerAllocator::alloc(driver);
-}
-
-RGWStatRemoteObjCBCR::RGWStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
-                       rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCoroutine(_sc->cct),
-                                                          sc(_sc), sync_env(_sc->env),
-                                                          src_bucket(_src_bucket), key(_key) {
-}
-
-RGWCallStatRemoteObjCR::RGWCallStatRemoteObjCR(RGWDataSyncCtx *_sc,
-                                               rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCoroutine(_sc->cct),
-                                                                                                 sc(_sc), sync_env(_sc->env),
-                                                                                                 src_bucket(_src_bucket), key(_key) {
-}
-
-int RGWCallStatRemoteObjCR::operate(const DoutPrefixProvider *dpp) {
-  reenter(this) {
-    yield {
-      call(new RGWStatRemoteObjCR(sync_env->async_rados, sync_env->driver,
-                                  sc->source_zone,
-                                  src_bucket, key, &mtime, &size, &etag, &attrs, &headers));
-    }
-    if (retcode < 0) {
-      ldpp_dout(dpp, 10) << "RGWStatRemoteObjCR() returned " << retcode << dendl;
-      return set_cr_error(retcode);
-    }
-    ldpp_dout(dpp, 20) << "stat of remote obj: z=" << sc->source_zone
-                             << " b=" << src_bucket << " k=" << key
-                             << " size=" << size << " mtime=" << mtime << dendl;
-    yield {
-      RGWStatRemoteObjCBCR *cb = allocate_callback();
-      if (cb) {
-        cb->set_result(mtime, size, etag, std::move(attrs), std::move(headers));
-        call(cb);
-      }
-    }
-    if (retcode < 0) {
-      ldpp_dout(dpp, 10) << "RGWStatRemoteObjCR() callback returned " << retcode << dendl;
-      return set_cr_error(retcode);
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
-
-void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager)
-{
-  RGWSyncModuleRef default_module(std::make_shared<RGWDefaultSyncModule>());
-  modules_manager->register_module("rgw", default_module, true);
-
-  RGWSyncModuleRef archive_module(std::make_shared<RGWArchiveSyncModule>());
-  modules_manager->register_module("archive", archive_module);
-
-  RGWSyncModuleRef log_module(std::make_shared<RGWLogSyncModule>());
-  modules_manager->register_module("log", log_module);
-
-  RGWSyncModuleRef es_module(std::make_shared<RGWElasticSyncModule>());
-  modules_manager->register_module("elasticsearch", es_module);
-
-  RGWSyncModuleRef aws_module(std::make_shared<RGWAWSSyncModule>());
-  modules_manager->register_module("cloud", aws_module);
-}
diff --git a/src/rgw/store/rados/rgw_sync_module.h b/src/rgw/store/rados/rgw_sync_module.h
deleted file mode 100644 (file)
index 6d974c3..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_SYNC_MODULE_H
-#define CEPH_RGW_SYNC_MODULE_H
-
-#include "rgw_common.h"
-#include "rgw_coroutine.h"
-
-class RGWBucketInfo;
-class RGWRemoteDataLog;
-struct RGWDataSyncCtx;
-struct RGWDataSyncEnv;
-struct rgw_bucket_entry_owner;
-struct rgw_obj_key;
-struct rgw_bucket_sync_pipe;
-
-
-class RGWDataSyncModule {
-public:
-  RGWDataSyncModule() {}
-  virtual ~RGWDataSyncModule() {}
-
-  virtual void init(RGWDataSyncCtx *sync_env, uint64_t instance_id) {}
-
-  virtual RGWCoroutine *init_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) {
-    return nullptr;
-  }
-
-  virtual RGWCoroutine *start_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) {
-    return nullptr;
-  }
-  virtual RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace) = 0;
-  virtual RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& bucket_info, rgw_obj_key& key, real_time& mtime,
-                                      bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0;
-  virtual RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& bucket_info, rgw_obj_key& key, real_time& mtime,
-                                             rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0;
-};
-
-class RGWRESTMgr;
-class RGWMetadataHandler;
-class RGWBucketInstanceMetadataHandlerBase;
-
-class RGWSyncModuleInstance {
-public:
-  RGWSyncModuleInstance() {}
-  virtual ~RGWSyncModuleInstance() {}
-  virtual RGWDataSyncModule *get_data_handler() = 0;
-  virtual RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) {
-    return orig;
-  }
-  virtual bool supports_user_writes() {
-    return false;
-  }
-  virtual RGWMetadataHandler *alloc_bucket_meta_handler();
-  virtual RGWBucketInstanceMetadataHandlerBase *alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver);
-
-  // indication whether the sync module start with full sync (default behavior)
-  // incremental sync would follow anyway
-  virtual bool should_full_sync() const {
-      return true;
-  }
-};
-
-typedef std::shared_ptr<RGWSyncModuleInstance> RGWSyncModuleInstanceRef;
-
-class JSONFormattable;
-
-class RGWSyncModule {
-
-public:
-  RGWSyncModule() {}
-  virtual ~RGWSyncModule() {}
-
-  virtual bool supports_writes() {
-    return false;
-  }
-  virtual bool supports_data_export() = 0;
-  virtual int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) = 0;
-};
-
-typedef std::shared_ptr<RGWSyncModule> RGWSyncModuleRef;
-
-
-class RGWSyncModulesManager {
-  ceph::mutex lock = ceph::make_mutex("RGWSyncModulesManager");
-
-  std::map<std::string, RGWSyncModuleRef> modules;
-public:
-  RGWSyncModulesManager() = default;
-
-  void register_module(const std::string& name, RGWSyncModuleRef& module, bool is_default = false) {
-    std::lock_guard l{lock};
-    modules[name] = module;
-    if (is_default) {
-      modules[std::string()] = module;
-    }
-  }
-
-  bool get_module(const std::string& name, RGWSyncModuleRef *module) {
-    std::lock_guard l{lock};
-    auto iter = modules.find(name);
-    if (iter == modules.end()) {
-      return false;
-    }
-    if (module != nullptr) {
-      *module = iter->second;
-    }
-    return true;
-  }
-
-
-  bool supports_data_export(const std::string& name) {
-    RGWSyncModuleRef module;
-    if (!get_module(name, &module)) {
-      return false;
-    }
-
-    return module->supports_data_export();
-  }
-
-  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const std::string& name, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
-    RGWSyncModuleRef module;
-    if (!get_module(name, &module)) {
-      return -ENOENT;
-    }
-
-    return module.get()->create_instance(dpp, cct, config, instance);
-  }
-
-  std::vector<std::string> get_registered_module_names() const {
-    std::vector<std::string> names;
-    for (auto& i: modules) {
-      if (!i.first.empty()) {
-        names.push_back(i.first);
-      }
-    }
-    return names;
-  }
-};
-
-class RGWStatRemoteObjCBCR : public RGWCoroutine {
-protected:
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-
-  rgw_bucket src_bucket;
-  rgw_obj_key key;
-
-  ceph::real_time mtime;
-  uint64_t size = 0;
-  std::string etag;
-  std::map<std::string, bufferlist> attrs;
-  std::map<std::string, std::string> headers;
-public:
-  RGWStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
-                       rgw_bucket& _src_bucket, rgw_obj_key& _key);
-  ~RGWStatRemoteObjCBCR() override {}
-
-  void set_result(ceph::real_time& _mtime,
-                  uint64_t _size,
-                  const std::string& _etag,
-                  std::map<std::string, bufferlist>&& _attrs,
-                  std::map<std::string, std::string>&& _headers) {
-    mtime = _mtime;
-    size = _size;
-    etag = _etag;
-    attrs = std::move(_attrs);
-    headers = std::move(_headers);
-  }
-};
-
-class RGWCallStatRemoteObjCR : public RGWCoroutine {
-  ceph::real_time mtime;
-  uint64_t size{0};
-  std::string etag;
-  std::map<std::string, bufferlist> attrs;
-  std::map<std::string, std::string> headers;
-
-protected:
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-
-  rgw_bucket src_bucket;
-  rgw_obj_key key;
-
-public:
-  RGWCallStatRemoteObjCR(RGWDataSyncCtx *_sc,
-                     rgw_bucket& _src_bucket, rgw_obj_key& _key);
-
-  ~RGWCallStatRemoteObjCR() override {}
-
-  int operate(const DoutPrefixProvider *dpp) override;
-
-  virtual RGWStatRemoteObjCBCR *allocate_callback() {
-    return nullptr;
-  }
-};
-
-void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager);
-
-#endif
diff --git a/src/rgw/store/rados/rgw_sync_module_aws.cc b/src/rgw/store/rados/rgw_sync_module_aws.cc
deleted file mode 100644 (file)
index 6827f7f..0000000
+++ /dev/null
@@ -1,1836 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "common/errno.h"
-
-#include "rgw_common.h"
-#include "rgw_coroutine.h"
-#include "rgw_sync_module.h"
-#include "rgw_data_sync.h"
-#include "rgw_sync_module_aws.h"
-#include "rgw_cr_rados.h"
-#include "rgw_rest_conn.h"
-#include "rgw_cr_rest.h"
-#include "rgw_acl.h"
-#include "rgw_zone.h"
-
-#include "services/svc_zone.h"
-
-#include <boost/asio/yield.hpp>
-
-#define dout_subsys ceph_subsys_rgw
-
-
-#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
-
-using namespace std;
-
-static string default_target_path = "rgw-${zonegroup}-${sid}/${bucket}";
-
-static string get_key_oid(const rgw_obj_key& key)
-{
-  string oid = key.name;
-  if (!key.instance.empty() &&
-      !key.have_null_instance()) {
-    oid += string(":") + key.instance;
-  }
-  return oid;
-}
-
-static string obj_to_aws_path(rgw::sal::Object* obj)
-{
-  string path = obj->get_bucket()->get_name() + "/" + get_key_oid(obj->get_key());
-
-
-  return path;
-}
-
-/*
-
-   json configuration definition:
-
-    {
-      "connection": {
-        "access_key": <access>,
-        "secret": <secret>,
-        "endpoint": <endpoint>,
-        "host_style": <path | virtual>,
-      },
-      "acls": [ { "type": <id | email | uri>,
-                  "source_id": <source_id>,
-                  "dest_id": <dest_id> } ... ],  # optional, acl mappings, no mappings if does not exist
-      "target_path": <target_path>, # override default
-           
-
-      # anything below here is for non trivial configuration 
-      # can be used in conjuction with the above
-
-      "default": {
-        "connection": {
-            "access_key": <access>,
-            "secret": <secret>,
-            "endpoint": <endpoint>,
-            "host_style" <path | virtual>,
-        },
-        "acls": [    # list of source uids and how they map into destination uids in the dest objects acls
-        {
-          "type" : <id | email | uri>,   #  optional, default is id
-          "source_id": <id>,
-          "dest_id": <id>
-        } ... ]
-        "target_path": "rgwx-${sid}/${bucket}" # how a bucket name is mapped to destination path,
-                                               # final object name will be target_path + "/" + obj
-      },
-      "connections": [
-          {
-            "id": <id>,
-            "access_key": <access>,
-            "secret": <secret>,
-            "endpoint": <endpoint>,
-          } ... ],
-      "acl_profiles": [
-          {
-            "id": <id>, # acl mappings
-            "acls": [ {
-                "type": <id | email | uri>,
-                "source_id": <id>,
-                "dest_id": <id>
-              } ... ]
-          }
-      ],
-      "profiles": [
-          {
-           "source_bucket": <source>, # can specify either specific bucket name (foo), or prefix (foo*)
-           "target_path": <dest>,   # (override default)
-           "connection_id": <connection_id>, # optional, if empty references default connection
-           "acls_id": <mappings_id>, # optional, if empty references default mappings
-          } ... ],
-    }
-
-target path optional variables:
-
-(evaluated at init)
-sid: sync instance id, randomly generated by sync process on first sync initalization
-zonegroup: zonegroup name
-zonegroup_id: zonegroup name
-zone: zone name
-zone_id: zone name
-
-(evaluated when syncing)
-bucket: bucket name
-owner: bucket owner
-
-*/
-
-struct ACLMapping {
-  ACLGranteeTypeEnum type{ACL_TYPE_CANON_USER};
-  string source_id;
-  string dest_id;
-
-  ACLMapping() = default;
-
-  ACLMapping(ACLGranteeTypeEnum t,
-             const string& s,
-             const string& d) : type(t),
-  source_id(s),
-  dest_id(d) {}
-
-  void init(const JSONFormattable& config) {
-    const string& t = config["type"];
-
-    if (t == "email") {
-      type = ACL_TYPE_EMAIL_USER;
-    } else if (t == "uri") {
-      type = ACL_TYPE_GROUP;
-    } else {
-      type = ACL_TYPE_CANON_USER;
-    }
-
-    source_id = config["source_id"];
-    dest_id = config["dest_id"];
-  }
-
-  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
-    Formatter::ObjectSection os(jf, "acl_mapping");
-    string s;
-    switch (type) {
-      case ACL_TYPE_EMAIL_USER:
-        s = "email";
-        break;
-      case ACL_TYPE_GROUP:
-        s = "uri";
-        break;
-      default:
-        s = "id";
-        break;
-    }
-    encode_json("type", s, &jf);
-    encode_json("source_id", source_id, &jf);
-    encode_json("dest_id", dest_id, &jf);
-  }
-};
-
-struct ACLMappings {
-  map<string, ACLMapping> acl_mappings;
-
-  void init(const JSONFormattable& config) {
-    for (auto& c : config.array()) {
-      ACLMapping m;
-      m.init(c);
-
-      acl_mappings.emplace(std::make_pair(m.source_id, m));
-    }
-  }
-  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
-    Formatter::ArraySection os(jf, "acls");
-
-    for (auto& i : acl_mappings) {
-      i.second.dump_conf(cct, jf);
-    }
-  }
-};
-
-struct AWSSyncConfig_ACLProfiles {
-  map<string, std::shared_ptr<ACLMappings> > acl_profiles;
-
-  void init(const JSONFormattable& config) {
-    for (auto& c : config.array()) {
-      const string& profile_id = c["id"];
-
-      std::shared_ptr<ACLMappings> ap{new ACLMappings};
-      ap->init(c["acls"]);
-
-      acl_profiles[profile_id] = ap;
-    }
-  }
-  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
-    Formatter::ArraySection section(jf, "acl_profiles");
-
-    for (auto& p : acl_profiles) {
-      Formatter::ObjectSection section(jf, "profile");
-      encode_json("id", p.first, &jf);
-      p.second->dump_conf(cct, jf);
-    }
-  }
-
-  bool find(const string& profile_id, ACLMappings *result) const {
-    auto iter = acl_profiles.find(profile_id);
-    if (iter == acl_profiles.end()) {
-      return false;
-    }
-    *result = *iter->second;
-    return true;
-  }
-};
-
-struct AWSSyncConfig_Connection {
-  string connection_id;
-  string endpoint;
-  RGWAccessKey key;
-  std::optional<string> region;
-  HostStyle host_style{PathStyle};
-
-  bool has_endpoint{false};
-  bool has_key{false};
-  bool has_host_style{false};
-
-  void init(const JSONFormattable& config) {
-    has_endpoint = config.exists("endpoint");
-    has_key = config.exists("access_key") || config.exists("secret");
-    has_host_style = config.exists("host_style");
-
-    connection_id = config["id"];
-    endpoint = config["endpoint"];
-
-    key = RGWAccessKey(config["access_key"], config["secret"]);
-
-    if (config.exists("region")) {
-      region = config["region"];
-    } else {
-      region.reset();
-    }
-
-    string host_style_str = config["host_style"];
-    if (host_style_str != "virtual") {
-      host_style = PathStyle;
-    } else {
-      host_style = VirtualStyle;
-    }
-  }
-  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
-    Formatter::ObjectSection section(jf, "connection");
-    encode_json("id", connection_id, &jf);
-    encode_json("endpoint", endpoint, &jf);
-    string s = (host_style == PathStyle ? "path" : "virtual");
-    encode_json("region", region, &jf);
-    encode_json("host_style", s, &jf);
-
-    {
-      Formatter::ObjectSection os(jf, "key");
-      encode_json("access_key", key.id, &jf);
-      string secret = (key.key.empty() ? "" : "******");
-      encode_json("secret", secret, &jf);
-    }
-  }
-};
-
-static int conf_to_uint64(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, const string& key, uint64_t *pval)
-{
-  string sval;
-  if (config.find(key, &sval)) {
-    string err;
-    uint64_t val = strict_strtoll(sval.c_str(), 10, &err);
-    if (!err.empty()) {
-      ldpp_dout(dpp, 0) << "ERROR: could not parse configurable value for cloud sync module: " << key << ": " << sval << dendl;
-      return -EINVAL;
-    }
-    *pval = val;
-  }
-  return 0;
-}
-
-struct AWSSyncConfig_S3 {
-  uint64_t multipart_sync_threshold{DEFAULT_MULTIPART_SYNC_PART_SIZE};
-  uint64_t multipart_min_part_size{DEFAULT_MULTIPART_SYNC_PART_SIZE};
-
-  int init(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) {
-    int r = conf_to_uint64(dpp, cct, config, "multipart_sync_threshold", &multipart_sync_threshold);
-    if (r < 0) {
-      return r;
-    }
-
-    r = conf_to_uint64(dpp, cct, config, "multipart_min_part_size", &multipart_min_part_size);
-    if (r < 0) {
-      return r;
-    }
-#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024)
-    if (multipart_min_part_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
-      multipart_min_part_size = MULTIPART_MIN_POSSIBLE_PART_SIZE;
-    }
-    return 0;
-  }
-
-  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
-    Formatter::ObjectSection section(jf, "s3");
-    encode_json("multipart_sync_threshold", multipart_sync_threshold, &jf);
-    encode_json("multipart_min_part_size", multipart_min_part_size, &jf);
-  }
-};
-
-struct AWSSyncConfig_Profile {
-  string source_bucket;
-  bool prefix{false};
-  string target_path;
-  string connection_id;
-  string acls_id;
-
-  std::shared_ptr<AWSSyncConfig_Connection> conn_conf;
-  std::shared_ptr<ACLMappings> acls;
-
-  std::shared_ptr<RGWRESTConn> conn;
-
-  void init(const JSONFormattable& config) {
-    source_bucket = config["source_bucket"];
-
-    prefix = (!source_bucket.empty() && source_bucket[source_bucket.size() - 1] == '*');
-
-    if (prefix) {
-      source_bucket = source_bucket.substr(0, source_bucket.size() - 1);
-    }
-
-    target_path = config["target_path"];
-    connection_id = config["connection_id"];
-    acls_id = config["acls_id"];
-
-    if (config.exists("connection")) {
-      conn_conf = make_shared<AWSSyncConfig_Connection>();
-      conn_conf->init(config["connection"]);
-    }
-
-    if (config.exists("acls")) {
-      acls = make_shared<ACLMappings>();
-      acls->init(config["acls"]);
-    }
-  }
-
-  void dump_conf(CephContext *cct, JSONFormatter& jf, const char *section = "config") const {
-    Formatter::ObjectSection config(jf, section);
-    string sb{source_bucket};
-    if (prefix) {
-      sb.append("*");
-    }
-    encode_json("source_bucket", sb, &jf);
-    encode_json("target_path", target_path, &jf);
-    encode_json("connection_id", connection_id, &jf);
-    encode_json("acls_id", acls_id, &jf);
-    if (conn_conf.get()) {
-      conn_conf->dump_conf(cct, jf);
-    }
-    if (acls.get()) {
-      acls->dump_conf(cct, jf);
-    }
-  }
-};
-
-static void find_and_replace(const string& src, const string& find, const string& replace, string *dest)
-{
-  string s = src;
-
-  size_t pos = s.find(find);
-  while (pos != string::npos) {
-    size_t next_ofs = pos + find.size();
-    s = s.substr(0, pos) + replace + s.substr(next_ofs);
-    pos = s.find(find, next_ofs);
-  }
-
-  *dest = s;
-}
-
-static void apply_meta_param(const string& src, const string& param, const string& val, string *dest)
-{
-  string s = string("${") + param + "}";
-  find_and_replace(src, s, val, dest);
-}
-
-
-struct AWSSyncConfig {
-  AWSSyncConfig_Profile default_profile;
-  std::shared_ptr<AWSSyncConfig_Profile> root_profile;
-
-  map<string, std::shared_ptr<AWSSyncConfig_Connection> > connections;
-  AWSSyncConfig_ACLProfiles acl_profiles;
-
-  map<string, std::shared_ptr<AWSSyncConfig_Profile> > explicit_profiles;
-
-  AWSSyncConfig_S3 s3;
-
-  int init_profile(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& profile_conf, AWSSyncConfig_Profile& profile,
-                   bool connection_must_exist) {
-    if (!profile.connection_id.empty()) {
-      if (profile.conn_conf) {
-        ldpp_dout(dpp, 0) << "ERROR: ambiguous profile connection configuration, connection_id=" << profile.connection_id << dendl;
-        return -EINVAL;
-      }
-      if (connections.find(profile.connection_id) == connections.end()) {
-        ldpp_dout(dpp, 0) << "ERROR: profile configuration reference non-existent connection_id=" << profile.connection_id << dendl;
-        return -EINVAL;
-      }
-      profile.conn_conf = connections[profile.connection_id];
-    } else if (!profile.conn_conf) {
-      profile.connection_id = default_profile.connection_id;
-      auto i = connections.find(profile.connection_id);
-      if (i != connections.end()) {
-        profile.conn_conf = i->second;
-      }
-    }
-
-    if (connection_must_exist && !profile.conn_conf) {
-      ldpp_dout(dpp, 0) << "ERROR: remote connection undefined for sync profile" << dendl;
-      return -EINVAL;
-    }
-
-    if (profile.conn_conf && default_profile.conn_conf) {
-      if (!profile.conn_conf->has_endpoint) {
-        profile.conn_conf->endpoint = default_profile.conn_conf->endpoint;
-      }
-      if (!profile.conn_conf->has_host_style) {
-        profile.conn_conf->host_style = default_profile.conn_conf->host_style;
-      }
-      if (!profile.conn_conf->has_key) {
-        profile.conn_conf->key = default_profile.conn_conf->key;
-      }
-    }
-
-    ACLMappings acl_mappings;
-
-    if (!profile.acls_id.empty()) {
-      if (!acl_profiles.find(profile.acls_id, &acl_mappings)) {
-        ldpp_dout(dpp, 0) << "ERROR: profile configuration reference non-existent acls id=" << profile.acls_id << dendl;
-        return -EINVAL;
-      }
-      profile.acls = acl_profiles.acl_profiles[profile.acls_id];
-    } else if (!profile.acls) {
-      if (default_profile.acls) {
-        profile.acls = default_profile.acls;
-        profile.acls_id = default_profile.acls_id;
-      }
-    }
-
-    if (profile.target_path.empty()) {
-      profile.target_path = default_profile.target_path;
-    }
-    if (profile.target_path.empty()) {
-      profile.target_path = default_target_path;
-    }
-
-    return 0;
-  }
-
-  int init_target(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& profile_conf, std::shared_ptr<AWSSyncConfig_Profile> *ptarget) {
-    std::shared_ptr<AWSSyncConfig_Profile> profile;
-    profile.reset(new AWSSyncConfig_Profile);
-    profile->init(profile_conf);
-
-    int ret = init_profile(dpp, cct, profile_conf, *profile, true);
-    if (ret < 0) {
-      return ret;
-    }
-
-    auto& sb = profile->source_bucket;
-
-    if (explicit_profiles.find(sb) != explicit_profiles.end()) {
-      ldpp_dout(dpp, 0) << "WARNING: duplicate target configuration in sync module" << dendl;
-    }
-
-    explicit_profiles[sb] = profile;
-    if (ptarget) {
-      *ptarget = profile;
-    }
-    return 0;
-  }
-
-  bool do_find_profile(const rgw_bucket bucket, std::shared_ptr<AWSSyncConfig_Profile> *result) {
-    const string& name = bucket.name;
-    auto iter = explicit_profiles.upper_bound(name);
-    if (iter == explicit_profiles.begin()) {
-      return false;
-    }
-
-    --iter;
-    if (iter->first.size() > name.size()) {
-      return false;
-    }
-    if (name.compare(0, iter->first.size(), iter->first) != 0) {
-      return false;
-    }
-
-    std::shared_ptr<AWSSyncConfig_Profile>& target = iter->second;
-
-    if (!target->prefix &&
-        name.size() != iter->first.size()) {
-      return false;
-    }
-
-    *result = target;
-    return true;
-  }
-
-  void find_profile(const rgw_bucket bucket, std::shared_ptr<AWSSyncConfig_Profile> *result) {
-    if (!do_find_profile(bucket, result)) {
-      *result = root_profile;
-    }
-  }
-
-  AWSSyncConfig() {}
-
-  int init(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) {
-    auto& default_conf = config["default"];
-
-    if (config.exists("default")) {
-      default_profile.init(default_conf);
-      init_profile(dpp, cct, default_conf, default_profile, false);
-    }
-
-    for (auto& conn : config["connections"].array()) {
-      auto new_conn = conn;
-
-      std::shared_ptr<AWSSyncConfig_Connection> c{new AWSSyncConfig_Connection};
-      c->init(new_conn);
-
-      connections[new_conn["id"]] = c;
-    }
-
-    acl_profiles.init(config["acl_profiles"]);
-
-    int r = s3.init(dpp, cct, config["s3"]);
-    if (r < 0) {
-      return r;
-    }
-
-    auto new_root_conf = config;
-
-    r = init_target(dpp, cct, new_root_conf, &root_profile); /* the root profile config */
-    if (r < 0) {
-      return r;
-    }
-
-    for (auto target_conf : config["profiles"].array()) {
-      int r = init_target(dpp, cct, target_conf, nullptr);
-      if (r < 0) {
-        return r;
-      }
-    }
-
-    JSONFormatter jf(true);
-    dump_conf(cct, jf);
-    stringstream ss;
-    jf.flush(ss);
-
-    ldpp_dout(dpp, 5) << "sync module config (parsed representation):\n" << ss.str() << dendl;
-
-    return 0;
-  }
-
-  void expand_target(RGWDataSyncCtx *sc, const string& sid, const string& path, string *dest) {
-      apply_meta_param(path, "sid", sid, dest);
-
-      const RGWZoneGroup& zg = sc->env->svc->zone->get_zonegroup();
-      apply_meta_param(path, "zonegroup", zg.get_name(), dest);
-      apply_meta_param(path, "zonegroup_id", zg.get_id(), dest);
-
-      const RGWZone& zone = sc->env->svc->zone->get_zone();
-      apply_meta_param(path, "zone", zone.name, dest);
-      apply_meta_param(path, "zone_id", zone.id, dest);
-  }
-
-  void update_config(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, const string& sid) {
-    expand_target(sc, sid, root_profile->target_path, &root_profile->target_path);
-    ldpp_dout(dpp, 20) << "updated target: (root) -> " << root_profile->target_path << dendl;
-    for (auto& t : explicit_profiles) {
-      expand_target(sc, sid, t.second->target_path, &t.second->target_path);
-      ldpp_dout(dpp, 20) << "updated target: " << t.first << " -> " << t.second->target_path << dendl;
-    }
-  }
-
-  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
-    Formatter::ObjectSection config(jf, "config");
-    root_profile->dump_conf(cct, jf);
-    jf.open_array_section("connections");
-    for (auto c : connections) {
-      c.second->dump_conf(cct, jf);
-    }
-    jf.close_section();
-
-    acl_profiles.dump_conf(cct, jf);
-
-    { // targets
-      Formatter::ArraySection as(jf, "profiles");
-      for (auto& t : explicit_profiles) {
-        Formatter::ObjectSection target_section(jf, "profile");
-        encode_json("name", t.first, &jf);
-        t.second->dump_conf(cct, jf);
-      }
-    }
-  }
-
-  string get_path(std::shared_ptr<AWSSyncConfig_Profile>& profile,
-                  const RGWBucketInfo& bucket_info,
-                  const rgw_obj_key& obj) {
-    string bucket_str;
-    string owner;
-    if (!bucket_info.owner.tenant.empty()) {
-      bucket_str = owner = bucket_info.owner.tenant + "-";
-      owner += bucket_info.owner.id;
-    }
-    bucket_str += bucket_info.bucket.name;
-
-    const string& path = profile->target_path;
-
-    string new_path;
-    apply_meta_param(path, "bucket", bucket_str, &new_path);
-    apply_meta_param(new_path, "owner", owner, &new_path);
-
-    new_path += string("/") + get_key_oid(obj);
-
-    return new_path;
-  }
-
-  void get_target(std::shared_ptr<AWSSyncConfig_Profile>& profile,
-                  const RGWBucketInfo& bucket_info,
-                  const rgw_obj_key& obj,
-                  string *bucket_name,
-                  string *obj_name) {
-    string path = get_path(profile, bucket_info, obj);
-    size_t pos = path.find('/');
-
-    *bucket_name = path.substr(0, pos);
-    *obj_name = path.substr(pos + 1);
-  }
-
-  void init_conns(RGWDataSyncCtx *sc, const string& id) {
-    auto sync_env = sc->env;
-
-    update_config(sync_env->dpp, sc, id);
-
-    auto& root_conf = root_profile->conn_conf;
-
-    root_profile->conn.reset(new S3RESTConn(sc->cct,
-                                           id,
-                                           { root_conf->endpoint },
-                                           root_conf->key,
-                                          sync_env->svc->zone->get_zonegroup().get_id(),
-                                           root_conf->region,
-                                           root_conf->host_style));
-
-    for (auto i : explicit_profiles) {
-      auto& c = i.second;
-
-      c->conn.reset(new S3RESTConn(sc->cct,
-                                   id,
-                                   { c->conn_conf->endpoint },
-                                   c->conn_conf->key,
-                                  sync_env->svc->zone->get_zonegroup().get_id(),
-                                   c->conn_conf->region,
-                                   c->conn_conf->host_style));
-    }
-  }
-};
-
-
-struct AWSSyncInstanceEnv {
-  AWSSyncConfig conf;
-  string id;
-
-  explicit AWSSyncInstanceEnv(AWSSyncConfig& _conf) : conf(_conf) {}
-
-  void init(RGWDataSyncCtx *sc, uint64_t instance_id) {
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%llx", (unsigned long long)instance_id);
-    id = buf;
-
-    conf.init_conns(sc, id);
-  }
-
-  void get_profile(const rgw_bucket& bucket, std::shared_ptr<AWSSyncConfig_Profile> *ptarget) {
-    conf.find_profile(bucket, ptarget);
-    ceph_assert(ptarget);
-  }
-};
-
-static int do_decode_rest_obj(const DoutPrefixProvider *dpp, CephContext *cct, map<string, bufferlist>& attrs, map<string, string>& headers, rgw_rest_obj *info)
-{
-  for (auto header : headers) {
-    const string& val = header.second;
-    if (header.first == "RGWX_OBJECT_SIZE") {
-      info->content_len = atoi(val.c_str());
-    } else {
-      info->attrs[header.first] = val;
-    }
-  }
-
-  info->acls.set_ctx(cct);
-  auto aiter = attrs.find(RGW_ATTR_ACL);
-  if (aiter != attrs.end()) {
-    bufferlist& bl = aiter->second;
-    auto bliter = bl.cbegin();
-    try {
-      info->acls.decode(bliter);
-    } catch (buffer::error& err) {
-      ldpp_dout(dpp, 0) << "ERROR: failed to decode policy off attrs" << dendl;
-      return -EIO;
-    }
-  } else {
-    ldpp_dout(dpp, 0) << "WARNING: acl attrs not provided" << dendl;
-  }
-
-  return 0;
-}
-
-class RGWRESTStreamGetCRF : public RGWStreamReadHTTPResourceCRF
-{
-  RGWDataSyncCtx *sc;
-  RGWRESTConn *conn;
-  rgw::sal::Object* src_obj;
-  RGWRESTConn::get_obj_params req_params;
-
-  rgw_sync_aws_src_obj_properties src_properties;
-public:
-  RGWRESTStreamGetCRF(CephContext *_cct,
-                               RGWCoroutinesEnv *_env,
-                               RGWCoroutine *_caller,
-                               RGWDataSyncCtx *_sc,
-                               RGWRESTConn *_conn,
-                               rgw::sal::Object* _src_obj,
-                               const rgw_sync_aws_src_obj_properties& _src_properties) : RGWStreamReadHTTPResourceCRF(_cct, _env, _caller,
-                                                                                                                      _sc->env->http_manager, _src_obj->get_key()),
-                                                                                 sc(_sc), conn(_conn), src_obj(_src_obj),
-                                                                                 src_properties(_src_properties) {
-  }
-
-  int init(const DoutPrefixProvider *dpp) override {
-    /* init input connection */
-
-
-    req_params.get_op = true;
-    req_params.prepend_metadata = true;
-
-    req_params.unmod_ptr = &src_properties.mtime;
-    req_params.etag = src_properties.etag;
-    req_params.mod_zone_id = src_properties.zone_short_id;
-    req_params.mod_pg_ver = src_properties.pg_ver;
-
-    if (range.is_set) {
-      req_params.range_is_set = true;
-      req_params.range_start = range.ofs;
-      req_params.range_end = range.ofs + range.size - 1;
-    }
-
-    RGWRESTStreamRWRequest *in_req;
-    int ret = conn->get_obj(dpp, src_obj, req_params, false /* send */, &in_req);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): conn->get_obj() returned ret=" << ret << dendl;
-      return ret;
-    }
-
-    set_req(in_req);
-
-    return RGWStreamReadHTTPResourceCRF::init(dpp);
-  }
-
-  int decode_rest_obj(const DoutPrefixProvider *dpp, map<string, string>& headers, bufferlist& extra_data) override {
-    map<string, bufferlist> src_attrs;
-
-    ldpp_dout(dpp, 20) << __func__ << ":" << " headers=" << headers << " extra_data.length()=" << extra_data.length() << dendl;
-
-    if (extra_data.length() > 0) {
-      JSONParser jp;
-      if (!jp.parse(extra_data.c_str(), extra_data.length())) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to parse response extra data. len=" << extra_data.length() << " data=" << extra_data.c_str() << dendl;
-        return -EIO;
-      }
-
-      JSONDecoder::decode_json("attrs", src_attrs, &jp);
-    }
-    return do_decode_rest_obj(dpp, sc->cct, src_attrs, headers, &rest_obj);
-  }
-
-  bool need_extra_data() override {
-    return true;
-  }
-};
-
-static std::set<string> keep_headers = { "CONTENT_TYPE",
-                                         "CONTENT_ENCODING",
-                                         "CONTENT_DISPOSITION",
-                                         "CONTENT_LANGUAGE" };
-
-class RGWAWSStreamPutCRF : public RGWStreamWriteHTTPResourceCRF
-{
-  RGWDataSyncCtx *sc;
-  rgw_sync_aws_src_obj_properties src_properties;
-  std::shared_ptr<AWSSyncConfig_Profile> target;
-  rgw::sal::Object* dest_obj;
-  string etag;
-public:
-  RGWAWSStreamPutCRF(CephContext *_cct,
-                               RGWCoroutinesEnv *_env,
-                               RGWCoroutine *_caller,
-                               RGWDataSyncCtx *_sc,
-                               const rgw_sync_aws_src_obj_properties&  _src_properties,
-                               std::shared_ptr<AWSSyncConfig_Profile>& _target,
-                               rgw::sal::Object* _dest_obj) : RGWStreamWriteHTTPResourceCRF(_cct, _env, _caller, _sc->env->http_manager),
-                                                     sc(_sc), src_properties(_src_properties), target(_target), dest_obj(_dest_obj) {
-  }
-
-  int init() override {
-    /* init output connection */
-    RGWRESTStreamS3PutObj *out_req{nullptr};
-
-    if (multipart.is_multipart) {
-      char buf[32];
-      snprintf(buf, sizeof(buf), "%d", multipart.part_num);
-      rgw_http_param_pair params[] = { { "uploadId", multipart.upload_id.c_str() },
-                                       { "partNumber", buf },
-                                       { nullptr, nullptr } };
-      target->conn->put_obj_send_init(dest_obj, params, &out_req);
-    } else {
-      target->conn->put_obj_send_init(dest_obj, nullptr, &out_req);
-    }
-
-    set_req(out_req);
-
-    return RGWStreamWriteHTTPResourceCRF::init();
-  }
-
-  static bool keep_attr(const string& h) {
-    return (keep_headers.find(h) != keep_headers.end() ||
-            boost::algorithm::starts_with(h, "X_AMZ_"));
-  }
-
-  static void init_send_attrs(const DoutPrefixProvider *dpp,
-                              CephContext *cct,
-                              const rgw_rest_obj& rest_obj,
-                              const rgw_sync_aws_src_obj_properties& src_properties,
-                              const AWSSyncConfig_Profile *target,
-                              map<string, string> *attrs) {
-    auto& new_attrs = *attrs;
-
-    new_attrs.clear();
-
-    for (auto& hi : rest_obj.attrs) {
-      if (keep_attr(hi.first)) {
-        new_attrs.insert(hi);
-      }
-    }
-
-    auto acl = rest_obj.acls.get_acl();
-
-    map<int, vector<string> > access_map;
-
-    if (target->acls) {
-      for (auto& grant : acl.get_grant_map()) {
-        auto& orig_grantee = grant.first;
-        auto& perm = grant.second;
-
-        string grantee;
-
-        const auto& am = target->acls->acl_mappings;
-
-        auto iter = am.find(orig_grantee);
-        if (iter == am.end()) {
-          ldpp_dout(dpp, 20) << "acl_mappings: Could not find " << orig_grantee << " .. ignoring" << dendl;
-          continue;
-        }
-
-        grantee = iter->second.dest_id;
-
-        string type;
-
-        switch (iter->second.type) {
-          case ACL_TYPE_CANON_USER:
-            type = "id";
-            break;
-          case ACL_TYPE_EMAIL_USER:
-            type = "emailAddress";
-            break;
-          case ACL_TYPE_GROUP:
-            type = "uri";
-            break;
-          default:
-            continue;
-        }
-
-        string tv = type + "=" + grantee;
-
-        int flags = perm.get_permission().get_permissions();
-        if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) {
-          access_map[flags].push_back(tv);
-          continue;
-        }
-
-        for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) {
-          if (flags & i) {
-            access_map[i].push_back(tv);
-          }
-        }
-      }
-    }
-
-    for (auto aiter : access_map) {
-      int grant_type = aiter.first;
-
-      string header_str("x-amz-grant-");
-
-      switch (grant_type) {
-        case RGW_PERM_READ:
-          header_str.append("read");
-          break;
-        case RGW_PERM_WRITE:
-          header_str.append("write");
-          break;
-        case RGW_PERM_READ_ACP:
-          header_str.append("read-acp");
-          break;
-        case RGW_PERM_WRITE_ACP:
-          header_str.append("write-acp");
-          break;
-        case RGW_PERM_FULL_CONTROL:
-          header_str.append("full-control");
-          break;
-      }
-
-      string s;
-
-      for (auto viter : aiter.second) {
-        if (!s.empty()) {
-          s.append(", ");
-        }
-        s.append(viter);
-      }
-
-      ldpp_dout(dpp, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl;
-
-      new_attrs[header_str] = s;
-    }
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%llu", (long long)src_properties.versioned_epoch);
-    new_attrs["x-amz-meta-rgwx-versioned-epoch"] = buf;
-
-    utime_t ut(src_properties.mtime);
-    snprintf(buf, sizeof(buf), "%lld.%09lld",
-             (long long)ut.sec(),
-             (long long)ut.nsec());
-
-    new_attrs["x-amz-meta-rgwx-source-mtime"] = buf;
-    new_attrs["x-amz-meta-rgwx-source-etag"] = src_properties.etag;
-    new_attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name;
-    if (!rest_obj.key.instance.empty()) {
-      new_attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance;
-    }
-  }
-
-  void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) override {
-    RGWRESTStreamS3PutObj *r = static_cast<RGWRESTStreamS3PutObj *>(req);
-
-    map<string, string> new_attrs;
-    if (!multipart.is_multipart) {
-      init_send_attrs(dpp, sc->cct, rest_obj, src_properties, target.get(), &new_attrs);
-    }
-
-    r->set_send_length(rest_obj.content_len);
-
-    RGWAccessControlPolicy policy;
-
-    r->send_ready(dpp, target->conn->get_key(), new_attrs, policy);
-  }
-
-  void handle_headers(const map<string, string>& headers) {
-    for (auto h : headers) {
-      if (h.first == "ETAG") {
-        etag = h.second;
-      }
-    }
-  }
-
-  bool get_etag(string *petag) {
-    if (etag.empty()) {
-      return false;
-    }
-    *petag = etag;
-    return true;
-  }
-};
-
-
-class RGWAWSStreamObjToCloudPlainCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWRESTConn *source_conn;
-  std::shared_ptr<AWSSyncConfig_Profile> target;
-  rgw::sal::Object* src_obj;
-  rgw::sal::Object* dest_obj;
-
-  rgw_sync_aws_src_obj_properties src_properties;
-
-  std::shared_ptr<RGWStreamReadHTTPResourceCRF> in_crf;
-  std::shared_ptr<RGWStreamWriteHTTPResourceCRF> out_crf;
-
-public:
-  RGWAWSStreamObjToCloudPlainCR(RGWDataSyncCtx *_sc,
-                                RGWRESTConn *_source_conn,
-                                rgw::sal::Object* _src_obj,
-                                const rgw_sync_aws_src_obj_properties& _src_properties,
-                                std::shared_ptr<AWSSyncConfig_Profile> _target,
-                                rgw::sal::Object* _dest_obj) : RGWCoroutine(_sc->cct),
-                                                   sc(_sc),
-                                                   source_conn(_source_conn),
-                                                   target(_target),
-                                                   src_obj(_src_obj),
-                                                   dest_obj(_dest_obj),
-                                                   src_properties(_src_properties) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      /* init input */
-      in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sc,
-                                           source_conn, src_obj,
-                                           src_properties));
-
-      /* init output */
-      out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sc,
-                                           src_properties, target, dest_obj));
-
-      yield call(new RGWStreamSpliceCR(cct, sc->env->http_manager, in_crf, out_crf));
-      if (retcode < 0) {
-        return set_cr_error(retcode);
-      }
-
-      return set_cr_done();
-    }
-
-    return 0;
-  }
-};
-
-class RGWAWSStreamObjToCloudMultipartPartCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWRESTConn *source_conn;
-  std::shared_ptr<AWSSyncConfig_Profile> target;
-  rgw::sal::Object* src_obj;
-  rgw::sal::Object* dest_obj;
-
-  rgw_sync_aws_src_obj_properties src_properties;
-
-  string upload_id;
-
-  rgw_sync_aws_multipart_part_info part_info;
-
-  std::shared_ptr<RGWStreamReadHTTPResourceCRF> in_crf;
-  std::shared_ptr<RGWStreamWriteHTTPResourceCRF> out_crf;
-
-  string *petag;
-
-public:
-  RGWAWSStreamObjToCloudMultipartPartCR(RGWDataSyncCtx *_sc,
-                                RGWRESTConn *_source_conn,
-                                rgw::sal::Object* _src_obj,
-                                std::shared_ptr<AWSSyncConfig_Profile>& _target,
-                                rgw::sal::Object* _dest_obj,
-                                const rgw_sync_aws_src_obj_properties& _src_properties,
-                                const string& _upload_id,
-                                const rgw_sync_aws_multipart_part_info& _part_info,
-                                string *_petag) : RGWCoroutine(_sc->cct),
-                                                   sc(_sc),
-                                                   source_conn(_source_conn),
-                                                   target(_target),
-                                                   src_obj(_src_obj),
-                                                   dest_obj(_dest_obj),
-                                                   src_properties(_src_properties),
-                                                   upload_id(_upload_id),
-                                                   part_info(_part_info),
-                                                   petag(_petag) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      /* init input */
-      in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sc,
-                                           source_conn, src_obj,
-                                           src_properties));
-
-      in_crf->set_range(part_info.ofs, part_info.size);
-
-      /* init output */
-      out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sc,
-                                           src_properties, target, dest_obj));
-
-      out_crf->set_multipart(upload_id, part_info.part_num, part_info.size);
-
-      yield call(new RGWStreamSpliceCR(cct, sc->env->http_manager, in_crf, out_crf));
-      if (retcode < 0) {
-        return set_cr_error(retcode);
-      }
-
-      if (!(static_cast<RGWAWSStreamPutCRF *>(out_crf.get()))->get_etag(petag)) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to get etag from PUT request" << dendl;
-        return set_cr_error(-EIO);
-      }
-
-      return set_cr_done();
-    }
-
-    return 0;
-  }
-};
-
-class RGWAWSAbortMultipartCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWRESTConn *dest_conn;
-  rgw::sal::Object* dest_obj;
-
-  string upload_id;
-
-public:
-  RGWAWSAbortMultipartCR(RGWDataSyncCtx *_sc,
-                        RGWRESTConn *_dest_conn,
-                        rgw::sal::Object* _dest_obj,
-                        const string& _upload_id) : RGWCoroutine(_sc->cct),
-                                                   sc(_sc),
-                                                   dest_conn(_dest_conn),
-                                                   dest_obj(_dest_obj),
-                                                   upload_id(_upload_id) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-
-      yield {
-        rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
-        bufferlist bl;
-        call(new RGWDeleteRESTResourceCR(sc->cct, dest_conn, sc->env->http_manager,
-                                         obj_to_aws_path(dest_obj), params));
-      }
-
-      if (retcode < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (retcode=" << retcode << ")" << dendl;
-        return set_cr_error(retcode);
-      }
-
-      return set_cr_done();
-    }
-
-    return 0;
-  }
-};
-
-class RGWAWSInitMultipartCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWRESTConn *dest_conn;
-  rgw::sal::Object* dest_obj;
-
-  uint64_t obj_size;
-  map<string, string> attrs;
-
-  bufferlist out_bl;
-
-  string *upload_id;
-
-  struct InitMultipartResult {
-    string bucket;
-    string key;
-    string upload_id;
-
-    void decode_xml(XMLObj *obj) {
-      RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
-      RGWXMLDecoder::decode_xml("Key", key, obj);
-      RGWXMLDecoder::decode_xml("UploadId", upload_id, obj);
-    }
-  } result;
-
-public:
-  RGWAWSInitMultipartCR(RGWDataSyncCtx *_sc,
-                        RGWRESTConn *_dest_conn,
-                        rgw::sal::Object* _dest_obj,
-                        uint64_t _obj_size,
-                        const map<string, string>& _attrs,
-                        string *_upload_id) : RGWCoroutine(_sc->cct),
-                                                   sc(_sc),
-                                                   dest_conn(_dest_conn),
-                                                   dest_obj(_dest_obj),
-                                                   obj_size(_obj_size),
-                                                   attrs(_attrs),
-                                                   upload_id(_upload_id) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-
-      yield {
-        rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} };
-        bufferlist bl;
-        call(new RGWPostRawRESTResourceCR <bufferlist> (sc->cct, dest_conn, sc->env->http_manager,
-                                                 obj_to_aws_path(dest_obj), params, &attrs, bl, &out_bl));
-      }
-
-      if (retcode < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl;
-        return set_cr_error(retcode);
-      }
-      {
-        /*
-         * If one of the following fails we cannot abort upload, as we cannot
-         * extract the upload id. If one of these fail it's very likely that that's
-         * the least of our problem.
-         */
-        RGWXMLDecoder::XMLParser parser;
-        if (!parser.init()) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
-          return set_cr_error(-EIO);
-        }
-
-        if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
-          string str(out_bl.c_str(), out_bl.length());
-          ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
-          return set_cr_error(-EIO);
-        }
-
-        try {
-          RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true);
-        } catch (RGWXMLDecoder::err& err) {
-          string str(out_bl.c_str(), out_bl.length());
-          ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
-          return set_cr_error(-EIO);
-        }
-      }
-
-      ldpp_dout(dpp, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl;
-
-      *upload_id = result.upload_id;
-
-      return set_cr_done();
-    }
-
-    return 0;
-  }
-};
-
-class RGWAWSCompleteMultipartCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWRESTConn *dest_conn;
-  rgw::sal::Object* dest_obj;
-
-  bufferlist out_bl;
-
-  string upload_id;
-
-  struct CompleteMultipartReq {
-    map<int, rgw_sync_aws_multipart_part_info> parts;
-
-    explicit CompleteMultipartReq(const map<int, rgw_sync_aws_multipart_part_info>& _parts) : parts(_parts) {}
-
-    void dump_xml(Formatter *f) const {
-      for (auto p : parts) {
-        f->open_object_section("Part");
-        encode_xml("PartNumber", p.first, f);
-        encode_xml("ETag", p.second.etag, f);
-        f->close_section();
-      };
-    }
-  } req_enc;
-
-  struct CompleteMultipartResult {
-    string location;
-    string bucket;
-    string key;
-    string etag;
-
-    void decode_xml(XMLObj *obj) {
-      RGWXMLDecoder::decode_xml("Location", bucket, obj);
-      RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
-      RGWXMLDecoder::decode_xml("Key", key, obj);
-      RGWXMLDecoder::decode_xml("ETag", etag, obj);
-    }
-  } result;
-
-public:
-  RGWAWSCompleteMultipartCR(RGWDataSyncCtx *_sc,
-                        RGWRESTConn *_dest_conn,
-                        rgw::sal::Object* _dest_obj,
-                        string _upload_id,
-                        const map<int, rgw_sync_aws_multipart_part_info>& _parts) : RGWCoroutine(_sc->cct),
-                                                   sc(_sc),
-                                                   dest_conn(_dest_conn),
-                                                   dest_obj(_dest_obj),
-                                                   upload_id(_upload_id),
-                                                   req_enc(_parts) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-
-      yield {
-        rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
-        stringstream ss;
-        XMLFormatter formatter;
-
-        encode_xml("CompleteMultipartUpload", req_enc, &formatter);
-
-        formatter.flush(ss);
-
-        bufferlist bl;
-        bl.append(ss.str());
-
-        call(new RGWPostRawRESTResourceCR <bufferlist> (sc->cct, dest_conn, sc->env->http_manager,
-                                                 obj_to_aws_path(dest_obj), params, nullptr, bl, &out_bl));
-      }
-
-      if (retcode < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl;
-        return set_cr_error(retcode);
-      }
-      {
-        /*
-         * If one of the following fails we cannot abort upload, as we cannot
-         * extract the upload id. If one of these fail it's very likely that that's
-         * the least of our problem.
-         */
-        RGWXMLDecoder::XMLParser parser;
-        if (!parser.init()) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
-          return set_cr_error(-EIO);
-        }
-
-        if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
-          string str(out_bl.c_str(), out_bl.length());
-          ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
-          return set_cr_error(-EIO);
-        }
-
-        try {
-          RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true);
-        } catch (RGWXMLDecoder::err& err) {
-          string str(out_bl.c_str(), out_bl.length());
-          ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
-          return set_cr_error(-EIO);
-        }
-      }
-
-      ldpp_dout(dpp, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl;
-
-      return set_cr_done();
-    }
-
-    return 0;
-  }
-};
-
-
-class RGWAWSStreamAbortMultipartUploadCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWRESTConn *dest_conn;
-  rgw::sal::Object* dest_obj;
-  const rgw_raw_obj status_obj;
-
-  string upload_id;
-
-public:
-
-  RGWAWSStreamAbortMultipartUploadCR(RGWDataSyncCtx *_sc,
-                                RGWRESTConn *_dest_conn,
-                                rgw::sal::Object* _dest_obj,
-                                const rgw_raw_obj& _status_obj,
-                                const string& _upload_id) : RGWCoroutine(_sc->cct), sc(_sc),
-                                                            dest_conn(_dest_conn),
-                                                            dest_obj(_dest_obj),
-                                                            status_obj(_status_obj),
-                                                            upload_id(_upload_id) {}
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      yield call(new RGWAWSAbortMultipartCR(sc, dest_conn, dest_obj, upload_id));
-      if (retcode < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " retcode=" << retcode << dendl;
-        /* ignore error, best effort */
-      }
-      yield call(new RGWRadosRemoveCR(sc->env->driver, status_obj));
-      if (retcode < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to remove sync status obj obj=" << status_obj << " retcode=" << retcode << dendl;
-        /* ignore error, best effort */
-      }
-      return set_cr_done();
-    }
-
-    return 0;
-  }
-};
-
-class RGWAWSStreamObjToCloudMultipartCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  RGWDataSyncEnv *sync_env;
-  AWSSyncConfig& conf;
-  RGWRESTConn *source_conn;
-  std::shared_ptr<AWSSyncConfig_Profile> target;
-  rgw::sal::Object* src_obj;
-  rgw::sal::Object* dest_obj;
-
-  uint64_t obj_size;
-  string src_etag;
-  rgw_sync_aws_src_obj_properties src_properties;
-  rgw_rest_obj rest_obj;
-
-  rgw_sync_aws_multipart_upload_info status;
-
-  map<string, string> new_attrs;
-
-  rgw_sync_aws_multipart_part_info *pcur_part_info{nullptr};
-
-  int ret_err{0};
-
-  rgw_raw_obj status_obj;
-
-public:
-  RGWAWSStreamObjToCloudMultipartCR(RGWDataSyncCtx *_sc,
-                                   rgw_bucket_sync_pipe& _sync_pipe,
-                                AWSSyncConfig& _conf,
-                                RGWRESTConn *_source_conn,
-                                rgw::sal::Object* _src_obj,
-                                std::shared_ptr<AWSSyncConfig_Profile>& _target,
-                                rgw::sal::Object* _dest_obj,
-                                uint64_t _obj_size,
-                                const rgw_sync_aws_src_obj_properties& _src_properties,
-                                const rgw_rest_obj& _rest_obj) : RGWCoroutine(_sc->cct),
-                                                   sc(_sc),
-                                                   sync_env(_sc->env),
-                                                   conf(_conf),
-                                                   source_conn(_source_conn),
-                                                   target(_target),
-                                                   src_obj(_src_obj),
-                                                   dest_obj(_dest_obj),
-                                                   obj_size(_obj_size),
-                                                   src_properties(_src_properties),
-                                                   rest_obj(_rest_obj),
-                                                   status_obj(sync_env->svc->zone->get_zone_params().log_pool,
-                                                              RGWBucketPipeSyncStatusManager::obj_status_oid(_sync_pipe, sc->source_zone, src_obj)) {
-  }
-
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      yield call(new RGWSimpleRadosReadCR<rgw_sync_aws_multipart_upload_info>(dpp, sync_env->async_rados, sync_env->svc->sysobj,
-                                                                 status_obj, &status, false));
-
-      if (retcode < 0 && retcode != -ENOENT) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to read sync status of object " << src_obj << " retcode=" << retcode << dendl;
-        return retcode;
-      }
-
-      if (retcode >= 0) {
-        /* check here that mtime and size did not change */
-
-        if (status.src_properties.mtime != src_properties.mtime || status.obj_size != obj_size ||
-            status.src_properties.etag != src_properties.etag) {
-          yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
-          retcode = -ENOENT;
-        }
-      }
-
-      if (retcode == -ENOENT) {
-        RGWAWSStreamPutCRF::init_send_attrs(dpp, sc->cct, rest_obj, src_properties, target.get(), &new_attrs);
-
-        yield call(new RGWAWSInitMultipartCR(sc, target->conn.get(), dest_obj, status.obj_size, std::move(new_attrs), &status.upload_id));
-        if (retcode < 0) {
-          return set_cr_error(retcode);
-        }
-
-        status.obj_size = obj_size;
-        status.src_properties = src_properties;
-#define MULTIPART_MAX_PARTS 10000
-        uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS;
-        status.part_size = std::max(conf.s3.multipart_min_part_size, min_part_size);
-        status.num_parts = (obj_size + status.part_size - 1) / status.part_size;
-        status.cur_part = 1;
-      }
-
-      for (; (uint32_t)status.cur_part <= status.num_parts; ++status.cur_part) {
-        yield {
-          rgw_sync_aws_multipart_part_info& cur_part_info = status.parts[status.cur_part];
-          cur_part_info.part_num = status.cur_part;
-          cur_part_info.ofs = status.cur_ofs;
-          cur_part_info.size = std::min((uint64_t)status.part_size, status.obj_size - status.cur_ofs);
-
-          pcur_part_info = &cur_part_info;
-
-          status.cur_ofs += status.part_size;
-
-          call(new RGWAWSStreamObjToCloudMultipartPartCR(sc,
-                                                             source_conn, src_obj,
-                                                             target,
-                                                             dest_obj,
-                                                             status.src_properties,
-                                                             status.upload_id,
-                                                             cur_part_info,
-                                                             &cur_part_info.etag));
-        }
-
-        if (retcode < 0) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to sync obj=" << src_obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << status.cur_part << " (error: " << cpp_strerror(-retcode) << ")" << dendl;
-          ret_err = retcode;
-          yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
-          return set_cr_error(ret_err);
-        }
-
-        yield call(new RGWSimpleRadosWriteCR<rgw_sync_aws_multipart_upload_info>(dpp, sync_env->async_rados, sync_env->svc->sysobj, status_obj, status));
-        if (retcode < 0) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to store multipart upload state, retcode=" << retcode << dendl;
-          /* continue with upload anyway */
-        }
-        ldpp_dout(dpp, 20) << "sync of object=" << src_obj << " via multipart upload, finished sending part #" << status.cur_part << " etag=" << pcur_part_info->etag << dendl;
-      }
-
-      yield call(new RGWAWSCompleteMultipartCR(sc, target->conn.get(), dest_obj, status.upload_id, status.parts));
-      if (retcode < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload of obj=" << src_obj << " (error: " << cpp_strerror(-retcode) << ")" << dendl;
-        ret_err = retcode;
-        yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
-        return set_cr_error(ret_err);
-      }
-
-      /* remove status obj */
-      yield call(new RGWRadosRemoveCR(sync_env->driver, status_obj));
-      if (retcode < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload obj=" << src_obj << " upload_id=" << status.upload_id << " part number " << status.cur_part << " (" << cpp_strerror(-retcode) << ")" << dendl;
-        /* ignore error, best effort */
-      }
-      return set_cr_done();
-    }
-
-    return 0;
-  }
-};
-template <class T>
-int decode_attr(map<string, bufferlist>& attrs, const char *attr_name, T *result, T def_val)
-{
-  map<string, bufferlist>::iterator iter = attrs.find(attr_name);
-  if (iter == attrs.end()) {
-    *result = def_val;
-    return 0;
-  }
-  bufferlist& bl = iter->second;
-  if (bl.length() == 0) {
-    *result = def_val;
-    return 0;
-  }
-  auto bliter = bl.cbegin();
-  try {
-    decode(*result, bliter);
-  } catch (buffer::error& err) {
-    return -EIO;
-  }
-  return 0;
-}
-
-// maybe use Fetch Remote Obj instead?
-class RGWAWSHandleRemoteObjCBCR: public RGWStatRemoteObjCBCR {
-  rgw_bucket_sync_pipe sync_pipe;
-  AWSSyncInstanceEnv& instance;
-
-  uint64_t versioned_epoch{0};
-
-  RGWRESTConn *source_conn{nullptr};
-  std::shared_ptr<AWSSyncConfig_Profile> target;
-  bufferlist res;
-  unordered_map <string, bool> bucket_created;
-  string target_bucket_name;
-  string target_obj_name;
-  rgw_rest_obj rest_obj;
-  int ret{0};
-
-  uint32_t src_zone_short_id{0};
-  uint64_t src_pg_ver{0};
-
-  bufferlist out_bl;
-
-  struct CreateBucketResult {
-    string code;
-
-    void decode_xml(XMLObj *obj) {
-      RGWXMLDecoder::decode_xml("Code", code, obj);
-    }
-  } result;
-
-  rgw_bucket target_bucket;
-  std::unique_ptr<rgw::sal::RadosBucket> bucket;
-  std::unique_ptr<rgw::sal::RadosObject> src_obj;
-  std::unique_ptr<rgw::sal::RadosBucket> dest_bucket;
-  std::unique_ptr<rgw::sal::RadosObject> dest_obj;
-
-
-public:
-  RGWAWSHandleRemoteObjCBCR(RGWDataSyncCtx *_sc,
-                            rgw_bucket_sync_pipe& _sync_pipe,
-                            rgw_obj_key& _key,
-                            AWSSyncInstanceEnv& _instance,
-                            uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
-                                                         sync_pipe(_sync_pipe),
-                                                         instance(_instance), versioned_epoch(_versioned_epoch)
-  {}
-
-  ~RGWAWSHandleRemoteObjCBCR(){
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      ret = decode_attr(attrs, RGW_ATTR_PG_VER, &src_pg_ver, (uint64_t)0);
-      if (ret < 0) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl;
-      } else {
-        ret = decode_attr(attrs, RGW_ATTR_SOURCE_ZONE, &src_zone_short_id, (uint32_t)0);
-        if (ret < 0) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to decode source zone short_id attr, ignoring" << dendl;
-          src_pg_ver = 0; /* all or nothing */
-        }
-      }
-      ldpp_dout(dpp, 4) << "AWS: download begin: z=" << sc->source_zone
-                              << " b=" << src_bucket << " k=" << key << " size=" << size
-                              << " mtime=" << mtime << " etag=" << etag
-                              << " zone_short_id=" << src_zone_short_id << " pg_ver=" << src_pg_ver
-                              << dendl;
-
-      source_conn = sync_env->svc->zone->get_zone_conn(sc->source_zone);
-      if (!source_conn) {
-        ldpp_dout(dpp, 0) << "ERROR: cannot find http connection to zone " << sc->source_zone << dendl;
-        return set_cr_error(-EINVAL);
-      }
-
-      instance.get_profile(sync_pipe.info.source_bs.bucket, &target);
-      instance.conf.get_target(target, sync_pipe.dest_bucket_info, key, &target_bucket_name, &target_obj_name);
-
-      if (bucket_created.find(target_bucket_name) == bucket_created.end()){
-        yield {
-          ldpp_dout(dpp, 0) << "AWS: creating bucket " << target_bucket_name << dendl;
-          bufferlist bl;
-          call(new RGWPutRawRESTResourceCR <bufferlist> (sc->cct, target->conn.get(),
-                                                  sync_env->http_manager,
-                                                  target_bucket_name, nullptr, bl, &out_bl));
-        }
-        if (retcode < 0 ) {
-          RGWXMLDecoder::XMLParser parser;
-          if (!parser.init()) {
-            ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
-            return set_cr_error(retcode);
-          }
-
-          if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
-            string str(out_bl.c_str(), out_bl.length());
-            ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
-            return set_cr_error(retcode);
-          }
-
-          try {
-            RGWXMLDecoder::decode_xml("Error", result, &parser, true);
-          } catch (RGWXMLDecoder::err& err) {
-            string str(out_bl.c_str(), out_bl.length());
-            ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
-            return set_cr_error(retcode);
-          }
-
-          if (result.code != "BucketAlreadyOwnedByYou") {
-            return set_cr_error(retcode);
-          }
-        }
-
-        bucket_created[target_bucket_name] = true;
-      }
-
-      yield {
-        bucket.reset(new rgw::sal::RadosBucket(sync_env->driver, src_bucket));
-        src_obj.reset(new rgw::sal::RadosObject(sync_env->driver, key, bucket.get()));
-
-        /* init output */
-        target_bucket.name = target_bucket_name; /* this is only possible because we only use bucket name for
-                                                    uri resolution */
-        dest_bucket.reset(new rgw::sal::RadosBucket(sync_env->driver, target_bucket));
-        dest_obj.reset(new rgw::sal::RadosObject(sync_env->driver, rgw_obj_key(target_obj_name), dest_bucket.get()));
-
-        rgw_sync_aws_src_obj_properties src_properties;
-        src_properties.mtime = mtime;
-        src_properties.etag = etag;
-        src_properties.zone_short_id = src_zone_short_id;
-        src_properties.pg_ver = src_pg_ver;
-        src_properties.versioned_epoch = versioned_epoch;
-
-        if (size < instance.conf.s3.multipart_sync_threshold) {
-          call(new RGWAWSStreamObjToCloudPlainCR(sc, source_conn, src_obj.get(),
-                                                 src_properties,
-                                                 target,
-                                                 dest_obj.get()));
-        } else {
-          rgw_rest_obj rest_obj;
-          rest_obj.init(key);
-          if (do_decode_rest_obj(dpp, sc->cct, attrs, headers, &rest_obj)) {
-            ldpp_dout(dpp, 0) << "ERROR: failed to decode rest obj out of headers=" << headers << ", attrs=" << attrs << dendl;
-            return set_cr_error(-EINVAL);
-          }
-          call(new RGWAWSStreamObjToCloudMultipartCR(sc, sync_pipe, instance.conf, source_conn, src_obj.get(),
-                                                     target, dest_obj.get(), size, src_properties, rest_obj));
-        }
-      }
-      if (retcode < 0) {
-        return set_cr_error(retcode);
-      }
-
-      return set_cr_done();
-    }
-
-    return 0;
-  }
-};
-
-class RGWAWSHandleRemoteObjCR : public RGWCallStatRemoteObjCR {
-  rgw_bucket_sync_pipe sync_pipe;
-  AWSSyncInstanceEnv& instance;
-  uint64_t versioned_epoch;
-public:
-  RGWAWSHandleRemoteObjCR(RGWDataSyncCtx *_sc,
-                              rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
-                              AWSSyncInstanceEnv& _instance, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
-                                                          sync_pipe(_sync_pipe),
-                                                          instance(_instance), versioned_epoch(_versioned_epoch) {
-  }
-
-  ~RGWAWSHandleRemoteObjCR() {}
-
-  RGWStatRemoteObjCBCR *allocate_callback() override {
-    return new RGWAWSHandleRemoteObjCBCR(sc, sync_pipe, key, instance, versioned_epoch);
-  }
-};
-
-class RGWAWSRemoveRemoteObjCBCR : public RGWCoroutine {
-  RGWDataSyncCtx *sc;
-  std::shared_ptr<AWSSyncConfig_Profile> target;
-  rgw_bucket_sync_pipe sync_pipe;
-  rgw_obj_key key;
-  ceph::real_time mtime;
-  AWSSyncInstanceEnv& instance;
-  int ret{0};
-public:
-  RGWAWSRemoveRemoteObjCBCR(RGWDataSyncCtx *_sc,
-                          rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, const ceph::real_time& _mtime,
-                          AWSSyncInstanceEnv& _instance) : RGWCoroutine(_sc->cct), sc(_sc),
-                                                        sync_pipe(_sync_pipe), key(_key),
-                                                        mtime(_mtime), instance(_instance) {}
-  int operate(const DoutPrefixProvider *dpp) override {
-    reenter(this) {
-      ldpp_dout(dpp, 0) << ": remove remote obj: z=" << sc->source_zone
-                              << " b=" <<sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << dendl;
-      yield {
-        instance.get_profile(sync_pipe.info.source_bs.bucket, &target);
-        string path =  instance.conf.get_path(target, sync_pipe.dest_bucket_info, key);
-        ldpp_dout(dpp, 0) << "AWS: removing aws object at" << path << dendl;
-
-        call(new RGWDeleteRESTResourceCR(sc->cct, target->conn.get(),
-                                         sc->env->http_manager,
-                                         path, nullptr /* params */));
-      }
-      if (retcode < 0) {
-        return set_cr_error(retcode);
-      }
-      return set_cr_done();
-    }
-    return 0;
-  }
-
-};
-
-
-class RGWAWSDataSyncModule: public RGWDataSyncModule {
-  CephContext *cct;
-  AWSSyncInstanceEnv instance;
-public:
-  RGWAWSDataSyncModule(CephContext *_cct, AWSSyncConfig& _conf) :
-                  cct(_cct),
-                  instance(_conf) {
-  }
-
-  void init(RGWDataSyncCtx *sc, uint64_t instance_id) override {
-    instance.init(sc, instance_id);
-  }
-
-  ~RGWAWSDataSyncModule() {}
-
-  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
-                            std::optional<uint64_t> versioned_epoch,
-                            rgw_zone_set *zones_trace) override {
-    ldout(sc->cct, 0) << instance.id << ": sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
-    return new RGWAWSHandleRemoteObjCR(sc, sync_pipe, key, instance, versioned_epoch.value_or(0));
-  }
-  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch,
-                              rgw_zone_set *zones_trace) override {
-    ldout(sc->cct, 0) <<"rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
-    return new RGWAWSRemoveRemoteObjCBCR(sc, sync_pipe, key, mtime, instance);
-  }
-  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
-                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch,
-                                     rgw_zone_set *zones_trace) override {
-    ldout(sc->cct, 0) <<"AWS Not implemented: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
-                            << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
-    return NULL;
-  }
-};
-
-class RGWAWSSyncModuleInstance : public RGWSyncModuleInstance {
-  RGWAWSDataSyncModule data_handler;
-public:
-  RGWAWSSyncModuleInstance(CephContext *cct, AWSSyncConfig& _conf) : data_handler(cct, _conf) {}
-  RGWDataSyncModule *get_data_handler() override {
-    return &data_handler;
-  }
-};
-
-int RGWAWSSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config,  RGWSyncModuleInstanceRef *instance){
-  AWSSyncConfig conf;
-
-  int r = conf.init(dpp, cct, config);
-  if (r < 0) {
-    return r;
-  }
-
-  instance->reset(new RGWAWSSyncModuleInstance(cct, conf));
-  return 0;
-}
diff --git a/src/rgw/store/rados/rgw_sync_module_aws.h b/src/rgw/store/rados/rgw_sync_module_aws.h
deleted file mode 100644 (file)
index 48f0145..0000000
+++ /dev/null
@@ -1,111 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef RGW_SYNC_MODULE_AWS_H
-#define RGW_SYNC_MODULE_AWS_H
-
-#include "rgw_sync_module.h"
-
-struct rgw_sync_aws_multipart_part_info {
-  int part_num{0};
-  uint64_t ofs{0};
-  uint64_t size{0};
-  std::string etag;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(part_num, bl);
-    encode(ofs, bl);
-    encode(size, bl);
-    encode(etag, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(part_num, bl);
-    decode(ofs, bl);
-    decode(size, bl);
-    decode(etag, bl);
-    DECODE_FINISH(bl);
-  }
-};
-WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_part_info)
-
-struct rgw_sync_aws_src_obj_properties {
-  ceph::real_time mtime;
-  std::string etag;
-  uint32_t zone_short_id{0};
-  uint64_t pg_ver{0};
-  uint64_t versioned_epoch{0};
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(mtime, bl);
-    encode(etag, bl);
-    encode(zone_short_id, bl);
-    encode(pg_ver, bl);
-    encode(versioned_epoch, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(mtime, bl);
-    decode(etag, bl);
-    decode(zone_short_id, bl);
-    decode(pg_ver, bl);
-    decode(versioned_epoch, bl);
-    DECODE_FINISH(bl);
-  }
-};
-WRITE_CLASS_ENCODER(rgw_sync_aws_src_obj_properties)
-
-struct rgw_sync_aws_multipart_upload_info {
-  std::string upload_id;
-  uint64_t obj_size;
-  rgw_sync_aws_src_obj_properties src_properties;
-  uint32_t part_size{0};
-  uint32_t num_parts{0};
-
-  int cur_part{0};
-  uint64_t cur_ofs{0};
-
-  std::map<int, rgw_sync_aws_multipart_part_info> parts;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(upload_id, bl);
-    encode(obj_size, bl);
-    encode(src_properties, bl);
-    encode(part_size, bl);
-    encode(num_parts, bl);
-    encode(cur_part, bl);
-    encode(cur_ofs, bl);
-    encode(parts, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(upload_id, bl);
-    decode(obj_size, bl);
-    decode(src_properties, bl);
-    decode(part_size, bl);
-    decode(num_parts, bl);
-    decode(cur_part, bl);
-    decode(cur_ofs, bl);
-    decode(parts, bl);
-    DECODE_FINISH(bl);
-  }
-};
-WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_upload_info)
-
-class RGWAWSSyncModule : public RGWSyncModule {
- public:
-  RGWAWSSyncModule() {}
-  bool supports_data_export() override { return false;}
-  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
-};
-
-#endif /* RGW_SYNC_MODULE_AWS_H */
diff --git a/src/rgw/store/rados/rgw_sync_module_es.cc b/src/rgw/store/rados/rgw_sync_module_es.cc
deleted file mode 100644 (file)
index 3c294bb..0000000
+++ /dev/null
@@ -1,962 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_b64.h"
-#include "rgw_common.h"
-#include "rgw_coroutine.h"
-#include "rgw_sync_module.h"
-#include "rgw_data_sync.h"
-#include "rgw_sync_module_es.h"
-#include "rgw_sync_module_es_rest.h"
-#include "rgw_rest_conn.h"
-#include "rgw_cr_rest.h"
-#include "rgw_op.h"
-#include "rgw_es_query.h"
-#include "rgw_zone.h"
-
-#include "services/svc_zone.h"
-
-#include "include/str_list.h"
-
-#include <boost/asio/yield.hpp>
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-/*
- * allowlist utility. Config string is a list of entries, where an entry is either an item,
- * a prefix, or a suffix. An item would be the name of the entity that we'd look up,
- * a prefix would be a string ending with an asterisk, a suffix would be a string starting
- * with an asterisk. For example:
- *
- *      bucket1, bucket2, foo*, *bar
- */
-class ItemList {
-  bool approve_all{false};
-
-  set<string> entries;
-  set<string> prefixes;
-  set<string> suffixes;
-
-  void parse(const string& str) {
-    list<string> l;
-
-    get_str_list(str, ",", l);
-
-    for (auto& entry : l) {
-      entry = rgw_trim_whitespace(entry);
-      if (entry.empty()) {
-        continue;
-      }
-
-      if (entry == "*") {
-        approve_all = true;
-        return;
-      }
-
-      if (entry[0] == '*') {
-        suffixes.insert(entry.substr(1));
-        continue;
-      }
-
-      if (entry.back() == '*') {
-        prefixes.insert(entry.substr(0, entry.size() - 1));
-        continue;
-      }
-
-      entries.insert(entry);
-    }
-  }
-
-public:
-  ItemList() {}
-  void init(const string& str, bool def_val) {
-    if (str.empty()) {
-      approve_all = def_val;
-    } else {
-      parse(str);
-    }
-  }
-
-  bool exists(const string& entry) {
-    if (approve_all) {
-      return true;
-    }
-
-    if (entries.find(entry) != entries.end()) {
-      return true;
-    }
-
-    auto i = prefixes.upper_bound(entry);
-    if (i != prefixes.begin()) {
-      --i;
-      if (boost::algorithm::starts_with(entry, *i)) {
-        return true;
-      }
-    }
-
-    for (i = suffixes.begin(); i != suffixes.end(); ++i) {
-      if (boost::algorithm::ends_with(entry, *i)) {
-        return true;
-      }
-    }
-
-    return false;
-  }
-};
-
-#define ES_NUM_SHARDS_MIN 5
-
-#define ES_NUM_SHARDS_DEFAULT 16
-#define ES_NUM_REPLICAS_DEFAULT 1
-
-using ESVersion = std::pair<int,int>;
-static constexpr ESVersion ES_V5{5,0};
-static constexpr ESVersion ES_V7{7,0};
-
-struct ESInfo {
-  std::string name;
-  std::string cluster_name;
-  std::string cluster_uuid;
-  ESVersion version;
-
-  void decode_json(JSONObj *obj);
-
-  std::string get_version_str(){
-    return std::to_string(version.first) + "." + std::to_string(version.second);
-  }
-};
-
-// simple wrapper structure to wrap the es version nested type
-struct es_version_decoder {
-  ESVersion version;
-
-  int parse_version(const std::string& s) {
-    int major, minor;
-    int ret = sscanf(s.c_str(), "%d.%d", &major, &minor);
-    if (ret < 0) {
-      return ret;
-    }
-    version = std::make_pair(major,minor);
-    return 0;
-  }
-
-  void decode_json(JSONObj *obj) {
-    std::string s;
-    JSONDecoder::decode_json("number",s,obj);
-    if (parse_version(s) < 0)
-      throw JSONDecoder::err("Failed to parse ElasticVersion");
-  }
-};
-
-
-void ESInfo::decode_json(JSONObj *obj)
-{
-  JSONDecoder::decode_json("name", name, obj);
-  JSONDecoder::decode_json("cluster_name", cluster_name, obj);
-  JSONDecoder::decode_json("cluster_uuid", cluster_uuid, obj);
-  es_version_decoder esv;
-  JSONDecoder::decode_json("version", esv, obj);
-  version = std::move(esv.version);
-}
-
-struct ElasticConfig {
-  uint64_t sync_instance{0};
-  string id;
-  string index_path;
-  std::unique_ptr<RGWRESTConn> conn;
-  bool explicit_custom_meta{true};
-  string override_index_path;
-  ItemList index_buckets;
-  ItemList allow_owners;
-  uint32_t num_shards{0};
-  uint32_t num_replicas{0};
-  std::map <string,string> default_headers = {{ "Content-Type", "application/json" }};
-  ESInfo es_info;
-
-  void init(CephContext *cct, const JSONFormattable& config) {
-    string elastic_endpoint = config["endpoint"];
-    id = string("elastic:") + elastic_endpoint;
-    conn.reset(new RGWRESTConn(cct, (rgw::sal::Driver*)nullptr, id, { elastic_endpoint }, nullopt /* region */ ));
-    explicit_custom_meta = config["explicit_custom_meta"](true);
-    index_buckets.init(config["index_buckets_list"], true); /* approve all buckets by default */
-    allow_owners.init(config["approved_owners_list"], true); /* approve all bucket owners by default */
-    override_index_path = config["override_index_path"];
-    num_shards = config["num_shards"](ES_NUM_SHARDS_DEFAULT);
-    if (num_shards < ES_NUM_SHARDS_MIN) {
-      num_shards = ES_NUM_SHARDS_MIN;
-    }
-    num_replicas = config["num_replicas"](ES_NUM_REPLICAS_DEFAULT);
-    if (string user = config["username"], pw = config["password"];
-        !user.empty() && !pw.empty()) {
-      auto auth_string = user + ":" + pw;
-      default_headers.emplace("AUTHORIZATION", "Basic " + rgw::to_base64(auth_string));
-    }
-
-  }
-
-  void init_instance(const RGWRealm& realm, uint64_t instance_id) {
-    sync_instance = instance_id;
-
-    if (!override_index_path.empty()) {
-      index_path = override_index_path;
-      return;
-    }
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "-%08x", (uint32_t)(sync_instance & 0xFFFFFFFF));
-
-    index_path = "/rgw-" + realm.get_name() + buf;
-  }
-
-  string get_index_path() {
-    return index_path;
-  }
-
-  map<string, string>& get_request_headers() {
-    return default_headers;
-  }
-
-  string get_obj_path(const RGWBucketInfo& bucket_info, const rgw_obj_key& key) {
-    if (es_info.version >= ES_V7) {
-      return index_path+ "/_doc/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? "null" : key.instance));
-;
-    } else {
-      return index_path +  "/object/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? "null" : key.instance));
-    }
-  }
-
-  bool should_handle_operation(RGWBucketInfo& bucket_info) {
-    return index_buckets.exists(bucket_info.bucket.name) &&
-           allow_owners.exists(bucket_info.owner.to_str());
-  }
-};
-
-using ElasticConfigRef = std::shared_ptr<ElasticConfig>;
-
-static const char *es_type_to_str(const ESType& t) {
-  switch (t) {
-  case ESType::String: return "string";
-  case ESType::Text: return "text";
-  case ESType::Keyword: return "keyword";
-  case ESType::Long: return "long";
-  case ESType::Integer: return "integer";
-  case ESType::Short: return "short";
-  case ESType::Byte: return "byte";
-  case ESType::Double: return "double";
-  case ESType::Float: return "float";
-  case ESType::Half_Float: return "half_float";
-  case ESType::Scaled_Float: return "scaled_float";
-  case ESType::Date: return "date";
-  case ESType::Boolean: return "boolean";
-  case ESType::Integer_Range: return "integer_range";
-  case ESType::Float_Range: return "float_range";
-  case ESType::Double_Range: return "date_range";
-  case ESType::Date_Range: return "date_range";
-  case ESType::Geo_Point: return "geo_point";
-  case ESType::Ip: return "ip";
-  default:
-    return "<unknown>";
-  }
-}
-
// Field-type descriptor emitted into an ES 2.x index mapping.
struct es_type_v2 {
  ESType estype;
  const char *format{nullptr};   // optional "format" attribute (e.g. date formats)
  std::optional<bool> analyzed;  // tri-state: unset / analyzed / not analyzed

  es_type_v2(ESType et) : estype(et) {}

  void dump(Formatter *f) const {
    const char *type_str = es_type_to_str(estype);
    encode_json("type", type_str, f);
    if (format) {
      encode_json("format", format, f);
    }

    auto is_analyzed = analyzed;

    // string fields default to not_analyzed unless explicitly requested
    if (estype == ESType::String &&
        !is_analyzed) {
      is_analyzed = false;
    }

    // NOTE: this tests optional::has_value(), not the contained bool, so
    // analyzed=false still emits an "index": "not_analyzed" attribute
    if (is_analyzed) {
      encode_json("index", (is_analyzed.value() ? "analyzed" : "not_analyzed"), f);
    }
  }
};
-
// Field-type descriptor emitted into an ES 5.x+ index mapping, where the
// deprecated "string" type is replaced by "text"/"keyword".
struct es_type_v5 {
  ESType estype;
  const char *format{nullptr};   // optional "format" attribute
  std::optional<bool> analyzed;  // only consulted for String fields
  std::optional<bool> index;     // optional explicit "index" attribute

  es_type_v5(ESType et) : estype(et) {}

  void dump(Formatter *f) const {
    ESType new_estype;
    if (estype != ESType::String) {
      new_estype = estype;
    } else {
      // legacy String maps to Text when analyzed, Keyword otherwise
      bool is_analyzed = analyzed.value_or(false);
      new_estype = (is_analyzed ? ESType::Text : ESType::Keyword);
      /* index = true; ... Not setting index=true, because that's the default,
       * and dumping a boolean value *might* be a problem when backporting this
       * because value might get quoted
       */
    }

    const char *type_str = es_type_to_str(new_estype);
    encode_json("type", type_str, f);
    if (format) {
      encode_json("format", format, f);
    }
    if (index) {
      encode_json("index", index.value(), f);
    }
  }
};
-
// Fluent wrapper over a version-specific type descriptor (es_type_v2 /
// es_type_v5) allowing chained attribute assignment, e.g.
// est(ESType::Date).set_format(...).
template <class T>
struct es_type : public T {
  es_type(T t) : T(t) {}

  es_type& set_format(const char *f) {
    this->format = f;
    return *this;
  }

  es_type& set_analyzed(bool a) {
    this->analyzed = a;
    return *this;
  }
};
-
// Generates the "mappings" section of the index-creation request.
// T is es_type_v2 or es_type_v5, selecting per-version field descriptors.
template <class T>
struct es_index_mappings {
  ESVersion es_version;
  ESType string_type {ESType::String};  // remapped to text/keyword by es_type_v5

  es_index_mappings(ESVersion esv):es_version(esv) {
  }

  // wrap a raw ESType in the version-specific fluent descriptor
  es_type<T> est(ESType t) const {
    return es_type<T>(t);
  }

  // emit one nested-type section for a custom-metadata family
  // ("custom-string" / "custom-int" / "custom-date"); each entry is a
  // {name, value} pair, with the value typed per `type`/`format`
  void dump_custom(const char *section, ESType type, const char *format, Formatter *f) const {
    f->open_object_section(section);
    ::encode_json("type", "nested", f);
    f->open_object_section("properties");
    encode_json("name", est(string_type), f);
    encode_json("value", est(type).set_format(format), f);
    f->close_section(); // properties
    f->close_section(); // <section>
  }

  void dump(Formatter *f) const {
    // older servers (as gated by this version comparison) expect the
    // mapping wrapped in a type section named "object"
    if (es_version <= ES_V7)
      f->open_object_section("object");
    f->open_object_section("properties");
    encode_json("bucket", est(string_type), f);
    encode_json("name", est(string_type), f);
    encode_json("instance", est(string_type), f);
    encode_json("versioned_epoch", est(ESType::Long), f);
    f->open_object_section("meta");
    f->open_object_section("properties");
    encode_json("cache_control", est(string_type), f);
    encode_json("content_disposition", est(string_type), f);
    encode_json("content_encoding", est(string_type), f);
    encode_json("content_language", est(string_type), f);
    encode_json("content_type", est(string_type), f);
    encode_json("storage_class", est(string_type), f);
    encode_json("etag", est(string_type), f);
    encode_json("expires", est(string_type), f);
    encode_json("mtime", est(ESType::Date)
                         .set_format("strict_date_optional_time||epoch_millis"), f);
    encode_json("size", est(ESType::Long), f);
    dump_custom("custom-string", string_type, nullptr, f);
    dump_custom("custom-int", ESType::Long, nullptr, f);
    dump_custom("custom-date", ESType::Date, "strict_date_optional_time||epoch_millis", f);
    f->close_section(); // properties
    f->close_section(); // meta
    f->close_section(); // properties

    if (es_version <= ES_V7)
    f->close_section(); // object
  }
};
-
// Generates the "settings" section (replica/shard counts) of the
// index-creation request.
struct es_index_settings {
  uint32_t num_replicas;
  uint32_t num_shards;

  es_index_settings(uint32_t _replicas, uint32_t _shards) : num_replicas(_replicas), num_shards(_shards) {}

  void dump(Formatter *f) const {
    encode_json("number_of_replicas", num_replicas, f);
    encode_json("number_of_shards", num_shards, f);
  }
};
-
-struct es_index_config_base {
-  virtual ~es_index_config_base() {}
-  virtual void dump(Formatter *f) const = 0;
-};
-
-template <class T>
-struct es_index_config : public es_index_config_base {
-  es_index_settings settings;
-  es_index_mappings<T> mappings;
-
-  es_index_config(es_index_settings& _s, ESVersion esv) : settings(_s), mappings(esv) {
-  }
-
-  void dump(Formatter *f) const {
-    encode_json("settings", settings, f);
-    encode_json("mappings", mappings, f);
-  }
-};
-
-static bool is_sys_attr(const std::string& attr_name){
-  static constexpr std::initializer_list<const char*> rgw_sys_attrs =
-                                                         {RGW_ATTR_PG_VER,
-                                                          RGW_ATTR_SOURCE_ZONE,
-                                                          RGW_ATTR_ID_TAG,
-                                                          RGW_ATTR_TEMPURL_KEY1,
-                                                          RGW_ATTR_TEMPURL_KEY2,
-                                                          RGW_ATTR_UNIX1,
-                                                          RGW_ATTR_UNIX_KEY1
-  };
-
-  return std::find(rgw_sys_attrs.begin(), rgw_sys_attrs.end(), attr_name) != rgw_sys_attrs.end();
-}
-
-static size_t attr_len(const bufferlist& val)
-{
-  size_t len = val.length();
-  if (len && val[len - 1] == '\0') {
-    --len;
-  }
-
-  return len;
-}
-
-struct es_obj_metadata {
-  const DoutPrefixProvider *dpp;
-  CephContext *cct;
-  ElasticConfigRef es_conf;
-  RGWBucketInfo bucket_info;
-  rgw_obj_key key;
-  ceph::real_time mtime;
-  uint64_t size;
-  map<string, bufferlist> attrs;
-  uint64_t versioned_epoch;
-
-  es_obj_metadata(CephContext *_cct, ElasticConfigRef _es_conf, const RGWBucketInfo& _bucket_info,
-                  const rgw_obj_key& _key, ceph::real_time& _mtime, uint64_t _size,
-                  map<string, bufferlist>& _attrs, uint64_t _versioned_epoch) : cct(_cct), es_conf(_es_conf), bucket_info(_bucket_info), key(_key),
-                                                     mtime(_mtime), size(_size), attrs(std::move(_attrs)), versioned_epoch(_versioned_epoch) {}
-
-  void dump(Formatter *f) const {
-    map<string, string> out_attrs;
-    map<string, string> custom_meta;
-    RGWAccessControlPolicy policy;
-    set<string> permissions;
-    RGWObjTags obj_tags;
-
-    for (auto i : attrs) {
-      const string& attr_name = i.first;
-      bufferlist& val = i.second;
-
-      if (!boost::algorithm::starts_with(attr_name, RGW_ATTR_PREFIX)) {
-        continue;
-      }
-
-      if (boost::algorithm::starts_with(attr_name, RGW_ATTR_META_PREFIX)) {
-        custom_meta.emplace(attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1),
-                            string(val.c_str(), attr_len(val)));
-        continue;
-      }
-
-      if (boost::algorithm::starts_with(attr_name, RGW_ATTR_CRYPT_PREFIX)) {
-        continue;
-      }
-
-      if (boost::algorithm::starts_with(attr_name, RGW_ATTR_OLH_PREFIX)) {
-        // skip versioned object olh info
-        continue;
-      }
-
-      if (attr_name == RGW_ATTR_ACL) {
-        try {
-          auto i = val.cbegin();
-          decode(policy, i);
-        } catch (buffer::error& err) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to decode acl for " << bucket_info.bucket << "/" << key << dendl;
-          continue;
-        }
-
-        const RGWAccessControlList& acl = policy.get_acl();
-
-        permissions.insert(policy.get_owner().get_id().to_str());
-        for (auto acliter : acl.get_grant_map()) {
-          const ACLGrant& grant = acliter.second;
-          if (grant.get_type().get_type() == ACL_TYPE_CANON_USER &&
-              ((uint32_t)grant.get_permission().get_permissions() & RGW_PERM_READ) != 0) {
-            rgw_user user;
-            if (grant.get_id(user)) {
-              permissions.insert(user.to_str());
-            }
-          }
-        }
-      } else if (attr_name == RGW_ATTR_TAGS) {
-        try {
-          auto tags_bl = val.cbegin();
-          decode(obj_tags, tags_bl);
-        } catch (buffer::error& err) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to decode obj tags for "
-                       << bucket_info.bucket << "/" << key << dendl;
-          continue;
-        }
-      } else if (attr_name == RGW_ATTR_COMPRESSION) {
-        RGWCompressionInfo cs_info;
-        try {
-          auto vals_bl = val.cbegin();
-          decode(cs_info, vals_bl);
-        } catch (buffer::error& err) {
-          ldpp_dout(dpp, 0) << "ERROR: failed to decode compression attr for "
-                       << bucket_info.bucket << "/" << key << dendl;
-          continue;
-        }
-        out_attrs.emplace("compression",std::move(cs_info.compression_type));
-      } else {
-        if (!is_sys_attr(attr_name)) {
-          out_attrs.emplace(attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1),
-                            std::string(val.c_str(), attr_len(val)));
-        }
-      }
-    }
-    ::encode_json("bucket", bucket_info.bucket.name, f);
-    ::encode_json("name", key.name, f);
-    string instance = key.instance;
-    if (instance.empty())
-      instance = "null";
-    ::encode_json("instance", instance, f);
-    ::encode_json("versioned_epoch", versioned_epoch, f);
-    ::encode_json("owner", policy.get_owner(), f);
-    ::encode_json("permissions", permissions, f);
-    f->open_object_section("meta");
-    ::encode_json("size", size, f);
-
-    string mtime_str;
-    rgw_to_iso8601(mtime, &mtime_str);
-    ::encode_json("mtime", mtime_str, f);
-    for (auto i : out_attrs) {
-      ::encode_json(i.first.c_str(), i.second, f);
-    }
-    map<string, string> custom_str;
-    map<string, string> custom_int;
-    map<string, string> custom_date;
-
-    for (auto i : custom_meta) {
-      auto config = bucket_info.mdsearch_config.find(i.first);
-      if (config == bucket_info.mdsearch_config.end()) {
-        if (!es_conf->explicit_custom_meta) {
-          /* default custom meta is of type string */
-          custom_str[i.first] = i.second;
-        } else {
-          ldpp_dout(dpp, 20) << "custom meta entry key=" << i.first << " not found in bucket mdsearch config: " << bucket_info.mdsearch_config << dendl;
-        }
-        continue;
-      }
-      switch (config->second) {
-        case ESEntityTypeMap::ES_ENTITY_DATE:
-          custom_date[i.first] = i.second;
-          break;
-        case ESEntityTypeMap::ES_ENTITY_INT:
-          custom_int[i.first] = i.second;
-          break;
-        default:
-          custom_str[i.first] = i.second;
-      }
-    }
-
-    if (!custom_str.empty()) {
-      f->open_array_section("custom-string");
-      for (auto i : custom_str) {
-        f->open_object_section("entity");
-        ::encode_json("name", i.first.c_str(), f);
-        ::encode_json("value", i.second, f);
-        f->close_section();
-      }
-      f->close_section();
-    }
-    if (!custom_int.empty()) {
-      f->open_array_section("custom-int");
-      for (auto i : custom_int) {
-        f->open_object_section("entity");
-        ::encode_json("name", i.first.c_str(), f);
-        ::encode_json("value", i.second, f);
-        f->close_section();
-      }
-      f->close_section();
-    }
-    if (!custom_date.empty()) {
-      f->open_array_section("custom-date");
-      for (auto i : custom_date) {
-        /*
-         * try to exlicitly parse date field, otherwise elasticsearch could reject the whole doc,
-         * which will end up with failed sync
-         */
-        real_time t;
-        int r = parse_time(i.second.c_str(), &t);
-        if (r < 0) {
-          ldpp_dout(dpp, 20) << __func__ << "(): failed to parse time (" << i.second << "), skipping encoding of custom date attribute" << dendl;
-          continue;
-        }
-
-        string time_str;
-        rgw_to_iso8601(t, &time_str);
-
-        f->open_object_section("entity");
-        ::encode_json("name", i.first.c_str(), f);
-        ::encode_json("value", time_str.c_str(), f);
-        f->close_section();
-      }
-      f->close_section();
-    }
-    f->close_section(); // meta
-    const auto& m = obj_tags.get_tags();
-    if (m.size() > 0){
-      f->open_array_section("tagging");
-      for (const auto &it : m) {
-        f->open_object_section("tag");
-        ::encode_json("key", it.first, f);
-        ::encode_json("value",it.second, f);
-        f->close_section();
-      }
-      f->close_section(); // tagging
-    }
-  }
-};
-
/*
 * Coroutine that GETs the elasticsearch root endpoint ("/") and stores the
 * server banner (including the version) in conf->es_info, so later steps
 * can pick version-appropriate behavior.
 */
class RGWElasticGetESInfoCBCR : public RGWCoroutine {
public:
  RGWElasticGetESInfoCBCR(RGWDataSyncCtx *_sc, 
                          ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
                                                    sc(_sc), sync_env(_sc->env),
                                                    conf(_conf) {}
  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      ldpp_dout(dpp, 5) << conf->id << ": get elasticsearch info for zone: " << sc->source_zone << dendl;
      // suspend until the REST read of "/" completes into conf->es_info
      yield call(new RGWReadRESTResourceCR<ESInfo> (sync_env->cct,
                                                    conf->conn.get(),
                                                    sync_env->http_manager,
                                                    "/", nullptr /*params*/,
                                                    &(conf->default_headers),
                                                    &(conf->es_info)));
      if (retcode < 0) {
        ldpp_dout(dpp, 5) << conf->id << ": get elasticsearch failed: " << retcode << dendl;
        return set_cr_error(retcode);
      }

      ldpp_dout(dpp, 5) << conf->id << ": got elastic version=" << conf->es_info.get_version_str() << dendl;
      return set_cr_done();
    }
    return 0;
  }
private:
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  ElasticConfigRef conf;
};
-
/*
 * Coroutine that creates the elasticsearch index (settings + mappings) for
 * this zone.  An "already exists" error from the server is tolerated and
 * treated as external initialization.
 */
class RGWElasticPutIndexCBCR : public RGWCoroutine {
public:
  RGWElasticPutIndexCBCR(RGWDataSyncCtx *_sc,
                         ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
                                                   sc(_sc), sync_env(_sc->env),
                                                   conf(_conf) {}
  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      ldpp_dout(dpp, 5) << conf->id << ": put elasticsearch index for zone: " << sc->source_zone << dendl;

      yield {
        string path = conf->get_index_path();
        es_index_settings settings(conf->num_replicas, conf->num_shards);
        std::unique_ptr<es_index_config_base> index_conf;

        // pick the mapping flavor matching the server version detected by
        // RGWElasticGetESInfoCBCR
        if (conf->es_info.version >= ES_V5) {
          ldpp_dout(dpp, 0) << "elasticsearch: index mapping: version >= 5" << dendl;
          index_conf.reset(new es_index_config<es_type_v5>(settings, conf->es_info.version));
        } else {
          ldpp_dout(dpp, 0) << "elasticsearch: index mapping: version < 5" << dendl;
          index_conf.reset(new es_index_config<es_type_v2>(settings, conf->es_info.version));
        }
        call(new RGWPutRESTResourceCR<es_index_config_base, int, _err_response> (sc->cct,
                                                             conf->conn.get(),
                                                             sync_env->http_manager,
                                                             path, nullptr /*params*/,
                                                             &(conf->default_headers),
                                                             *index_conf, nullptr, &err_response));
      }
      if (retcode < 0) {

        // "index already exists" is not a failure; its exception name
        // changed between ES releases, so both spellings are accepted
        if (err_response.error.type != "index_already_exists_exception" &&
                 err_response.error.type != "resource_already_exists_exception") {
          ldpp_dout(dpp, 0) << "elasticsearch: failed to initialize index: response.type=" << err_response.error.type << " response.reason=" << err_response.error.reason << dendl;
          return set_cr_error(retcode);
        }

        ldpp_dout(dpp, 0) << "elasticsearch: index already exists, assuming external initialization" << dendl;
      }
      return set_cr_done();
    }
    return 0;
  }

private:
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  ElasticConfigRef conf;

    // JSON shape of an elasticsearch error response body
    struct _err_response {
    struct err_reason {
      vector<err_reason> root_cause;
      string type;
      string reason;
      string index;

      void decode_json(JSONObj *obj) {
        JSONDecoder::decode_json("root_cause", root_cause, obj);
        JSONDecoder::decode_json("type", type, obj);
        JSONDecoder::decode_json("reason", reason, obj);
        JSONDecoder::decode_json("index", index, obj);
      }
    } error;

    void decode_json(JSONObj *obj) {
      JSONDecoder::decode_json("error", error, obj);
    }
  } err_response;
};
-
/*
 * Init-sync coroutine: discover the elasticsearch server version, then
 * create the index.  Fails if either step fails.
 */
class RGWElasticInitConfigCBCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  ElasticConfigRef conf;

public:
  RGWElasticInitConfigCBCR(RGWDataSyncCtx *_sc,
                          ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
                                                    sc(_sc), sync_env(_sc->env),
                                                    conf(_conf) {}
  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {

      // step 1: fetch server info (fills conf->es_info)
      yield call(new RGWElasticGetESInfoCBCR(sc, conf));

      if (retcode < 0) {
        return set_cr_error(retcode);
      }

      // step 2: create the index using the detected version
      yield call(new RGWElasticPutIndexCBCR(sc, conf));
      if (retcode < 0) {
          return set_cr_error(retcode);
      }
      return set_cr_done();
    }
    return 0;
  }

};
-
/*
 * Callback coroutine invoked after a remote-object stat: serializes the
 * object's metadata (es_obj_metadata) and PUTs it as a document into the
 * elasticsearch index.  size/mtime/attrs come from the stat (base class).
 */
class RGWElasticHandleRemoteObjCBCR : public RGWStatRemoteObjCBCR {
  rgw_bucket_sync_pipe sync_pipe;
  ElasticConfigRef conf;
  uint64_t versioned_epoch;
public:
  RGWElasticHandleRemoteObjCBCR(RGWDataSyncCtx *_sc,
                          rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
                          ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
                                                                               sync_pipe(_sync_pipe), conf(_conf),
                                                                               versioned_epoch(_versioned_epoch) {}
  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      ldpp_dout(dpp, 10) << ": stat of remote obj: z=" << sc->source_zone
                               << " b=" << sync_pipe.info.source_bs.bucket << " k=" << key
                               << " size=" << size << " mtime=" << mtime << dendl;

      yield {
        string path = conf->get_obj_path(sync_pipe.dest_bucket_info, key);
        // note: `attrs` is moved out of by the es_obj_metadata ctor
        es_obj_metadata doc(sync_env->cct, conf, sync_pipe.dest_bucket_info, key, mtime, size, attrs, versioned_epoch);

        call(new RGWPutRESTResourceCR<es_obj_metadata, int>(sync_env->cct, conf->conn.get(),
                                                            sync_env->http_manager,
                                                            path, nullptr /* params */,
                                                            &(conf->default_headers),
                                                            doc, nullptr /* result */));

      }
      if (retcode < 0) {
        return set_cr_error(retcode);
      }
      return set_cr_done();
    }
    return 0;
  }
};
-
/*
 * Stat-then-index driver: stats the remote object (base class) and hands
 * the result to RGWElasticHandleRemoteObjCBCR for indexing.
 */
class RGWElasticHandleRemoteObjCR : public RGWCallStatRemoteObjCR {
  rgw_bucket_sync_pipe sync_pipe;
  ElasticConfigRef conf;
  uint64_t versioned_epoch;
public:
  RGWElasticHandleRemoteObjCR(RGWDataSyncCtx *_sc,
                        rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
                        ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
                                                           sync_pipe(_sync_pipe),
                                                           conf(_conf), versioned_epoch(_versioned_epoch) {
  }

  ~RGWElasticHandleRemoteObjCR() override {}

  // invoked by the base class once the stat result is available
  RGWStatRemoteObjCBCR *allocate_callback() override {
    return new RGWElasticHandleRemoteObjCBCR(sc, sync_pipe, key, conf, versioned_epoch);
  }
};
-
/*
 * Coroutine that deletes an object's document from the elasticsearch index
 * when the object is removed from the source zone.
 */
class RGWElasticRemoveRemoteObjCBCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw_bucket_sync_pipe sync_pipe;
  rgw_obj_key key;
  ceph::real_time mtime;
  ElasticConfigRef conf;
public:
  RGWElasticRemoveRemoteObjCBCR(RGWDataSyncCtx *_sc,
                          rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, const ceph::real_time& _mtime,
                          ElasticConfigRef _conf) : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
                                                        sync_pipe(_sync_pipe), key(_key),
                                                        mtime(_mtime), conf(_conf) {}
  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      ldpp_dout(dpp, 10) << ": remove remote obj: z=" << sc->source_zone
                               << " b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << dendl;
      yield {
        // same document path as used when the object was indexed
        string path = conf->get_obj_path(sync_pipe.dest_bucket_info, key);

        call(new RGWDeleteRESTResourceCR(sync_env->cct, conf->conn.get(),
                                         sync_env->http_manager,
                                         path, nullptr /* params */));
      }
      if (retcode < 0) {
        return set_cr_error(retcode);
      }
      return set_cr_done();
    }
    return 0;
  }

};
-
-class RGWElasticDataSyncModule : public RGWDataSyncModule {
-  ElasticConfigRef conf;
-public:
-  RGWElasticDataSyncModule(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) : conf(std::make_shared<ElasticConfig>()) {
-    conf->init(cct, config);
-  }
-  ~RGWElasticDataSyncModule() override {}
-
-  void init(RGWDataSyncCtx *sc, uint64_t instance_id) override {
-    conf->init_instance(sc->env->svc->zone->get_realm(), instance_id);
-  }
-
-  RGWCoroutine *init_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) override {
-    ldpp_dout(dpp, 5) << conf->id << ": init" << dendl;
-    return new RGWElasticInitConfigCBCR(sc, conf);
-  }
-
-  RGWCoroutine *start_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) override {
-    ldpp_dout(dpp, 5) << conf->id << ": start_sync" << dendl;
-    // try to get elastic search version
-    return new RGWElasticGetESInfoCBCR(sc, conf);
-  }
-
-  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace) override {
-    ldpp_dout(dpp, 10) << conf->id << ": sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
-    if (!conf->should_handle_operation(sync_pipe.dest_bucket_info)) {
-      ldpp_dout(dpp, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl;
-      return nullptr;
-    }
-    return new RGWElasticHandleRemoteObjCR(sc, sync_pipe, key, conf, versioned_epoch.value_or(0));
-  }
-  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
-    /* versioned and versioned epoch params are useless in the elasticsearch backend case */
-    ldpp_dout(dpp, 10) << conf->id << ": rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
-    if (!conf->should_handle_operation(sync_pipe.dest_bucket_info)) {
-      ldpp_dout(dpp, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl;
-      return nullptr;
-    }
-    return new RGWElasticRemoveRemoteObjCBCR(sc, sync_pipe, key, mtime, conf);
-  }
-  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
-                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
-    ldpp_dout(dpp, 10) << conf->id << ": create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
-                            << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
-    ldpp_dout(dpp, 10) << conf->id << ": skipping operation (not handled)" << dendl;
-    return NULL;
-  }
-  RGWRESTConn *get_rest_conn() {
-    return conf->conn.get();
-  }
-
-  string get_index_path() {
-    return conf->get_index_path();
-  }
-
-  map<string, string>& get_request_headers() {
-    return conf->get_request_headers();
-  }
-};
-
-RGWElasticSyncModuleInstance::RGWElasticSyncModuleInstance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config)
-{
-  data_handler = std::unique_ptr<RGWElasticDataSyncModule>(new RGWElasticDataSyncModule(dpp, cct, config));
-}
-
// Expose the data handler (non-owning pointer; instance keeps ownership).
RGWDataSyncModule *RGWElasticSyncModuleInstance::get_data_handler()
{
  return data_handler.get();
}
-
// Forward to the data handler's REST connection to the ES endpoint.
RGWRESTConn *RGWElasticSyncModuleInstance::get_rest_conn()
{
  return data_handler->get_rest_conn();
}
-
// Forward to the data handler's configured ES index path.
string RGWElasticSyncModuleInstance::get_index_path() {
  return data_handler->get_index_path();
}
-
// Forward to the data handler's default REST request headers.
map<string, string>& RGWElasticSyncModuleInstance::get_request_headers() {
  return data_handler->get_request_headers();
}
-
// Replace the S3 REST manager with the metadata-search one; other dialects
// are passed through unchanged.  Takes ownership of `orig` and deletes it
// when a replacement is returned.
RGWRESTMgr *RGWElasticSyncModuleInstance::get_rest_filter(int dialect, RGWRESTMgr *orig) {
  if (dialect != RGW_REST_S3) {
    return orig;
  }
  delete orig;
  return new RGWRESTMgr_MDSearch_S3();
}
-
-int RGWElasticSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
-  string endpoint = config["endpoint"];
-  instance->reset(new RGWElasticSyncModuleInstance(dpp, cct, config));
-  return 0;
-}
-
diff --git a/src/rgw/store/rados/rgw_sync_module_es.h b/src/rgw/store/rados/rgw_sync_module_es.h
deleted file mode 100644 (file)
index 6c0c422..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_SYNC_MODULE_ES_H
-#define CEPH_RGW_SYNC_MODULE_ES_H
-
-#include "rgw_sync_module.h"
-
// Elasticsearch field datatypes referenced by the index-mapping generators
// (serialized to their ES names by es_type_to_str() in the .cc).
enum class ESType {
  /* string datatypes */
  String, /* Deprecated Since 5.X+ */
  Text,
  Keyword,

  /* Numeric Types */
  Long, Integer, Short, Byte, Double, Float, Half_Float, Scaled_Float,

  /* Date Type */
  Date,

  /* Boolean */
  Boolean,

  /* Binary; Must Be Base64 Encoded */
  Binary,

  /* Range Types */
  Integer_Range, Float_Range, Long_Range, Double_Range, Date_Range,

  /* A Few Specialized Types */
  Geo_Point,
  Ip
};
-
-
// Sync-module descriptor for the elasticsearch metadata-search backend.
// It only indexes metadata, so it cannot serve as a data-export source.
class RGWElasticSyncModule : public RGWSyncModule {
public:
  RGWElasticSyncModule() {}
  bool supports_data_export() override {
    return false;
  }
  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
};
-
-class RGWElasticDataSyncModule;
-class RGWRESTConn;
-
// Per-zone instance of the elasticsearch sync module.  Owns the data
// handler and exposes its REST connection / index path / headers to the
// metadata-search REST frontend.
class RGWElasticSyncModuleInstance : public RGWSyncModuleInstance {
  std::unique_ptr<RGWElasticDataSyncModule> data_handler;
public:
  RGWElasticSyncModuleInstance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config);
  RGWDataSyncModule *get_data_handler() override;
  RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) override;
  RGWRESTConn *get_rest_conn();
  std::string get_index_path();
  std::map<std::string, std::string>& get_request_headers();
  bool supports_user_writes() override {
    return true;
  }
};
-
-#endif
diff --git a/src/rgw/store/rados/rgw_sync_module_es_rest.cc b/src/rgw/store/rados/rgw_sync_module_es_rest.cc
deleted file mode 100644 (file)
index db9d48a..0000000
+++ /dev/null
@@ -1,428 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_sync_module_es.h"
-#include "rgw_sync_module_es_rest.h"
-#include "rgw_es_query.h"
-#include "rgw_op.h"
-#include "rgw_rest.h"
-#include "rgw_rest_s3.h"
-#include "rgw_sal_rados.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-struct es_index_obj_response {
-  string bucket;
-  rgw_obj_key key;
-  uint64_t versioned_epoch{0};
-  ACLOwner owner;
-  set<string> read_permissions;
-
-  struct {
-    uint64_t size{0};
-    ceph::real_time mtime;
-    string etag;
-    string content_type;
-    string storage_class;
-    map<string, string> custom_str;
-    map<string, int64_t> custom_int;
-    map<string, string> custom_date;
-
-    template <class T>
-    struct _custom_entry {
-      string name;
-      T value;
-      void decode_json(JSONObj *obj) {
-        JSONDecoder::decode_json("name", name, obj);
-        JSONDecoder::decode_json("value", value, obj);
-      }
-    };
-
-    void decode_json(JSONObj *obj) {
-      JSONDecoder::decode_json("size", size, obj);
-      string mtime_str;
-      JSONDecoder::decode_json("mtime", mtime_str, obj);
-      parse_time(mtime_str.c_str(), &mtime);
-      JSONDecoder::decode_json("etag", etag, obj);
-      JSONDecoder::decode_json("content_type", content_type, obj);
-      JSONDecoder::decode_json("storage_class", storage_class, obj);
-      list<_custom_entry<string> > str_entries;
-      JSONDecoder::decode_json("custom-string", str_entries, obj);
-      for (auto& e : str_entries) {
-        custom_str[e.name] = e.value;
-      }
-      list<_custom_entry<int64_t> > int_entries;
-      JSONDecoder::decode_json("custom-int", int_entries, obj);
-      for (auto& e : int_entries) {
-        custom_int[e.name] = e.value;
-      }
-      list<_custom_entry<string> > date_entries;
-      JSONDecoder::decode_json("custom-date", date_entries, obj);
-      for (auto& e : date_entries) {
-        custom_date[e.name] = e.value;
-      }
-    }
-  } meta;
-
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("bucket", bucket, obj);
-    JSONDecoder::decode_json("name", key.name, obj);
-    JSONDecoder::decode_json("instance", key.instance, obj);
-    JSONDecoder::decode_json("versioned_epoch", versioned_epoch, obj);
-    JSONDecoder::decode_json("permissions", read_permissions, obj);
-    JSONDecoder::decode_json("owner", owner, obj);
-    JSONDecoder::decode_json("meta", meta, obj);
-  }
-};
-
-struct es_search_response {
-  uint32_t took;
-  bool timed_out;
-  struct {
-    uint32_t total;
-    uint32_t successful;
-    uint32_t failed;
-    void decode_json(JSONObj *obj) {
-      JSONDecoder::decode_json("total", total, obj);
-      JSONDecoder::decode_json("successful", successful, obj);
-      JSONDecoder::decode_json("failed", failed, obj);
-    }
-  } shards;
-  struct obj_hit {
-    string index;
-    string type;
-    string id;
-    // double score
-    es_index_obj_response source;
-    void decode_json(JSONObj *obj) {
-      JSONDecoder::decode_json("_index", index, obj);
-      JSONDecoder::decode_json("_type", type, obj);
-      JSONDecoder::decode_json("_id", id, obj);
-      JSONDecoder::decode_json("_source", source, obj);
-    }
-  };
-  struct {
-    uint32_t total;
-    // double max_score;
-    list<obj_hit> hits;
-    void decode_json(JSONObj *obj) {
-      JSONDecoder::decode_json("total", total, obj);
-      // JSONDecoder::decode_json("max_score", max_score, obj);
-      JSONDecoder::decode_json("hits", hits, obj);
-    }
-  } hits;
-  void decode_json(JSONObj *obj) {
-    JSONDecoder::decode_json("took", took, obj);
-    JSONDecoder::decode_json("timed_out", timed_out, obj);
-    JSONDecoder::decode_json("_shards", shards, obj);
-    JSONDecoder::decode_json("hits", hits, obj);
-  }
-};
-
-class RGWMetadataSearchOp : public RGWOp {
-  RGWSyncModuleInstanceRef sync_module_ref;
-  RGWElasticSyncModuleInstance *es_module;
-protected:
-  string expression;
-  string custom_prefix;
-#define MAX_KEYS_DEFAULT 100
-  uint64_t max_keys{MAX_KEYS_DEFAULT};
-  string marker_str;
-  uint64_t marker{0};
-  string next_marker;
-  bool is_truncated{false};
-  string err;
-
-  es_search_response response;
-
-public:
-  RGWMetadataSearchOp(const RGWSyncModuleInstanceRef& sync_module) : sync_module_ref(sync_module) {
-    es_module = static_cast<RGWElasticSyncModuleInstance *>(sync_module_ref.get());
-  }
-
-  int verify_permission(optional_yield) override {
-    return 0;
-  }
-  virtual int get_params() = 0;
-  void pre_exec() override;
-  void execute(optional_yield y) override;
-
-  const char* name() const override { return "metadata_search"; }
-  virtual RGWOpType get_type() override { return RGW_OP_METADATA_SEARCH; }
-  virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
-};
-
-void RGWMetadataSearchOp::pre_exec()
-{
-  rgw_bucket_object_pre_exec(s);
-}
-
-void RGWMetadataSearchOp::execute(optional_yield y)
-{
-  op_ret = get_params();
-  if (op_ret < 0)
-    return;
-
-  list<pair<string, string> > conds;
-
-  if (!s->user->get_info().system) {
-    conds.push_back(make_pair("permissions", s->user->get_id().to_str()));
-  }
-
-  if (!s->bucket_name.empty()) {
-    conds.push_back(make_pair("bucket", s->bucket_name));
-  }
-
-  ESQueryCompiler es_query(expression, &conds, custom_prefix);
-  
-  static map<string, string, ltstr_nocase> aliases = {
-                                  { "bucket", "bucket" }, /* forces lowercase */
-                                  { "name", "name" },
-                                  { "key", "name" },
-                                  { "instance", "instance" },
-                                  { "etag", "meta.etag" },
-                                  { "size", "meta.size" },
-                                  { "mtime", "meta.mtime" },
-                                  { "lastmodified", "meta.mtime" },
-                                  { "last_modified", "meta.mtime" },
-                                  { "contenttype", "meta.content_type" },
-                                  { "content_type", "meta.content_type" },
-                                  { "storageclass", "meta.storage_class" },
-                                  { "storage_class", "meta.storage_class" },
-  };
-  es_query.set_field_aliases(&aliases);
-
-  static map<string, ESEntityTypeMap::EntityType> generic_map = { {"bucket", ESEntityTypeMap::ES_ENTITY_STR},
-                                                           {"name", ESEntityTypeMap::ES_ENTITY_STR},
-                                                           {"instance", ESEntityTypeMap::ES_ENTITY_STR},
-                                                           {"permissions", ESEntityTypeMap::ES_ENTITY_STR},
-                                                           {"meta.etag", ESEntityTypeMap::ES_ENTITY_STR},
-                                                           {"meta.content_type", ESEntityTypeMap::ES_ENTITY_STR},
-                                                           {"meta.mtime", ESEntityTypeMap::ES_ENTITY_DATE},
-                                                           {"meta.size", ESEntityTypeMap::ES_ENTITY_INT},
-                                                           {"meta.storage_class", ESEntityTypeMap::ES_ENTITY_STR} };
-  ESEntityTypeMap gm(generic_map);
-  es_query.set_generic_type_map(&gm);
-
-  static set<string> restricted_fields = { {"permissions"} };
-  es_query.set_restricted_fields(&restricted_fields);
-
-  map<string, ESEntityTypeMap::EntityType> custom_map;
-  for (auto& i : s->bucket->get_info().mdsearch_config) {
-    custom_map[i.first] = (ESEntityTypeMap::EntityType)i.second;
-  }
-
-  ESEntityTypeMap em(custom_map);
-  es_query.set_custom_type_map(&em);
-
-  bool valid = es_query.compile(&err);
-  if (!valid) {
-    ldpp_dout(this, 10) << "invalid query, failed generating request json" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  JSONFormatter f;
-  encode_json("root", es_query, &f);
-
-  RGWRESTConn *conn = es_module->get_rest_conn();
-
-  bufferlist in;
-  bufferlist out;
-
-  stringstream ss;
-
-  f.flush(ss);
-  in.append(ss.str());
-
-  string resource = es_module->get_index_path() + "/_search";
-  param_vec_t params;
-  static constexpr int BUFSIZE = 32;
-  char buf[BUFSIZE];
-  snprintf(buf, sizeof(buf), "%lld", (long long)max_keys);
-  params.push_back(param_pair_t("size", buf));
-  if (marker > 0) {
-    params.push_back(param_pair_t("from", marker_str.c_str()));
-  }
-  ldpp_dout(this, 20) << "sending request to elasticsearch, payload=" << string(in.c_str(), in.length()) << dendl;
-  auto& extra_headers = es_module->get_request_headers();
-  op_ret = conn->get_resource(s, resource, &params, &extra_headers,
-                              out, &in, nullptr, y);
-  if (op_ret < 0) {
-    ldpp_dout(this, 0) << "ERROR: failed to fetch resource (r=" << resource << ", ret=" << op_ret << ")" << dendl;
-    return;
-  }
-
-  ldpp_dout(this, 20) << "response: " << string(out.c_str(), out.length()) << dendl;
-
-  JSONParser jparser;
-  if (!jparser.parse(out.c_str(), out.length())) {
-    ldpp_dout(this, 0) << "ERROR: failed to parse elasticsearch response" << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-  try {
-    decode_json_obj(response, &jparser);
-  } catch (const JSONDecoder::err& e) {
-    ldpp_dout(this, 0) << "ERROR: failed to decode JSON input: " << e.what() << dendl;
-    op_ret = -EINVAL;
-    return;
-  }
-
-}
-
-class RGWMetadataSearch_ObjStore_S3 : public RGWMetadataSearchOp {
-public:
-  explicit RGWMetadataSearch_ObjStore_S3(const RGWSyncModuleInstanceRef& _sync_module) : RGWMetadataSearchOp(_sync_module) {
-    custom_prefix = "x-amz-meta-";
-  }
-
-  int get_params() override {
-    expression = s->info.args.get("query");
-    bool exists;
-    string max_keys_str = s->info.args.get("max-keys", &exists);
-#define MAX_KEYS_MAX 10000
-    if (exists) {
-      string err;
-      max_keys = strict_strtoll(max_keys_str.c_str(), 10, &err);
-      if (!err.empty()) {
-        return -EINVAL;
-      }
-      if (max_keys > MAX_KEYS_MAX) {
-        max_keys = MAX_KEYS_MAX;
-      }
-    }
-    marker_str = s->info.args.get("marker", &exists);
-    if (exists) {
-      string err;
-      marker = strict_strtoll(marker_str.c_str(), 10, &err);
-      if (!err.empty()) {
-        return -EINVAL;
-      }
-    }
-    uint64_t nm = marker + max_keys;
-    static constexpr int BUFSIZE = 32;
-    char buf[BUFSIZE];
-    snprintf(buf, sizeof(buf), "%lld", (long long)nm);
-    next_marker = buf;
-    return 0;
-  }
-  void send_response() override {
-    if (op_ret) {
-      s->err.message = err;
-      set_req_state_err(s, op_ret);
-    }
-    dump_errno(s);
-    end_header(s, this, "application/xml");
-
-    if (op_ret < 0) {
-      return;
-    }
-
-    is_truncated = (response.hits.hits.size() >= max_keys);
-
-    s->formatter->open_object_section("SearchMetadataResponse");
-    s->formatter->dump_string("Marker", marker_str);
-    s->formatter->dump_string("IsTruncated", (is_truncated ? "true" : "false"));
-    if (is_truncated) {
-      s->formatter->dump_string("NextMarker", next_marker);
-    }
-    if (s->format == RGWFormat::JSON) {
-      s->formatter->open_array_section("Objects");
-    }
-    for (auto& i : response.hits.hits) {
-      s->formatter->open_object_section("Contents");
-      es_index_obj_response& e = i.source;
-      s->formatter->dump_string("Bucket", e.bucket);
-      s->formatter->dump_string("Key", e.key.name);
-      string instance = (!e.key.instance.empty() ? e.key.instance : "null");
-      s->formatter->dump_string("Instance", instance.c_str());
-      s->formatter->dump_int("VersionedEpoch", e.versioned_epoch);
-      dump_time(s, "LastModified", e.meta.mtime);
-      s->formatter->dump_int("Size", e.meta.size);
-      s->formatter->dump_format("ETag", "\"%s\"", e.meta.etag.c_str());
-      s->formatter->dump_string("ContentType", e.meta.content_type.c_str());
-      s->formatter->dump_string("StorageClass", e.meta.storage_class.c_str());
-      dump_owner(s, e.owner.get_id(), e.owner.get_display_name());
-      s->formatter->open_array_section("CustomMetadata");
-      for (auto& m : e.meta.custom_str) {
-        s->formatter->open_object_section("Entry");
-        s->formatter->dump_string("Name", m.first.c_str());
-        s->formatter->dump_string("Value", m.second);
-        s->formatter->close_section();
-      }
-      for (auto& m : e.meta.custom_int) {
-        s->formatter->open_object_section("Entry");
-        s->formatter->dump_string("Name", m.first.c_str());
-        s->formatter->dump_int("Value", m.second);
-        s->formatter->close_section();
-      }
-      for (auto& m : e.meta.custom_date) {
-        s->formatter->open_object_section("Entry");
-        s->formatter->dump_string("Name", m.first.c_str());
-        s->formatter->dump_string("Value", m.second);
-        s->formatter->close_section();
-      }
-      s->formatter->close_section();
-      rgw_flush_formatter(s, s->formatter);
-      s->formatter->close_section();
-    };
-    if (s->format == RGWFormat::JSON) {
-      s->formatter->close_section();
-    }
-    s->formatter->close_section();
-   rgw_flush_formatter_and_reset(s, s->formatter);
-  }
-};
-
-class RGWHandler_REST_MDSearch_S3 : public RGWHandler_REST_S3 {
-protected:
-  RGWOp *op_get() override {
-    if (s->info.args.exists("query")) {
-      return new RGWMetadataSearch_ObjStore_S3(driver->get_sync_module());
-    }
-    if (!s->init_state.url_bucket.empty() &&
-        s->info.args.exists("mdsearch")) {
-      return new RGWGetBucketMetaSearch_ObjStore_S3;
-    }
-    return nullptr;
-  }
-  RGWOp *op_head() override {
-    return nullptr;
-  }
-  RGWOp *op_post() override {
-    return nullptr;
-  }
-public:
-  explicit RGWHandler_REST_MDSearch_S3(const rgw::auth::StrategyRegistry& auth_registry) : RGWHandler_REST_S3(auth_registry) {}
-  virtual ~RGWHandler_REST_MDSearch_S3() {}
-};
-
-
-RGWHandler_REST* RGWRESTMgr_MDSearch_S3::get_handler(rgw::sal::Driver* driver,
-                                                    req_state* const s,
-                                                     const rgw::auth::StrategyRegistry& auth_registry,
-                                                     const std::string& frontend_prefix)
-{
-  int ret =
-    RGWHandler_REST_S3::init_from_header(driver, s,
-                                       RGWFormat::XML, true);
-  if (ret < 0) {
-    return nullptr;
-  }
-
-  if (!s->object->empty()) {
-    return nullptr;
-  }
-
-  RGWHandler_REST *handler = new RGWHandler_REST_MDSearch_S3(auth_registry);
-
-  ldpp_dout(s, 20) << __func__ << " handler=" << typeid(*handler).name()
-                   << dendl;
-  return handler;
-}
-
diff --git a/src/rgw/store/rados/rgw_sync_module_es_rest.h b/src/rgw/store/rados/rgw_sync_module_es_rest.h
deleted file mode 100644 (file)
index b18271a..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-#include "rgw_rest.h"
-
-class RGWElasticSyncModuleInstance;
-
-class RGWRESTMgr_MDSearch_S3 : public RGWRESTMgr {
-public:
-  explicit RGWRESTMgr_MDSearch_S3() {}
-
-  RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
-                              req_state* s,
-                               const rgw::auth::StrategyRegistry& auth_registry,
-                               const std::string& frontend_prefix) override;
-};
diff --git a/src/rgw/store/rados/rgw_sync_module_log.cc b/src/rgw/store/rados/rgw_sync_module_log.cc
deleted file mode 100644 (file)
index a21604c..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_common.h"
-#include "rgw_coroutine.h"
-#include "rgw_cr_rados.h"
-#include "rgw_sync_module.h"
-#include "rgw_data_sync.h"
-#include "rgw_sync_module_log.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-class RGWLogStatRemoteObjCBCR : public RGWStatRemoteObjCBCR {
-public:
-  RGWLogStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
-                          rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWStatRemoteObjCBCR(_sc, _src_bucket, _key) {}
-  int operate(const DoutPrefixProvider *dpp) override {
-    ldpp_dout(dpp, 0) << "SYNC_LOG: stat of remote obj: z=" << sc->source_zone
-                            << " b=" << src_bucket << " k=" << key << " size=" << size << " mtime=" << mtime
-                            << " attrs=" << attrs << dendl;
-    return set_cr_done();
-  }
-
-};
-
-class RGWLogStatRemoteObjCR : public RGWCallStatRemoteObjCR {
-public:
-  RGWLogStatRemoteObjCR(RGWDataSyncCtx *_sc,
-                        rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCallStatRemoteObjCR(_sc, _src_bucket, _key) {
-  }
-
-  ~RGWLogStatRemoteObjCR() override {}
-
-  RGWStatRemoteObjCBCR *allocate_callback() override {
-    return new RGWLogStatRemoteObjCBCR(sc, src_bucket, key);
-  }
-};
-
-class RGWLogDataSyncModule : public RGWDataSyncModule {
-  string prefix;
-public:
-  explicit RGWLogDataSyncModule(const string& _prefix) : prefix(_prefix) {}
-
-  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, rgw_zone_set *zones_trace) override {
-    ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
-    return new RGWLogStatRemoteObjCR(sc, sync_pipe.info.source_bs.bucket, key);
-  }
-  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
-    ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
-    return NULL;
-  }
-  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
-                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
-    ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
-                            << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
-    return NULL;
-  }
-};
-
-class RGWLogSyncModuleInstance : public RGWSyncModuleInstance {
-  RGWLogDataSyncModule data_handler;
-public:
-  explicit RGWLogSyncModuleInstance(const string& prefix) : data_handler(prefix) {}
-  RGWDataSyncModule *get_data_handler() override {
-    return &data_handler;
-  }
-};
-
-int RGWLogSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
-  string prefix = config["prefix"];
-  instance->reset(new RGWLogSyncModuleInstance(prefix));
-  return 0;
-}
-
diff --git a/src/rgw/store/rados/rgw_sync_module_log.h b/src/rgw/store/rados/rgw_sync_module_log.h
deleted file mode 100644 (file)
index ecf3bb7..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_SYNC_MODULE_LOG_H
-#define CEPH_RGW_SYNC_MODULE_LOG_H
-
-#include "rgw_sync_module.h"
-
-class RGWLogSyncModule : public RGWSyncModule {
-public:
-  RGWLogSyncModule() {}
-  bool supports_data_export() override {
-    return false;
-  }
-  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
-};
-
-#endif
diff --git a/src/rgw/store/rados/rgw_sync_trace.cc b/src/rgw/store/rados/rgw_sync_trace.cc
deleted file mode 100644 (file)
index b346835..0000000
+++ /dev/null
@@ -1,290 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_SYNC_TRACE_H
-#define CEPH_RGW_SYNC_TRACE_H
-
-#include <regex>
-
-#include "common/debug.h"
-#include "common/ceph_json.h"
-
-#include "rgw_sync_trace.h"
-#include "rgw_rados.h"
-#include "rgw_worker.h"
-
-#define dout_context g_ceph_context
-
-static constexpr auto dout_subsys = ceph_subsys_rgw;
-
-using namespace std;
-
-
-RGWSyncTraceNode::RGWSyncTraceNode(CephContext *_cct, uint64_t _handle,
-                                   const RGWSyncTraceNodeRef& _parent,
-                                   const string& _type, const string& _id) : cct(_cct),
-                                                                             parent(_parent),
-                                                                             type(_type),
-                                                                             id(_id),
-                                                                             handle(_handle),
-                                                                             history(cct->_conf->rgw_sync_trace_per_node_log_size)
-{
-  if (parent.get()) {
-    prefix = parent->get_prefix();
-  }
-
-  if (!type.empty()) {
-    prefix += type;
-    if (!id.empty()) {
-      prefix += "[" + id + "]";
-    }
-    prefix += ":";
-  }
-}
-
-void RGWSyncTraceNode::log(int level, const string& s)
-{
-  status = s;
-  history.push_back(status);
-  /* dump output on either rgw_sync, or rgw -- but only once */
-  if (cct->_conf->subsys.should_gather(ceph_subsys_rgw_sync, level)) {
-    lsubdout(cct, rgw_sync,
-      ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl;
-  } else {
-    lsubdout(cct, rgw,
-      ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl;
-  }
-}
-
-
-class RGWSyncTraceServiceMapThread : public RGWRadosThread {
-  RGWRados *store;
-  RGWSyncTraceManager *manager;
-
-  uint64_t interval_msec() override {
-    return cct->_conf->rgw_sync_trace_servicemap_update_interval * 1000;
-  }
-public:
-  RGWSyncTraceServiceMapThread(RGWRados *_store, RGWSyncTraceManager *_manager)
-    : RGWRadosThread(_store, "sync-trace"), store(_store), manager(_manager) {}
-
-  int process(const DoutPrefixProvider *dpp) override;
-};
-
-int RGWSyncTraceServiceMapThread::process(const DoutPrefixProvider *dpp)
-{
-  map<string, string> status;
-  status["current_sync"] = manager->get_active_names();
-  int ret = store->update_service_map(dpp, std::move(status));
-  if (ret < 0) {
-    ldout(store->ctx(), 0) << "ERROR: update_service_map() returned ret=" << ret << dendl;
-  }
-  return 0;
-}
-
-RGWSyncTraceNodeRef RGWSyncTraceManager::add_node(const RGWSyncTraceNodeRef& parent,
-                                                  const std::string& type,
-                                                  const std::string& id)
-{
-  shunique_lock wl(lock, ceph::acquire_unique);
-  auto handle = alloc_handle();
-  RGWSyncTraceNodeRef& ref = nodes[handle];
-  ref.reset(new RGWSyncTraceNode(cct, handle, parent, type, id));
-  // return a separate shared_ptr that calls finish() on the node instead of
-  // deleting it. the lambda capture holds a reference to the original 'ref'
-  auto deleter = [ref, this] (RGWSyncTraceNode *node) { finish_node(node); };
-  return {ref.get(), deleter};
-}
-
-bool RGWSyncTraceNode::match(const string& search_term, bool search_history)
-{
-  try {
-    std::regex expr(search_term);
-    std::smatch m;
-
-    if (regex_search(prefix, m, expr)) {
-      return true;
-    }
-    if (regex_search(status, m,expr)) {
-      return true;
-    }
-    if (!search_history) {
-      return false;
-    }
-
-    for (auto h : history) {
-      if (regex_search(h, m, expr)) {
-        return true;
-      }
-    }
-  } catch (const std::regex_error& e) {
-    ldout(cct, 5) << "NOTICE: sync trace: bad expression: bad regex search term" << dendl;
-  }
-
-  return false;
-}
-
-void RGWSyncTraceManager::init(RGWRados *store)
-{
-  service_map_thread = new RGWSyncTraceServiceMapThread(store, this);
-  service_map_thread->start();
-}
-
-RGWSyncTraceManager::~RGWSyncTraceManager()
-{
-  cct->get_admin_socket()->unregister_commands(this);
-  service_map_thread->stop();
-  delete service_map_thread;
-
-  nodes.clear();
-}
-
-int RGWSyncTraceManager::hook_to_admin_command()
-{
-  AdminSocket *admin_socket = cct->get_admin_socket();
-
-  admin_commands = { { "sync trace show name=search,type=CephString,req=false", "sync trace show [filter_str]: show current multisite tracing information" },
-                     { "sync trace history name=search,type=CephString,req=false", "sync trace history [filter_str]: show history of multisite tracing information" },
-                     { "sync trace active name=search,type=CephString,req=false", "show active multisite sync entities information" },
-                     { "sync trace active_short name=search,type=CephString,req=false", "show active multisite sync entities entries" } };
-  for (auto cmd : admin_commands) {
-    int r = admin_socket->register_command(cmd[0], this,
-                                           cmd[1]);
-    if (r < 0) {
-      lderr(cct) << "ERROR: fail to register admin socket command (r=" << r << ")" << dendl;
-      return r;
-    }
-  }
-  return 0;
-}
-
-static void dump_node(RGWSyncTraceNode *entry, bool show_history, Formatter *f)
-{
-  f->open_object_section("entry");
-  ::encode_json("status", entry->to_str(), f);
-  if (show_history) {
-    f->open_array_section("history");
-    for (auto h : entry->get_history()) {
-      ::encode_json("entry", h, f);
-    }
-    f->close_section();
-  }
-  f->close_section();
-}
-
-string RGWSyncTraceManager::get_active_names()
-{
-  shunique_lock rl(lock, ceph::acquire_shared);
-
-  stringstream ss;
-  JSONFormatter f;
-
-  f.open_array_section("result");
-  for (auto n : nodes) {
-    auto& entry = n.second;
-
-    if (!entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
-      continue;
-    }
-    const string& name = entry->get_resource_name();
-    if (!name.empty()) {
-      ::encode_json("entry", name, &f);
-    }
-    f.flush(ss);
-  }
-  f.close_section();
-  f.flush(ss);
-
-  return ss.str();
-}
-
-int RGWSyncTraceManager::call(std::string_view command, const cmdmap_t& cmdmap,
-                             const bufferlist&,
-                             Formatter *f,
-                             std::ostream& ss,
-                             bufferlist& out) {
-
-  bool show_history = (command == "sync trace history");
-  bool show_short = (command == "sync trace active_short");
-  bool show_active = (command == "sync trace active") || show_short;
-
-  string search;
-
-  auto si = cmdmap.find("search");
-  if (si != cmdmap.end()) {
-    search = boost::get<string>(si->second);
-  }
-
-  shunique_lock rl(lock, ceph::acquire_shared);
-
-  f->open_object_section("result");
-  f->open_array_section("running");
-  for (auto n : nodes) {
-    auto& entry = n.second;
-
-    if (!search.empty() && !entry->match(search, show_history)) {
-      continue;
-    }
-    if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
-      continue;
-    }
-    if (show_short) {
-      const string& name = entry->get_resource_name();
-      if (!name.empty()) {
-        ::encode_json("entry", name, f);
-      }
-    } else {
-      dump_node(entry.get(), show_history, f);
-    }
-    f->flush(out);
-  }
-  f->close_section();
-
-  f->open_array_section("complete");
-  for (auto& entry : complete_nodes) {
-    if (!search.empty() && !entry->match(search, show_history)) {
-      continue;
-    }
-    if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
-      continue;
-    }
-    dump_node(entry.get(), show_history, f);
-    f->flush(out);
-  }
-  f->close_section();
-
-  f->close_section();
-
-  return 0;
-}
-
-void RGWSyncTraceManager::finish_node(RGWSyncTraceNode *node)
-{
-  RGWSyncTraceNodeRef old_node;
-
-  {
-    shunique_lock wl(lock, ceph::acquire_unique);
-    if (!node) {
-      return;
-    }
-    auto iter = nodes.find(node->handle);
-    if (iter == nodes.end()) {
-      /* not found, already finished */
-      return;
-    }
-
-    if (complete_nodes.full()) {
-      /* take a reference to the entry that is going to be evicted,
-       * can't let it get evicted under lock held, otherwise
-       * it's a deadlock as it will call finish_node()
-       */
-      old_node = complete_nodes.front();
-    }
-
-    complete_nodes.push_back(iter->second);
-    nodes.erase(iter);
-  }
-};
-
-#endif
-
diff --git a/src/rgw/store/rados/rgw_sync_trace.h b/src/rgw/store/rados/rgw_sync_trace.h
deleted file mode 100644 (file)
index 9617dac..0000000
+++ /dev/null
@@ -1,145 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_SYNC_LOG_H
-#define CEPH_RGW_SYNC_LOG_H
-
-#include <atomic>
-
-#include "common/ceph_mutex.h"
-#include "common/shunique_lock.h"
-#include "common/admin_socket.h"
-
-#include <set>
-#include <ostream>
-#include <string>
-#include <shared_mutex>
-#include <boost/circular_buffer.hpp>
-
-#define SSTR(o) ({      \
-  std::stringstream ss; \
-  ss << o;              \
-  ss.str();             \
-})
-
-#define RGW_SNS_FLAG_ACTIVE   1
-#define RGW_SNS_FLAG_ERROR    2
-
-class RGWRados;
-class RGWSyncTraceManager;
-class RGWSyncTraceNode;
-class RGWSyncTraceServiceMapThread;
-
-using RGWSyncTraceNodeRef = std::shared_ptr<RGWSyncTraceNode>;
-
-class RGWSyncTraceNode final {
-  friend class RGWSyncTraceManager;
-
-  CephContext *cct;
-  RGWSyncTraceNodeRef parent;
-
-  uint16_t state{0};
-  std::string status;
-
-  ceph::mutex lock = ceph::make_mutex("RGWSyncTraceNode::lock");
-
-  std::string type;
-  std::string id;
-
-  std::string prefix;
-
-  std::string resource_name;
-
-  uint64_t handle;
-
-  boost::circular_buffer<std::string> history;
-
-  // private constructor, create with RGWSyncTraceManager::add_node()
-  RGWSyncTraceNode(CephContext *_cct, uint64_t _handle,
-                   const RGWSyncTraceNodeRef& _parent,
-                   const std::string& _type, const std::string& _id);
-
- public:
-  void set_resource_name(const std::string& s) {
-    resource_name = s;
-  }
-
-  const std::string& get_resource_name() {
-    return resource_name;
-  }
-
-  void set_flag(uint16_t s) {
-    state |= s;
-  }
-  void unset_flag(uint16_t s) {
-    state &= ~s;
-  }
-  bool test_flags(uint16_t f) {
-    return (state & f) == f;
-  }
-  void log(int level, const std::string& s);
-
-  std::string to_str() {
-    return prefix + " " + status;
-  }
-
-  const std::string& get_prefix() {
-    return prefix;
-  }
-
-  std::ostream& operator<<(std::ostream& os) { 
-    os << to_str();
-    return os;            
-  }
-
-  boost::circular_buffer<std::string>& get_history() {
-    return history;
-  }
-
-  bool match(const std::string& search_term, bool search_history);
-};
-
-class RGWSyncTraceManager : public AdminSocketHook {
-  friend class RGWSyncTraceNode;
-
-  mutable std::shared_timed_mutex lock;
-  using shunique_lock = ceph::shunique_lock<decltype(lock)>;
-
-  CephContext *cct;
-  RGWSyncTraceServiceMapThread *service_map_thread{nullptr};
-
-  std::map<uint64_t, RGWSyncTraceNodeRef> nodes;
-  boost::circular_buffer<RGWSyncTraceNodeRef> complete_nodes;
-
-  std::atomic<uint64_t> count = { 0 };
-
-  std::list<std::array<std::string, 3> > admin_commands;
-
-  uint64_t alloc_handle() {
-    return ++count;
-  }
-  void finish_node(RGWSyncTraceNode *node);
-
-public:
-  RGWSyncTraceManager(CephContext *_cct, int max_lru) : cct(_cct), complete_nodes(max_lru) {}
-  ~RGWSyncTraceManager();
-
-  void init(RGWRados *store);
-
-  const RGWSyncTraceNodeRef root_node;
-
-  RGWSyncTraceNodeRef add_node(const RGWSyncTraceNodeRef& parent,
-                               const std::string& type,
-                               const std::string& id = "");
-
-  int hook_to_admin_command();
-  int call(std::string_view command, const cmdmap_t& cmdmap,
-          const bufferlist&,
-          Formatter *f,
-          std::ostream& ss,
-          bufferlist& out) override;
-  std::string get_active_names();
-};
-
-
-#endif
diff --git a/src/rgw/store/rados/rgw_tools.cc b/src/rgw/store/rados/rgw_tools.cc
deleted file mode 100644 (file)
index 5a8aefa..0000000
+++ /dev/null
@@ -1,414 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "common/errno.h"
-#include "librados/librados_asio.h"
-
-#include "include/stringify.h"
-
-#include "rgw_tools.h"
-#include "rgw_acl_s3.h"
-#include "rgw_aio_throttle.h"
-#include "rgw_compression.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-#define READ_CHUNK_LEN (512 * 1024)
-
-using namespace std;
-
-int rgw_init_ioctx(const DoutPrefixProvider *dpp,
-                   librados::Rados *rados, const rgw_pool& pool,
-                   librados::IoCtx& ioctx, bool create,
-                  bool mostly_omap)
-{
-  int r = rados->ioctx_create(pool.name.c_str(), ioctx);
-  if (r == -ENOENT && create) {
-    r = rados->pool_create(pool.name.c_str());
-    if (r == -ERANGE) {
-      ldpp_dout(dpp, 0)
-        << __func__
-        << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
-        << " (this can be due to a pool or placement group misconfiguration, e.g."
-        << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
-        << dendl;
-    }
-    if (r < 0 && r != -EEXIST) {
-      return r;
-    }
-
-    r = rados->ioctx_create(pool.name.c_str(), ioctx);
-    if (r < 0) {
-      return r;
-    }
-
-    r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
-    if (r < 0 && r != -EOPNOTSUPP) {
-      return r;
-    }
-
-    if (mostly_omap) {
-      // set pg_autoscale_bias
-      bufferlist inbl;
-      float bias = g_conf().get_val<double>("rgw_rados_pool_autoscale_bias");
-      int r = rados->mon_command(
-       "{\"prefix\": \"osd pool set\", \"pool\": \"" +
-       pool.name + "\", \"var\": \"pg_autoscale_bias\", \"val\": \"" +
-       stringify(bias) + "\"}",
-       inbl, NULL, NULL);
-      if (r < 0) {
-       ldpp_dout(dpp, 10) << __func__ << " warning: failed to set pg_autoscale_bias on "
-                << pool.name << dendl;
-      }
-      // set recovery_priority
-      int p = g_conf().get_val<uint64_t>("rgw_rados_pool_recovery_priority");
-      r = rados->mon_command(
-       "{\"prefix\": \"osd pool set\", \"pool\": \"" +
-       pool.name + "\", \"var\": \"recovery_priority\": \"" +
-       stringify(p) + "\"}",
-       inbl, NULL, NULL);
-      if (r < 0) {
-       ldpp_dout(dpp, 10) << __func__ << " warning: failed to set recovery_priority on "
-                << pool.name << dendl;
-      }
-    }
-  } else if (r < 0) {
-    return r;
-  }
-  if (!pool.ns.empty()) {
-    ioctx.set_namespace(pool.ns);
-  }
-  return 0;
-}
-
-map<string, bufferlist>* no_change_attrs() {
-  static map<string, bufferlist> no_change;
-  return &no_change;
-}
-
-int rgw_put_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
-                       const rgw_pool& pool, const string& oid, bufferlist& data, bool exclusive,
-                       RGWObjVersionTracker *objv_tracker, real_time set_mtime, optional_yield y, map<string, bufferlist> *pattrs)
-{
-  map<string,bufferlist> no_attrs;
-  if (!pattrs) {
-    pattrs = &no_attrs;
-  }
-
-  rgw_raw_obj obj(pool, oid);
-
-  auto sysobj = svc_sysobj->get_obj(obj);
-  int ret;
-
-  if (pattrs != no_change_attrs()) {
-    ret = sysobj.wop()
-      .set_objv_tracker(objv_tracker)
-      .set_exclusive(exclusive)
-      .set_mtime(set_mtime)
-      .set_attrs(*pattrs)
-      .write(dpp, data, y);
-  } else {
-    ret = sysobj.wop()
-      .set_objv_tracker(objv_tracker)
-      .set_exclusive(exclusive)
-      .set_mtime(set_mtime)
-      .write_data(dpp, data, y);
-  }
-
-  return ret;
-}
-
-int rgw_stat_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
-                        const rgw_pool& pool, const std::string& key,
-                        RGWObjVersionTracker *objv_tracker,
-                       real_time *pmtime, optional_yield y,
-                       std::map<std::string, bufferlist> *pattrs)
-{
-  rgw_raw_obj obj(pool, key);
-  auto sysobj = svc_sysobj->get_obj(obj);
-  return sysobj.rop()
-               .set_attrs(pattrs)
-               .set_last_mod(pmtime)
-               .stat(y, dpp);
-}
-
-
-int rgw_get_system_obj(RGWSI_SysObj* svc_sysobj, const rgw_pool& pool, const string& key, bufferlist& bl,
-                       RGWObjVersionTracker *objv_tracker, real_time *pmtime, optional_yield y,
-                       const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs,
-                       rgw_cache_entry_info *cache_info,
-                      boost::optional<obj_version> refresh_version, bool raw_attrs)
-{
-  const rgw_raw_obj obj(pool, key);
-  auto sysobj = svc_sysobj->get_obj(obj);
-  auto rop = sysobj.rop();
-  return rop.set_attrs(pattrs)
-            .set_last_mod(pmtime)
-            .set_objv_tracker(objv_tracker)
-            .set_raw_attrs(raw_attrs)
-            .set_cache_info(cache_info)
-            .set_refresh_version(refresh_version)
-            .read(dpp, &bl, y);
-}
-
-int rgw_delete_system_obj(const DoutPrefixProvider *dpp, 
-                          RGWSI_SysObj *sysobj_svc, const rgw_pool& pool, const string& oid,
-                          RGWObjVersionTracker *objv_tracker, optional_yield y)
-{
-  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid});
-  rgw_raw_obj obj(pool, oid);
-  return sysobj.wop()
-               .set_objv_tracker(objv_tracker)
-               .remove(dpp, y);
-}
-
-int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
-                      librados::ObjectReadOperation *op, bufferlist* pbl,
-                      optional_yield y, int flags)
-{
-  // given a yield_context, call async_operate() to yield the coroutine instead
-  // of blocking
-  if (y) {
-    auto& context = y.get_io_context();
-    auto& yield = y.get_yield_context();
-    boost::system::error_code ec;
-    auto bl = librados::async_operate(
-      context, ioctx, oid, op, flags, yield[ec]);
-    if (pbl) {
-      *pbl = std::move(bl);
-    }
-    return -ec.value();
-  }
-  // work on asio threads should be asynchronous, so warn when they block
-  if (is_asio_thread) {
-    ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
-  }
-  return ioctx.operate(oid, op, nullptr, flags);
-}
-
-int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
-                      librados::ObjectWriteOperation *op, optional_yield y,
-                     int flags)
-{
-  if (y) {
-    auto& context = y.get_io_context();
-    auto& yield = y.get_yield_context();
-    boost::system::error_code ec;
-    librados::async_operate(context, ioctx, oid, op, flags, yield[ec]);
-    return -ec.value();
-  }
-  if (is_asio_thread) {
-    ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
-  }
-  return ioctx.operate(oid, op, flags);
-}
-
-int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
-                     bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl,
-                     optional_yield y)
-{
-  if (y) {
-    auto& context = y.get_io_context();
-    auto& yield = y.get_yield_context();
-    boost::system::error_code ec;
-    auto reply = librados::async_notify(context, ioctx, oid,
-                                        bl, timeout_ms, yield[ec]);
-    if (pbl) {
-      *pbl = std::move(reply);
-    }
-    return -ec.value();
-  }
-  if (is_asio_thread) {
-    ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
-  }
-  return ioctx.notify2(oid, bl, timeout_ms, pbl);
-}
-
-void rgw_filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
-                        map<string, bufferlist> *attrset)
-{
-  attrset->clear();
-  map<string, bufferlist>::iterator iter;
-  for (iter = unfiltered_attrset.lower_bound(check_prefix);
-       iter != unfiltered_attrset.end(); ++iter) {
-    if (!boost::algorithm::starts_with(iter->first, check_prefix))
-      break;
-    (*attrset)[iter->first] = iter->second;
-  }
-}
-
-RGWDataAccess::RGWDataAccess(rgw::sal::Driver* _driver) : driver(_driver)
-{
-}
-
-
-int RGWDataAccess::Bucket::finish_init()
-{
-  auto iter = attrs.find(RGW_ATTR_ACL);
-  if (iter == attrs.end()) {
-    return 0;
-  }
-
-  bufferlist::const_iterator bliter = iter->second.begin();
-  try {
-    policy.decode(bliter);
-  } catch (buffer::error& err) {
-    return -EIO;
-  }
-
-  return 0;
-}
-
-int RGWDataAccess::Bucket::init(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  std::unique_ptr<rgw::sal::Bucket> bucket;
-  int ret = sd->driver->get_bucket(dpp, nullptr, tenant, name, &bucket, y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  bucket_info = bucket->get_info();
-  mtime = bucket->get_modification_time();
-  attrs = bucket->get_attrs();
-
-  return finish_init();
-}
-
-int RGWDataAccess::Bucket::init(const RGWBucketInfo& _bucket_info,
-                               const map<string, bufferlist>& _attrs)
-{
-  bucket_info = _bucket_info;
-  attrs = _attrs;
-
-  return finish_init();
-}
-
-int RGWDataAccess::Bucket::get_object(const rgw_obj_key& key,
-                                     ObjectRef *obj) {
-  obj->reset(new Object(sd, shared_from_this(), key));
-  return 0;
-}
-
-int RGWDataAccess::Object::put(bufferlist& data,
-                              map<string, bufferlist>& attrs,
-                               const DoutPrefixProvider *dpp,
-                               optional_yield y)
-{
-  rgw::sal::Driver* driver = sd->driver;
-  CephContext *cct = driver->ctx();
-
-  string tag;
-  append_rand_alpha(cct, tag, tag, 32);
-
-  RGWBucketInfo& bucket_info = bucket->bucket_info;
-
-  rgw::BlockingAioThrottle aio(driver->ctx()->_conf->rgw_put_obj_min_window_size);
-
-  std::unique_ptr<rgw::sal::Bucket> b;
-  driver->get_bucket(NULL, bucket_info, &b);
-  std::unique_ptr<rgw::sal::Object> obj = b->get_object(key);
-
-  auto& owner = bucket->policy.get_owner();
-
-  string req_id = driver->zone_unique_id(driver->get_new_req_id());
-
-  std::unique_ptr<rgw::sal::Writer> processor;
-  processor = driver->get_atomic_writer(dpp, y, std::move(obj),
-                                      owner.get_id(),
-                                      nullptr, olh_epoch, req_id);
-
-  int ret = processor->prepare(y);
-  if (ret < 0)
-    return ret;
-
-  rgw::sal::DataProcessor *filter = processor.get();
-
-  CompressorRef plugin;
-  boost::optional<RGWPutObj_Compress> compressor;
-
-  const auto& compression_type = driver->get_compression_type(bucket_info.placement_rule);
-  if (compression_type != "none") {
-    plugin = Compressor::create(driver->ctx(), compression_type);
-    if (!plugin) {
-      ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
-        << compression_type << dendl;
-    } else {
-      compressor.emplace(driver->ctx(), plugin, filter);
-      filter = &*compressor;
-    }
-  }
-
-  off_t ofs = 0;
-  auto obj_size = data.length();
-
-  RGWMD5Etag etag_calc;
-
-  do {
-    size_t read_len = std::min(data.length(), (unsigned int)cct->_conf->rgw_max_chunk_size);
-
-    bufferlist bl;
-
-    data.splice(0, read_len, &bl);
-    etag_calc.update(bl);
-
-    ret = filter->process(std::move(bl), ofs);
-    if (ret < 0)
-      return ret;
-
-    ofs += read_len;
-  } while (data.length() > 0);
-
-  ret = filter->process({}, ofs);
-  if (ret < 0) {
-    return ret;
-  }
-  bool has_etag_attr = false;
-  auto iter = attrs.find(RGW_ATTR_ETAG);
-  if (iter != attrs.end()) {
-    bufferlist& bl = iter->second;
-    etag = bl.to_str();
-    has_etag_attr = true;
-  }
-
-  if (!aclbl) {
-    RGWAccessControlPolicy_S3 policy(cct);
-
-    policy.create_canned(bucket->policy.get_owner(), bucket->policy.get_owner(), string()); /* default private policy */
-
-    policy.encode(aclbl.emplace());
-  }
-
-  if (etag.empty()) {
-    etag_calc.finish(&etag);
-  }
-
-  if (!has_etag_attr) {
-    bufferlist etagbl;
-    etagbl.append(etag);
-    attrs[RGW_ATTR_ETAG] = etagbl;
-  }
-  attrs[RGW_ATTR_ACL] = *aclbl;
-
-  string *puser_data = nullptr;
-  if (user_data) {
-    puser_data = &(*user_data);
-  }
-
-  return processor->complete(obj_size, etag,
-                           &mtime, mtime,
-                           attrs, delete_at,
-                            nullptr, nullptr,
-                            puser_data,
-                            nullptr, nullptr, y);
-}
-
-void RGWDataAccess::Object::set_policy(const RGWAccessControlPolicy& policy)
-{
-  policy.encode(aclbl.emplace());
-}
-
-void rgw_complete_aio_completion(librados::AioCompletion* c, int r) {
-  auto pc = c->pc;
-  librados::CB_AioCompleteAndSafe cb(pc);
-  cb(r);
-}
diff --git a/src/rgw/store/rados/rgw_tools.h b/src/rgw/store/rados/rgw_tools.h
deleted file mode 100644 (file)
index 6aeb9b8..0000000
+++ /dev/null
@@ -1,277 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_TOOLS_H
-#define CEPH_RGW_TOOLS_H
-
-#include <string>
-
-#include "include/types.h"
-#include "include/ceph_hash.h"
-
-#include "common/ceph_time.h"
-
-#include "rgw_common.h"
-#include "rgw_sal_fwd.h"
-
-class RGWSI_SysObj;
-
-class RGWRados;
-struct RGWObjVersionTracker;
-class optional_yield;
-
-struct obj_version;
-
-
-int rgw_init_ioctx(const DoutPrefixProvider *dpp,
-                   librados::Rados *rados, const rgw_pool& pool,
-                   librados::IoCtx& ioctx,
-                  bool create = false,
-                  bool mostly_omap = false);
-
-#define RGW_NO_SHARD -1
-
-#define RGW_SHARDS_PRIME_0 7877
-#define RGW_SHARDS_PRIME_1 65521
-
-extern const std::string MP_META_SUFFIX;
-
-inline int rgw_shards_max()
-{
-  return RGW_SHARDS_PRIME_1;
-}
-
-// only called by rgw_shard_id and rgw_bucket_shard_index
-static inline int rgw_shards_mod(unsigned hval, int max_shards)
-{
-  if (max_shards <= RGW_SHARDS_PRIME_0) {
-    return hval % RGW_SHARDS_PRIME_0 % max_shards;
-  }
-  return hval % RGW_SHARDS_PRIME_1 % max_shards;
-}
-
-// used for logging and tagging
-inline int rgw_shard_id(const std::string& key, int max_shards)
-{
-  return rgw_shards_mod(ceph_str_hash_linux(key.c_str(), key.size()),
-                       max_shards);
-}
-
-void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& key, std::string& name, int *shard_id);
-void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name);
-void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name);
-
-int rgw_put_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
-                       const rgw_pool& pool, const std::string& oid,
-                       bufferlist& data, bool exclusive,
-                       RGWObjVersionTracker *objv_tracker,
-                       real_time set_mtime, optional_yield y,
-                       std::map<std::string, bufferlist> *pattrs = nullptr);
-int rgw_get_system_obj(RGWSI_SysObj* svc_sysobj, const rgw_pool& pool,
-                       const std::string& key, bufferlist& bl,
-                       RGWObjVersionTracker *objv_tracker, real_time *pmtime,
-                       optional_yield y, const DoutPrefixProvider *dpp,
-                       std::map<std::string, bufferlist> *pattrs = nullptr,
-                       rgw_cache_entry_info *cache_info = nullptr,
-                      boost::optional<obj_version> refresh_version = boost::none,
-                       bool raw_attrs=false);
-int rgw_delete_system_obj(const DoutPrefixProvider *dpp, 
-                          RGWSI_SysObj *sysobj_svc, const rgw_pool& pool, const std::string& oid,
-                          RGWObjVersionTracker *objv_tracker, optional_yield y);
-int rgw_stat_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
-                        const rgw_pool& pool, const std::string& key,
-                        RGWObjVersionTracker *objv_tracker,
-                        real_time *pmtime, optional_yield y,
-                        std::map<std::string, bufferlist> *pattrs = nullptr);
-
-const char *rgw_find_mime_by_ext(std::string& ext);
-
-void rgw_filter_attrset(std::map<std::string, bufferlist>& unfiltered_attrset, const std::string& check_prefix,
-                        std::map<std::string, bufferlist> *attrset);
-
-/// indicates whether the current thread is in boost::asio::io_context::run(),
-/// used to log warnings if synchronous librados calls are made
-extern thread_local bool is_asio_thread;
-
-/// perform the rados operation, using the yield context when given
-int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
-                      librados::ObjectReadOperation *op, bufferlist* pbl,
-                      optional_yield y, int flags = 0);
-int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
-                      librados::ObjectWriteOperation *op, optional_yield y,
-                     int flags = 0);
-int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
-                     bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl,
-                     optional_yield y);
-
-int rgw_tools_init(const DoutPrefixProvider *dpp, CephContext *cct);
-void rgw_tools_cleanup();
-
-template<class H, size_t S>
-class RGWEtag
-{
-  H hash;
-
-public:
-  RGWEtag() {
-    if constexpr (std::is_same_v<H, MD5>) {
-      // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
-      hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
-    }
-  }
-
-  void update(const char *buf, size_t len) {
-    hash.Update((const unsigned char *)buf, len);
-  }
-
-  void update(bufferlist& bl) {
-    if (bl.length() > 0) {
-      update(bl.c_str(), bl.length());
-    }
-  }
-
-  void update(const std::string& s) {
-    if (!s.empty()) {
-      update(s.c_str(), s.size());
-    }
-  }
-  void finish(std::string *etag) {
-    char etag_buf[S];
-    char etag_buf_str[S * 2 + 16];
-
-    hash.Final((unsigned char *)etag_buf);
-    buf_to_hex((const unsigned char *)etag_buf, S,
-              etag_buf_str);
-
-    *etag = etag_buf_str;
-  }
-};
-
-using RGWMD5Etag = RGWEtag<MD5, CEPH_CRYPTO_MD5_DIGESTSIZE>;
-
-class RGWDataAccess
-{
-  rgw::sal::Driver* driver;
-
-public:
-  RGWDataAccess(rgw::sal::Driver* _driver);
-
-  class Object;
-  class Bucket;
-
-  using BucketRef = std::shared_ptr<Bucket>;
-  using ObjectRef = std::shared_ptr<Object>;
-
-  class Bucket : public std::enable_shared_from_this<Bucket> {
-    friend class RGWDataAccess;
-    friend class Object;
-
-    RGWDataAccess *sd{nullptr};
-    RGWBucketInfo bucket_info;
-    std::string tenant;
-    std::string name;
-    std::string bucket_id;
-    ceph::real_time mtime;
-    std::map<std::string, bufferlist> attrs;
-
-    RGWAccessControlPolicy policy;
-    int finish_init();
-    
-    Bucket(RGWDataAccess *_sd,
-          const std::string& _tenant,
-          const std::string& _name,
-          const std::string& _bucket_id) : sd(_sd),
-                                       tenant(_tenant),
-                                       name(_name),
-                                      bucket_id(_bucket_id) {}
-    Bucket(RGWDataAccess *_sd) : sd(_sd) {}
-    int init(const DoutPrefixProvider *dpp, optional_yield y);
-    int init(const RGWBucketInfo& _bucket_info, const std::map<std::string, bufferlist>& _attrs);
-  public:
-    int get_object(const rgw_obj_key& key,
-                  ObjectRef *obj);
-
-  };
-
-
-  class Object {
-    RGWDataAccess *sd{nullptr};
-    BucketRef bucket;
-    rgw_obj_key key;
-
-    ceph::real_time mtime;
-    std::string etag;
-    uint64_t olh_epoch{0};
-    ceph::real_time delete_at;
-    std::optional<std::string> user_data;
-
-    std::optional<bufferlist> aclbl;
-
-    Object(RGWDataAccess *_sd,
-           BucketRef&& _bucket,
-           const rgw_obj_key& _key) : sd(_sd),
-                                      bucket(_bucket),
-                                      key(_key) {}
-  public:
-    int put(bufferlist& data, std::map<std::string, bufferlist>& attrs, const DoutPrefixProvider *dpp, optional_yield y); /* might modify attrs */
-
-    void set_mtime(const ceph::real_time& _mtime) {
-      mtime = _mtime;
-    }
-
-    void set_etag(const std::string& _etag) {
-      etag = _etag;
-    }
-
-    void set_olh_epoch(uint64_t epoch) {
-      olh_epoch = epoch;
-    }
-
-    void set_delete_at(ceph::real_time _delete_at) {
-      delete_at = _delete_at;
-    }
-
-    void set_user_data(const std::string& _user_data) {
-      user_data = _user_data;
-    }
-
-    void set_policy(const RGWAccessControlPolicy& policy);
-
-    friend class Bucket;
-  };
-
-  int get_bucket(const DoutPrefixProvider *dpp, 
-                 const std::string& tenant,
-                const std::string name,
-                const std::string bucket_id,
-                BucketRef *bucket,
-                optional_yield y) {
-    bucket->reset(new Bucket(this, tenant, name, bucket_id));
-    return (*bucket)->init(dpp, y);
-  }
-
-  int get_bucket(const RGWBucketInfo& bucket_info,
-                const std::map<std::string, bufferlist>& attrs,
-                BucketRef *bucket) {
-    bucket->reset(new Bucket(this));
-    return (*bucket)->init(bucket_info, attrs);
-  }
-  friend class Bucket;
-  friend class Object;
-};
-
-using RGWDataAccessRef = std::shared_ptr<RGWDataAccess>;
-
-/// Complete an AioCompletion. To return error values or otherwise
-/// satisfy the caller. Useful for making complicated asynchronous
-/// calls and error handling.
-void rgw_complete_aio_completion(librados::AioCompletion* c, int r);
-
-/// This returns a static, non-NULL pointer, recognized only by
-/// rgw_put_system_obj(). When supplied instead of the attributes, the
-/// attributes will be unmodified.
-///
-// (Currently providing nullptr will wipe all attributes.)
-
-std::map<std::string, ceph::buffer::list>* no_change_attrs();
-#endif
diff --git a/src/rgw/store/rados/rgw_trim_bilog.cc b/src/rgw/store/rados/rgw_trim_bilog.cc
deleted file mode 100644 (file)
index 6ddda5d..0000000
+++ /dev/null
@@ -1,1445 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2017 Red Hat, Inc
- *
- * Author: Casey Bodley <cbodley@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- */
-
-#include <mutex>
-#include <boost/circular_buffer.hpp>
-#include <boost/container/flat_map.hpp>
-
-#include "include/scope_guard.h"
-#include "common/bounded_key_counter.h"
-#include "common/errno.h"
-#include "rgw_trim_bilog.h"
-#include "rgw_cr_rados.h"
-#include "rgw_cr_rest.h"
-#include "rgw_cr_tools.h"
-#include "rgw_data_sync.h"
-#include "rgw_metadata.h"
-#include "rgw_sal.h"
-#include "rgw_zone.h"
-#include "rgw_sync.h"
-#include "rgw_bucket.h"
-
-#include "services/svc_zone.h"
-#include "services/svc_meta.h"
-#include "services/svc_bilog_rados.h"
-
-#include <boost/asio/yield.hpp>
-#include "include/ceph_assert.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-#undef dout_prefix
-#define dout_prefix (*_dout << "trim: ")
-
-using namespace std;
-
-using rgw::BucketTrimConfig;
-using BucketChangeCounter = BoundedKeyCounter<std::string, int>;
-
-const std::string rgw::BucketTrimStatus::oid = "bilog.trim";
-using rgw::BucketTrimStatus;
-
-
-// watch/notify api for gateways to coordinate about which buckets to trim
-enum TrimNotifyType {
-  NotifyTrimCounters = 0,
-  NotifyTrimComplete,
-};
-WRITE_RAW_ENCODER(TrimNotifyType);
-
-struct TrimNotifyHandler {
-  virtual ~TrimNotifyHandler() = default;
-
-  virtual void handle(bufferlist::const_iterator& input, bufferlist& output) = 0;
-};
-
-/// api to share the bucket trim counters between gateways in the same zone.
-/// each gateway will process different datalog shards, so the gateway that runs
-/// the trim process needs to accumulate their counters
-struct TrimCounters {
-  /// counter for a single bucket
-  struct BucketCounter {
-    std::string bucket; //< bucket instance metadata key
-    int count{0};
-
-    BucketCounter() = default;
-    BucketCounter(const std::string& bucket, int count)
-      : bucket(bucket), count(count) {}
-
-    void encode(bufferlist& bl) const;
-    void decode(bufferlist::const_iterator& p);
-  };
-  using Vector = std::vector<BucketCounter>;
-
-  /// request bucket trim counters from peer gateways
-  struct Request {
-    uint16_t max_buckets; //< maximum number of bucket counters to return
-
-    void encode(bufferlist& bl) const;
-    void decode(bufferlist::const_iterator& p);
-  };
-
-  /// return the current bucket trim counters
-  struct Response {
-    Vector bucket_counters;
-
-    void encode(bufferlist& bl) const;
-    void decode(bufferlist::const_iterator& p);
-  };
-
-  /// server interface to query the hottest buckets
-  struct Server {
-    virtual ~Server() = default;
-
-    virtual void get_bucket_counters(int count, Vector& counters) = 0;
-    virtual void reset_bucket_counters() = 0;
-  };
-
-  /// notify handler
-  class Handler : public TrimNotifyHandler {
-    Server *const server;
-   public:
-    explicit Handler(Server *server) : server(server) {}
-
-    void handle(bufferlist::const_iterator& input, bufferlist& output) override;
-  };
-};
-std::ostream& operator<<(std::ostream& out, const TrimCounters::BucketCounter& rhs)
-{
-  return out << rhs.bucket << ":" << rhs.count;
-}
-
-void TrimCounters::BucketCounter::encode(bufferlist& bl) const
-{
-  using ceph::encode;
-  // no versioning to save space
-  encode(bucket, bl);
-  encode(count, bl);
-}
-void TrimCounters::BucketCounter::decode(bufferlist::const_iterator& p)
-{
-  using ceph::decode;
-  decode(bucket, p);
-  decode(count, p);
-}
-WRITE_CLASS_ENCODER(TrimCounters::BucketCounter);
-
-void TrimCounters::Request::encode(bufferlist& bl) const
-{
-  ENCODE_START(1, 1, bl);
-  encode(max_buckets, bl);
-  ENCODE_FINISH(bl);
-}
-void TrimCounters::Request::decode(bufferlist::const_iterator& p)
-{
-  DECODE_START(1, p);
-  decode(max_buckets, p);
-  DECODE_FINISH(p);
-}
-WRITE_CLASS_ENCODER(TrimCounters::Request);
-
-void TrimCounters::Response::encode(bufferlist& bl) const
-{
-  ENCODE_START(1, 1, bl);
-  encode(bucket_counters, bl);
-  ENCODE_FINISH(bl);
-}
-void TrimCounters::Response::decode(bufferlist::const_iterator& p)
-{
-  DECODE_START(1, p);
-  decode(bucket_counters, p);
-  DECODE_FINISH(p);
-}
-WRITE_CLASS_ENCODER(TrimCounters::Response);
-
-void TrimCounters::Handler::handle(bufferlist::const_iterator& input,
-                                   bufferlist& output)
-{
-  Request request;
-  decode(request, input);
-  auto count = std::min<uint16_t>(request.max_buckets, 128);
-
-  Response response;
-  server->get_bucket_counters(count, response.bucket_counters);
-  encode(response, output);
-}
-
-/// api to notify peer gateways that trim has completed and their bucket change
-/// counters can be reset
-struct TrimComplete {
-  struct Request {
-    void encode(bufferlist& bl) const;
-    void decode(bufferlist::const_iterator& p);
-  };
-  struct Response {
-    void encode(bufferlist& bl) const;
-    void decode(bufferlist::const_iterator& p);
-  };
-
-  /// server interface to reset bucket counters
-  using Server = TrimCounters::Server;
-
-  /// notify handler
-  class Handler : public TrimNotifyHandler {
-    Server *const server;
-   public:
-    explicit Handler(Server *server) : server(server) {}
-
-    void handle(bufferlist::const_iterator& input, bufferlist& output) override;
-  };
-};
-
-void TrimComplete::Request::encode(bufferlist& bl) const
-{
-  ENCODE_START(1, 1, bl);
-  ENCODE_FINISH(bl);
-}
-void TrimComplete::Request::decode(bufferlist::const_iterator& p)
-{
-  DECODE_START(1, p);
-  DECODE_FINISH(p);
-}
-WRITE_CLASS_ENCODER(TrimComplete::Request);
-
-void TrimComplete::Response::encode(bufferlist& bl) const
-{
-  ENCODE_START(1, 1, bl);
-  ENCODE_FINISH(bl);
-}
-void TrimComplete::Response::decode(bufferlist::const_iterator& p)
-{
-  DECODE_START(1, p);
-  DECODE_FINISH(p);
-}
-WRITE_CLASS_ENCODER(TrimComplete::Response);
-
-void TrimComplete::Handler::handle(bufferlist::const_iterator& input,
-                                   bufferlist& output)
-{
-  Request request;
-  decode(request, input);
-
-  server->reset_bucket_counters();
-
-  Response response;
-  encode(response, output);
-}
-
-
-/// rados watcher for bucket trim notifications
-class BucketTrimWatcher : public librados::WatchCtx2 {
-  rgw::sal::RadosStore* const store;
-  const rgw_raw_obj& obj;
-  rgw_rados_ref ref;
-  uint64_t handle{0};
-
-  using HandlerPtr = std::unique_ptr<TrimNotifyHandler>;
-  boost::container::flat_map<TrimNotifyType, HandlerPtr> handlers;
-
- public:
-  BucketTrimWatcher(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
-                    TrimCounters::Server *counters)
-    : store(store), obj(obj) {
-    handlers.emplace(NotifyTrimCounters, new TrimCounters::Handler(counters));
-    handlers.emplace(NotifyTrimComplete, new TrimComplete::Handler(counters));
-  }
-
-  ~BucketTrimWatcher() {
-    stop();
-  }
-
-  int start(const DoutPrefixProvider *dpp) {
-    int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
-    if (r < 0) {
-      return r;
-    }
-
-    // register a watch on the realm's control object
-    r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
-    if (r == -ENOENT) {
-      constexpr bool exclusive = true;
-      r = ref.pool.ioctx().create(ref.obj.oid, exclusive);
-      if (r == -EEXIST || r == 0) {
-        r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
-      }
-    }
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << "Failed to watch " << ref.obj
-          << " with " << cpp_strerror(-r) << dendl;
-      ref.pool.ioctx().close();
-      return r;
-    }
-
-    ldpp_dout(dpp, 10) << "Watching " << ref.obj.oid << dendl;
-    return 0;
-  }
-
-  int restart() {
-    int r = ref.pool.ioctx().unwatch2(handle);
-    if (r < 0) {
-      lderr(store->ctx()) << "Failed to unwatch on " << ref.obj
-          << " with " << cpp_strerror(-r) << dendl;
-    }
-    r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
-    if (r < 0) {
-      lderr(store->ctx()) << "Failed to restart watch on " << ref.obj
-          << " with " << cpp_strerror(-r) << dendl;
-      ref.pool.ioctx().close();
-    }
-    return r;
-  }
-
-  void stop() {
-    if (handle) {
-      ref.pool.ioctx().unwatch2(handle);
-      ref.pool.ioctx().close();
-    }
-  }
-
-  /// respond to bucket trim notifications
-  void handle_notify(uint64_t notify_id, uint64_t cookie,
-                     uint64_t notifier_id, bufferlist& bl) override {
-    if (cookie != handle) {
-      return;
-    }
-    bufferlist reply;
-    try {
-      auto p = bl.cbegin();
-      TrimNotifyType type;
-      decode(type, p);
-
-      auto handler = handlers.find(type);
-      if (handler != handlers.end()) {
-        handler->second->handle(p, reply);
-      } else {
-        lderr(store->ctx()) << "no handler for notify type " << type << dendl;
-      }
-    } catch (const buffer::error& e) {
-      lderr(store->ctx()) << "Failed to decode notification: " << e.what() << dendl;
-    }
-    ref.pool.ioctx().notify_ack(ref.obj.oid, notify_id, cookie, reply);
-  }
-
-  /// reestablish the watch if it gets disconnected
-  void handle_error(uint64_t cookie, int err) override {
-    if (cookie != handle) {
-      return;
-    }
-    if (err == -ENOTCONN) {
-      ldout(store->ctx(), 4) << "Disconnected watch on " << ref.obj << dendl;
-      restart();
-    }
-  }
-};
-
-
-/// Interface to communicate with the trim manager about completed operations
-struct BucketTrimObserver {
-  virtual ~BucketTrimObserver() = default;
-
-  virtual void on_bucket_trimmed(std::string&& bucket_instance) = 0;
-  virtual bool trimmed_recently(const std::string_view& bucket_instance) = 0;
-};
-
-/// trim each bilog shard to the given marker, while limiting the number of
-/// concurrent requests
-class BucketTrimShardCollectCR : public RGWShardCollectCR {
-  static constexpr int MAX_CONCURRENT_SHARDS = 16;
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* const store;
-  const RGWBucketInfo& bucket_info;
-  rgw::bucket_index_layout_generation generation;
-  const std::vector<std::string>& markers; //< shard markers to trim
-  size_t i{0}; //< index of current shard marker
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to trim bilog shard: " << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  BucketTrimShardCollectCR(const DoutPrefixProvider *dpp,
-                           rgw::sal::RadosStore* store, const RGWBucketInfo& bucket_info,
-                          const rgw::bucket_index_layout_generation& generation,
-                           const std::vector<std::string>& markers)
-    : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS),
-      dpp(dpp), store(store), bucket_info(bucket_info),
-      generation(generation), markers(markers)
-  {}
-  bool spawn_next() override;
-};
-
-bool BucketTrimShardCollectCR::spawn_next()
-{
-  while (i < markers.size()) {
-    const auto& marker = markers[i];
-    const auto shard_id = i++;
-
-    // skip empty markers
-    if (!marker.empty()) {
-      ldpp_dout(dpp, 10) << "trimming bilog shard " << shard_id
-          << " of " << bucket_info.bucket << " at marker " << marker << dendl;
-      spawn(new RGWRadosBILogTrimCR(dpp, store, bucket_info, shard_id,
-                                    generation, std::string{}, marker),
-            false);
-      return true;
-    }
-  }
-  return false;
-}
-
-/// Delete a BI generation, limiting the number of requests in flight.
-class BucketCleanIndexCollectCR : public RGWShardCollectCR {
-  static constexpr int MAX_CONCURRENT_SHARDS = 16;
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* const store;
-  const RGWBucketInfo& bucket_info;
-  rgw::bucket_index_layout_generation index;
-  uint32_t shard = 0;
-  const uint32_t num_shards = rgw::num_shards(index);
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "clean index: " << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  BucketCleanIndexCollectCR(const DoutPrefixProvider *dpp,
-                           rgw::sal::RadosStore* store,
-                           const RGWBucketInfo& bucket_info,
-                           rgw::bucket_index_layout_generation index)
-    : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS),
-      dpp(dpp), store(store), bucket_info(bucket_info),
-      index(index)
-  {}
-  bool spawn_next() override {
-    if (shard < num_shards) {
-      RGWRados::BucketShard bs(store->getRados());
-      bs.init(dpp, bucket_info, index, shard);
-      spawn(new RGWRadosRemoveOidCR(store, std::move(bs.bucket_obj), nullptr),
-           false);
-      ++shard;
-      return true;
-    } else {
-      return false;
-    }
-  }
-};
-
-
-/// trim the bilog of all of the given bucket instance's shards
-class BucketTrimInstanceCR : public RGWCoroutine {
-  static constexpr auto MAX_RETRIES = 25u;
-  rgw::sal::RadosStore* const store;
-  RGWHTTPManager *const http;
-  BucketTrimObserver *const observer;
-  std::string bucket_instance;
-  rgw_bucket_get_sync_policy_params get_policy_params;
-  std::shared_ptr<rgw_bucket_get_sync_policy_result> source_policy;
-  rgw_bucket bucket;
-  const std::string& zone_id; //< my zone id
-  RGWBucketInfo _bucket_info;
-  const RGWBucketInfo *pbucket_info; //< pointer to bucket instance info to locate bucket indices
-  int child_ret = 0;
-  const DoutPrefixProvider *dpp;
-public:
-  struct StatusShards {
-    uint64_t generation = 0;
-    std::vector<rgw_bucket_shard_sync_info> shards;
-  };
-private:
-  std::vector<StatusShards> peer_status; //< sync status for each peer
-  std::vector<std::string> min_markers; //< min marker per shard
-
-  /// The log generation to trim
-  rgw::bucket_log_layout_generation totrim;
-
-  /// Generation to be cleaned/New bucket info (if any)
-  std::optional<std::pair<RGWBucketInfo,
-                         rgw::bucket_log_layout_generation>> clean_info;
-  /// Maximum number of times to attempt to put bucket info
-  unsigned retries = 0;
-
-  int take_min_generation() {
-    // Initialize the min_generation to the bucket's current
-    // generation, used in case we have no peers.
-    auto min_generation = pbucket_info->layout.logs.back().gen;
-
-    // Determine the minimum generation
-    if (auto m = std::min_element(peer_status.begin(),
-                                 peer_status.end(),
-                                 [](const StatusShards& l,
-                                    const StatusShards& r) {
-                                   return l.generation < r.generation;
-                                 }); m != peer_status.end()) {
-      min_generation = m->generation;
-    }
-
-    auto& logs = pbucket_info->layout.logs;
-    auto log = std::find_if(logs.begin(), logs.end(),
-                           rgw::matches_gen(min_generation));
-    if (log == logs.end()) {
-      ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                       << "ERROR: No log layout for min_generation="
-                       << min_generation << dendl;
-      return -ENOENT;
-    }
-
-    totrim = *log;
-    return 0;
-  }
-
-  /// If there is a generation below the minimum, prepare to clean it up.
-  int maybe_remove_generation() {
-    if (clean_info)
-      return 0;
-
-
-    if (pbucket_info->layout.logs.front().gen < totrim.gen) {
-      clean_info = {*pbucket_info, {}};
-      auto log = clean_info->first.layout.logs.cbegin();
-      clean_info->second = *log;
-
-      if (clean_info->first.layout.logs.size() == 1) {
-       ldpp_dout(dpp, -1)
-         << "Critical error! Attempt to remove only log generation! "
-         << "log.gen=" << log->gen << ", totrim.gen=" << totrim.gen
-         << dendl;
-       return -EIO;
-      }
-      clean_info->first.layout.logs.erase(log);
-    }
-    return 0;
-  }
-
- public:
-  BucketTrimInstanceCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
-                       BucketTrimObserver *observer,
-                       const std::string& bucket_instance,
-                       const DoutPrefixProvider *dpp)
-    : RGWCoroutine(store->ctx()), store(store),
-      http(http), observer(observer),
-      bucket_instance(bucket_instance),
-      zone_id(store->svc()->zone->get_zone().id),
-      dpp(dpp) {
-    rgw_bucket_parse_bucket_key(cct, bucket_instance, &bucket, nullptr);
-    source_policy = make_shared<rgw_bucket_get_sync_policy_result>();
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-namespace {
-/// populate the status with the minimum stable marker of each shard
-int take_min_status(
-  CephContext *cct,
-  const uint64_t min_generation,
-  std::vector<BucketTrimInstanceCR::StatusShards>::const_iterator first,
-  std::vector<BucketTrimInstanceCR::StatusShards>::const_iterator last,
-  std::vector<std::string> *status) {
-  for (auto peer = first; peer != last; ++peer) {
-    // Peers on later generations don't get a say in the matter
-    if (peer->generation > min_generation) {
-      continue;
-    }
-    if (peer->shards.size() != status->size()) {
-      // all peers must agree on the number of shards
-      return -EINVAL;
-    }
-
-    auto m = status->begin();
-    for (auto& shard : peer->shards) {
-      auto& marker = *m++;
-      // always take the first marker, or any later marker that's smaller
-      if (peer == first || marker > shard.inc_marker.position) {
-       marker = std::move(shard.inc_marker.position);
-      }
-    }
-  }
-  return 0;
-}
-}
-
-template<>
-inline int parse_decode_json<BucketTrimInstanceCR::StatusShards>(
-  BucketTrimInstanceCR::StatusShards& s, bufferlist& bl)
-{
-  JSONParser p;
-  if (!p.parse(bl.c_str(), bl.length())) {
-    return -EINVAL;
-  }
-
-  try {
-    bilog_status_v2 v;
-    decode_json_obj(v, &p);
-    s.generation = v.sync_status.incremental_gen;
-    s.shards = std::move(v.inc_status);
-  } catch (JSONDecoder::err& e) {
-    try {
-      // Fall back if we're talking to an old node that can't give v2
-      // output.
-      s.generation = 0;
-      decode_json_obj(s.shards, &p);
-    } catch (JSONDecoder::err& e) {
-      return -EINVAL;
-    }
-  }
-  return 0;
-}
-
-int BucketTrimInstanceCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    ldpp_dout(dpp, 4) << "starting trim on bucket=" << bucket_instance << dendl;
-
-    get_policy_params.zone = zone_id;
-    get_policy_params.bucket = bucket;
-    yield call(new RGWBucketGetSyncPolicyHandlerCR(store->svc()->rados->get_async_processor(),
-                                                   store,
-                                                   get_policy_params,
-                                                   source_policy,
-                                                   dpp));
-    if (retcode < 0) {
-      if (retcode != -ENOENT) {
-        ldpp_dout(dpp, 0) << "ERROR: failed to fetch policy handler for bucket=" << bucket << dendl;
-      }
-
-      return set_cr_error(retcode);
-    }
-
-    if (auto& opt_bucket_info = source_policy->policy_handler->get_bucket_info();
-        opt_bucket_info) {
-      pbucket_info = &(*opt_bucket_info);
-    } else {
-      /* this shouldn't really happen */
-      return set_cr_error(-ENOENT);
-    }
-
-    if (pbucket_info->layout.logs.empty()) {
-      return set_cr_done(); // no bilogs to trim
-    }
-
-    // query peers for sync status
-    set_status("fetching sync status from relevant peers");
-    yield {
-      const auto& all_dests = source_policy->policy_handler->get_all_dests();
-
-      vector<rgw_zone_id> zids;
-      rgw_zone_id last_zid;
-      for (auto& diter : all_dests) {
-        const auto& zid = diter.first;
-        if (zid == last_zid) {
-          continue;
-        }
-        last_zid = zid;
-        zids.push_back(zid);
-      }
-
-      peer_status.resize(zids.size());
-
-      auto& zone_conn_map = store->svc()->zone->get_zone_conn_map();
-
-      auto p = peer_status.begin();
-      for (auto& zid : zids) {
-        // query data sync status from each sync peer
-        rgw_http_param_pair params[] = {
-          { "type", "bucket-index" },
-          { "status", nullptr },
-          { "options", "merge" },
-          { "bucket", bucket_instance.c_str() }, /* equal to source-bucket when `options==merge` and source-bucket
-                                                    param is not provided */
-          { "source-zone", zone_id.c_str() },
-          { "version", "2" },
-          { nullptr, nullptr }
-        };
-
-        auto ziter = zone_conn_map.find(zid);
-        if (ziter == zone_conn_map.end()) {
-          ldpp_dout(dpp, 0) << "WARNING: no connection to zone " << zid << ", can't trim bucket: " << bucket << dendl;
-          return set_cr_error(-ECANCELED);
-        }
-
-       using StatusCR = RGWReadRESTResourceCR<StatusShards>;
-        spawn(new StatusCR(cct, ziter->second, http, "/admin/log/", params, &*p),
-              false);
-        ++p;
-      }
-    }
-    // wait for a response from each peer. all must respond to attempt trim
-    while (num_spawned()) {
-      yield wait_for_child();
-      collect(&child_ret, nullptr);
-      if (child_ret < 0) {
-        drain_all();
-        return set_cr_error(child_ret);
-      }
-    }
-
-    // Determine the minimum generation
-    retcode = take_min_generation();
-    if (retcode < 0) {
-      ldpp_dout(dpp, 4) << "failed to find minimum generation" << dendl;
-      return set_cr_error(retcode);
-    }
-    retcode = maybe_remove_generation();
-    if (retcode < 0) {
-      ldpp_dout(dpp, 4) << "error removing old generation from log: "
-                       << cpp_strerror(retcode) << dendl;
-      return set_cr_error(retcode);
-    }
-
-    if (clean_info) {
-      if (clean_info->second.layout.type != rgw::BucketLogType::InIndex) {
-       ldpp_dout(dpp, 0) << "Unable to convert log of unknown type "
-                         << clean_info->second.layout.type
-                         << " to rgw::bucket_index_layout_generation " << dendl;
-       return set_cr_error(-EINVAL);
-      }
-
-      yield call(new BucketCleanIndexCollectCR(dpp, store, clean_info->first,
-                                              clean_info->second.layout.in_index));
-      if (retcode < 0) {
-       ldpp_dout(dpp, 0) << "failed to remove previous generation: "
-                         << cpp_strerror(retcode) << dendl;
-       return set_cr_error(retcode);
-      }
-      while (clean_info && retries < MAX_RETRIES) {
-       yield call(new RGWPutBucketInstanceInfoCR(
-                    store->svc()->rados->get_async_processor(),
-                    store, clean_info->first, false, {},
-                    no_change_attrs(), dpp));
-
-       // Raced, try again.
-       if (retcode == -ECANCELED) {
-         yield call(new RGWGetBucketInstanceInfoCR(
-                      store->svc()->rados->get_async_processor(),
-                      store, clean_info->first.bucket,
-                      &(clean_info->first), nullptr, dpp));
-         if (retcode < 0) {
-           ldpp_dout(dpp, 0) << "failed to get bucket info: "
-                             << cpp_strerror(retcode) << dendl;
-           return set_cr_error(retcode);
-         }
-         if (clean_info->first.layout.logs.front().gen ==
-             clean_info->second.gen) {
-           clean_info->first.layout.logs.erase(
-             clean_info->first.layout.logs.begin());
-           ++retries;
-           continue;
-         }
-         // Raced, but someone else did what we needed to.
-         retcode = 0;
-       }
-
-       if (retcode < 0) {
-         ldpp_dout(dpp, 0) << "failed to put bucket info: "
-                           << cpp_strerror(retcode) << dendl;
-         return set_cr_error(retcode);
-       }
-       clean_info = std::nullopt;
-      }
-    } else {
-      if (totrim.layout.type != rgw::BucketLogType::InIndex) {
-       ldpp_dout(dpp, 0) << "Unable to convert log of unknown type "
-                         << totrim.layout.type
-                         << " to rgw::bucket_index_layout_generation " << dendl;
-       return set_cr_error(-EINVAL);
-      }
-      // To avoid hammering the OSD too hard, either trim old
-      // generations OR trim the current one.
-
-      // determine the minimum marker for each shard
-
-      // initialize each shard with the maximum marker, which is only used when
-      // there are no peers syncing from us
-      min_markers.assign(std::max(1u, rgw::num_shards(totrim.layout.in_index)),
-                        RGWSyncLogTrimCR::max_marker);
-
-
-      retcode = take_min_status(cct, totrim.gen, peer_status.cbegin(),
-                               peer_status.cend(), &min_markers);
-      if (retcode < 0) {
-       ldpp_dout(dpp, 4) << "failed to correlate bucket sync status from peers" << dendl;
-       return set_cr_error(retcode);
-      }
-
-      // trim shards with a ShardCollectCR
-      ldpp_dout(dpp, 10) << "trimming bilogs for bucket=" << pbucket_info->bucket
-                        << " markers=" << min_markers << ", shards=" << min_markers.size() << dendl;
-      set_status("trimming bilog shards");
-      yield call(new BucketTrimShardCollectCR(dpp, store, *pbucket_info, totrim.layout.in_index,
-                                             min_markers));
-      // ENODATA just means there were no keys to trim
-      if (retcode == -ENODATA) {
-       retcode = 0;
-      }
-      if (retcode < 0) {
-       ldpp_dout(dpp, 4) << "failed to trim bilog shards: "
-                         << cpp_strerror(retcode) << dendl;
-       return set_cr_error(retcode);
-      }
-    }
-
-    observer->on_bucket_trimmed(std::move(bucket_instance));
-    return set_cr_done();
-  }
-  return 0;
-}
-
-/// trim each bucket instance while limiting the number of concurrent operations
-
-class BucketTrimInstanceCollectCR : public RGWShardCollectCR {
-  rgw::sal::RadosStore* const store;
-  RGWHTTPManager *const http;
-  BucketTrimObserver *const observer;
-  std::vector<std::string>::const_iterator bucket;
-  std::vector<std::string>::const_iterator end;
-  const DoutPrefixProvider *dpp;
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to trim bucket instance: " << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  BucketTrimInstanceCollectCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
-                              BucketTrimObserver *observer,
-                              const std::vector<std::string>& buckets,
-                              int max_concurrent,
-                              const DoutPrefixProvider *dpp)
-    : RGWShardCollectCR(store->ctx(), max_concurrent),
-      store(store), http(http), observer(observer),
-      bucket(buckets.begin()), end(buckets.end()),
-      dpp(dpp)
-  {}
-  bool spawn_next() override;
-};
-
-bool BucketTrimInstanceCollectCR::spawn_next()
-{
-  if (bucket == end) {
-    return false;
-  }
-  spawn(new BucketTrimInstanceCR(store, http, observer, *bucket, dpp), false);
-  ++bucket;
-  return true;
-}
-
-/// correlate the replies from each peer gateway into the given counter
-int accumulate_peer_counters(bufferlist& bl, BucketChangeCounter& counter)
-{
-  counter.clear();
-
-  try {
-    // decode notify responses
-    auto p = bl.cbegin();
-    std::map<std::pair<uint64_t, uint64_t>, bufferlist> replies;
-    std::set<std::pair<uint64_t, uint64_t>> timeouts;
-    decode(replies, p);
-    decode(timeouts, p);
-
-    for (auto& peer : replies) {
-      auto q = peer.second.cbegin();
-      TrimCounters::Response response;
-      decode(response, q);
-      for (const auto& b : response.bucket_counters) {
-        counter.insert(b.bucket, b.count);
-      }
-    }
-  } catch (const buffer::error& e) {
-    return -EIO;
-  }
-  return 0;
-}
-
-/// metadata callback has the signature bool(string&& key, string&& marker)
-using MetadataListCallback = std::function<bool(std::string&&, std::string&&)>;
-
-/// lists metadata keys, passing each to a callback until it returns false.
-/// on reaching the end, it will restart at the beginning and list up to the
-/// initial marker
-class AsyncMetadataList : public RGWAsyncRadosRequest {
-  CephContext *const cct;
-  RGWMetadataManager *const mgr;
-  const std::string section;
-  const std::string start_marker;
-  MetadataListCallback callback;
-
-  int _send_request(const DoutPrefixProvider *dpp) override;
- public:
-  AsyncMetadataList(CephContext *cct, RGWCoroutine *caller,
-                    RGWAioCompletionNotifier *cn, RGWMetadataManager *mgr,
-                    const std::string& section, const std::string& start_marker,
-                    const MetadataListCallback& callback)
-    : RGWAsyncRadosRequest(caller, cn), cct(cct), mgr(mgr),
-      section(section), start_marker(start_marker), callback(callback)
-  {}
-};
-
-int AsyncMetadataList::_send_request(const DoutPrefixProvider *dpp)
-{
-  void* handle = nullptr;
-  std::list<std::string> keys;
-  bool truncated{false};
-  std::string marker;
-
-  // start a listing at the given marker
-  int r = mgr->list_keys_init(dpp, section, start_marker, &handle);
-  if (r == -EINVAL) {
-    // restart with empty marker below
-  } else if (r < 0) {
-    ldpp_dout(dpp, 10) << "failed to init metadata listing: "
-        << cpp_strerror(r) << dendl;
-    return r;
-  } else {
-    ldpp_dout(dpp, 20) << "starting metadata listing at " << start_marker << dendl;
-
-    // release the handle when scope exits
-    auto g = make_scope_guard([=, this] { mgr->list_keys_complete(handle); });
-
-    do {
-      // get the next key and marker
-      r = mgr->list_keys_next(dpp, handle, 1, keys, &truncated);
-      if (r < 0) {
-        ldpp_dout(dpp, 10) << "failed to list metadata: "
-            << cpp_strerror(r) << dendl;
-        return r;
-      }
-      marker = mgr->get_marker(handle);
-
-      if (!keys.empty()) {
-        ceph_assert(keys.size() == 1);
-        auto& key = keys.front();
-        if (!callback(std::move(key), std::move(marker))) {
-          return 0;
-        }
-      }
-    } while (truncated);
-
-    if (start_marker.empty()) {
-      // already listed all keys
-      return 0;
-    }
-  }
-
-  // restart the listing from the beginning (empty marker)
-  handle = nullptr;
-
-  r = mgr->list_keys_init(dpp, section, "", &handle);
-  if (r < 0) {
-    ldpp_dout(dpp, 10) << "failed to restart metadata listing: "
-        << cpp_strerror(r) << dendl;
-    return r;
-  }
-  ldpp_dout(dpp, 20) << "restarting metadata listing" << dendl;
-
-  // release the handle when scope exits
-  auto g = make_scope_guard([=, this] { mgr->list_keys_complete(handle); });
-  do {
-    // get the next key and marker
-    r = mgr->list_keys_next(dpp, handle, 1, keys, &truncated);
-    if (r < 0) {
-      ldpp_dout(dpp, 10) << "failed to list metadata: "
-          << cpp_strerror(r) << dendl;
-      return r;
-    }
-    marker = mgr->get_marker(handle);
-
-    if (!keys.empty()) {
-      ceph_assert(keys.size() == 1);
-      auto& key = keys.front();
-      // stop at original marker
-      if (marker > start_marker) {
-        return 0;
-      }
-      if (!callback(std::move(key), std::move(marker))) {
-        return 0;
-      }
-    }
-  } while (truncated);
-
-  return 0;
-}
-
-/// coroutine wrapper for AsyncMetadataList
-class MetadataListCR : public RGWSimpleCoroutine {
-  RGWAsyncRadosProcessor *const async_rados;
-  RGWMetadataManager *const mgr;
-  const std::string& section;
-  const std::string& start_marker;
-  MetadataListCallback callback;
-  RGWAsyncRadosRequest *req{nullptr};
- public:
-  MetadataListCR(CephContext *cct, RGWAsyncRadosProcessor *async_rados,
-                 RGWMetadataManager *mgr, const std::string& section,
-                 const std::string& start_marker,
-                 const MetadataListCallback& callback)
-    : RGWSimpleCoroutine(cct), async_rados(async_rados), mgr(mgr),
-      section(section), start_marker(start_marker), callback(callback)
-  {}
-  ~MetadataListCR() override {
-    request_cleanup();
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    req = new AsyncMetadataList(cct, this, stack->create_completion_notifier(),
-                                mgr, section, start_marker, callback);
-    async_rados->queue(req);
-    return 0;
-  }
-  int request_complete() override {
-    return req->get_ret_status();
-  }
-  void request_cleanup() override {
-    if (req) {
-      req->finish();
-      req = nullptr;
-    }
-  }
-};
-
-class BucketTrimCR : public RGWCoroutine {
-  rgw::sal::RadosStore* const store;
-  RGWHTTPManager *const http;
-  const BucketTrimConfig& config;
-  BucketTrimObserver *const observer;
-  const rgw_raw_obj& obj;
-  ceph::mono_time start_time;
-  bufferlist notify_replies;
-  BucketChangeCounter counter;
-  std::vector<std::string> buckets; //< buckets selected for trim
-  BucketTrimStatus status;
-  RGWObjVersionTracker objv; //< version tracker for trim status object
-  std::string last_cold_marker; //< position for next trim marker
-  const DoutPrefixProvider *dpp;
-
-  static const std::string section; //< metadata section for bucket instances
- public:
-  BucketTrimCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
-               const BucketTrimConfig& config, BucketTrimObserver *observer,
-               const rgw_raw_obj& obj, const DoutPrefixProvider *dpp)
-    : RGWCoroutine(store->ctx()), store(store), http(http), config(config),
-      observer(observer), obj(obj), counter(config.counter_size), dpp(dpp)
-  {}
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-const std::string BucketTrimCR::section{"bucket.instance"};
-
-int BucketTrimCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    start_time = ceph::mono_clock::now();
-
-    if (config.buckets_per_interval) {
-      // query watch/notify for hot buckets
-      ldpp_dout(dpp, 10) << "fetching active bucket counters" << dendl;
-      set_status("fetching active bucket counters");
-      yield {
-        // request the top bucket counters from each peer gateway
-        const TrimNotifyType type = NotifyTrimCounters;
-        TrimCounters::Request request{32};
-        bufferlist bl;
-        encode(type, bl);
-        encode(request, bl);
-        call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms,
-                                  &notify_replies));
-      }
-      if (retcode < 0) {
-        ldpp_dout(dpp, 10) << "failed to fetch peer bucket counters" << dendl;
-        return set_cr_error(retcode);
-      }
-
-      // select the hottest buckets for trim
-      retcode = accumulate_peer_counters(notify_replies, counter);
-      if (retcode < 0) {
-        ldout(cct, 4) << "failed to correlate peer bucket counters" << dendl;
-        return set_cr_error(retcode);
-      }
-      buckets.reserve(config.buckets_per_interval);
-
-      const int max_count = config.buckets_per_interval -
-                            config.min_cold_buckets_per_interval;
-      counter.get_highest(max_count,
-        [this] (const std::string& bucket, int count) {
-          buckets.push_back(bucket);
-        });
-    }
-
-    if (buckets.size() < config.buckets_per_interval) {
-      // read BucketTrimStatus for marker position
-      set_status("reading trim status");
-      using ReadStatus = RGWSimpleRadosReadCR<BucketTrimStatus>;
-      yield call(new ReadStatus(dpp, store->svc()->rados->get_async_processor(), store->svc()->sysobj, obj,
-                                &status, true, &objv));
-      if (retcode < 0) {
-        ldpp_dout(dpp, 10) << "failed to read bilog trim status: "
-            << cpp_strerror(retcode) << dendl;
-        return set_cr_error(retcode);
-      }
-      if (status.marker == "MAX") {
-        status.marker.clear(); // restart at the beginning
-      }
-      ldpp_dout(dpp, 10) << "listing cold buckets from marker="
-          << status.marker << dendl;
-
-      set_status("listing cold buckets for trim");
-      yield {
-        // capture a reference so 'this' remains valid in the callback
-        auto ref = boost::intrusive_ptr<RGWCoroutine>{this};
-        // list cold buckets to consider for trim
-        auto cb = [this, ref] (std::string&& bucket, std::string&& marker) {
-          // filter out keys that we trimmed recently
-          if (observer->trimmed_recently(bucket)) {
-            return true;
-          }
-          // filter out active buckets that we've already selected
-          auto i = std::find(buckets.begin(), buckets.end(), bucket);
-          if (i != buckets.end()) {
-            return true;
-          }
-          buckets.emplace_back(std::move(bucket));
-          // remember the last cold bucket spawned to update the status marker
-          last_cold_marker = std::move(marker);
-          // return true if there's room for more
-          return buckets.size() < config.buckets_per_interval;
-        };
-
-        call(new MetadataListCR(cct, store->svc()->rados->get_async_processor(),
-                                store->ctl()->meta.mgr,
-                                section, status.marker, cb));
-      }
-      if (retcode < 0) {
-        ldout(cct, 4) << "failed to list bucket instance metadata: "
-            << cpp_strerror(retcode) << dendl;
-        return set_cr_error(retcode);
-      }
-    }
-
-    // trim bucket instances with limited concurrency
-    set_status("trimming buckets");
-    ldpp_dout(dpp, 4) << "collected " << buckets.size() << " buckets for trim" << dendl;
-    yield call(new BucketTrimInstanceCollectCR(store, http, observer, buckets,
-                                               config.concurrent_buckets, dpp));
-    // ignore errors from individual buckets
-
-    // write updated trim status
-    if (!last_cold_marker.empty() && status.marker != last_cold_marker) {
-      set_status("writing updated trim status");
-      status.marker = std::move(last_cold_marker);
-      ldpp_dout(dpp, 20) << "writing bucket trim marker=" << status.marker << dendl;
-      using WriteStatus = RGWSimpleRadosWriteCR<BucketTrimStatus>;
-      yield call(new WriteStatus(dpp, store->svc()->rados->get_async_processor(), store->svc()->sysobj, obj,
-                                 status, &objv));
-      if (retcode < 0) {
-        ldpp_dout(dpp, 4) << "failed to write updated trim status: "
-            << cpp_strerror(retcode) << dendl;
-        return set_cr_error(retcode);
-      }
-    }
-
-    // notify peers that trim completed
-    set_status("trim completed");
-    yield {
-      const TrimNotifyType type = NotifyTrimComplete;
-      TrimComplete::Request request;
-      bufferlist bl;
-      encode(type, bl);
-      encode(request, bl);
-      call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms,
-                                nullptr));
-    }
-    if (retcode < 0) {
-      ldout(cct, 10) << "failed to notify peers of trim completion" << dendl;
-      return set_cr_error(retcode);
-    }
-
-    ldpp_dout(dpp, 4) << "bucket index log processing completed in "
-        << ceph::mono_clock::now() - start_time << dendl;
-    return set_cr_done();
-  }
-  return 0;
-}
-
-class BucketTrimPollCR : public RGWCoroutine {
-  rgw::sal::RadosStore* const store;
-  RGWHTTPManager *const http;
-  const BucketTrimConfig& config;
-  BucketTrimObserver *const observer;
-  const rgw_raw_obj& obj;
-  const std::string name{"trim"}; //< lock name
-  const std::string cookie;
-  const DoutPrefixProvider *dpp;
-
- public:
-  BucketTrimPollCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
-                   const BucketTrimConfig& config,
-                   BucketTrimObserver *observer, const rgw_raw_obj& obj,
-                   const DoutPrefixProvider *dpp)
-    : RGWCoroutine(store->ctx()), store(store), http(http),
-      config(config), observer(observer), obj(obj),
-      cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
-      dpp(dpp) {}
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int BucketTrimPollCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    for (;;) {
-      set_status("sleeping");
-      wait(utime_t{static_cast<time_t>(config.trim_interval_sec), 0});
-
-      // prevent others from trimming for our entire wait interval
-      set_status("acquiring trim lock");
-      yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
-                                          obj, name, cookie,
-                                          config.trim_interval_sec));
-      if (retcode < 0) {
-        ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
-        continue;
-      }
-
-      set_status("trimming");
-      yield call(new BucketTrimCR(store, http, config, observer, obj, dpp));
-      if (retcode < 0) {
-        // on errors, unlock so other gateways can try
-        set_status("unlocking");
-        yield call(new RGWSimpleRadosUnlockCR(store->svc()->rados->get_async_processor(), store,
-                                              obj, name, cookie));
-      }
-    }
-  }
-  return 0;
-}
-
-/// tracks a bounded list of events with timestamps. old events can be expired,
-/// and recent events can be searched by key. expiration depends on events being
-/// inserted in temporal order
-template <typename T, typename Clock = ceph::coarse_mono_clock>
-class RecentEventList {
- public:
-  using clock_type = Clock;
-  using time_point = typename clock_type::time_point;
-
-  RecentEventList(size_t max_size, const ceph::timespan& max_duration)
-    : events(max_size), max_duration(max_duration)
-  {}
-
-  /// insert an event at the given point in time. this time must be at least as
-  /// recent as the last inserted event
-  void insert(T&& value, const time_point& now) {
-    // ceph_assert(events.empty() || now >= events.back().time)
-    events.push_back(Event{std::move(value), now});
-  }
-
-  /// performs a linear search for an event matching the given key, whose type
-  /// U can be any that provides operator==(U, T)
-  template <typename U>
-  bool lookup(const U& key) const {
-    for (const auto& event : events) {
-      if (key == event.value) {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  /// remove events that are no longer recent compared to the given point in time
-  void expire_old(const time_point& now) {
-    const auto expired_before = now - max_duration;
-    while (!events.empty() && events.front().time < expired_before) {
-      events.pop_front();
-    }
-  }
-
- private:
-  struct Event {
-    T value;
-    time_point time;
-  };
-  boost::circular_buffer<Event> events;
-  const ceph::timespan max_duration;
-};
-
-namespace rgw {
-
-// read bucket trim configuration from ceph context
-void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config)
-{
-  const auto& conf = cct->_conf;
-
-  config.trim_interval_sec =
-      conf.get_val<int64_t>("rgw_sync_log_trim_interval");
-  config.counter_size = 512;
-  config.buckets_per_interval =
-      conf.get_val<int64_t>("rgw_sync_log_trim_max_buckets");
-  config.min_cold_buckets_per_interval =
-      conf.get_val<int64_t>("rgw_sync_log_trim_min_cold_buckets");
-  config.concurrent_buckets =
-      conf.get_val<int64_t>("rgw_sync_log_trim_concurrent_buckets");
-  config.notify_timeout_ms = 10000;
-  config.recent_size = 128;
-  config.recent_duration = std::chrono::hours(2);
-}
-
-class BucketTrimManager::Impl : public TrimCounters::Server,
-                                public BucketTrimObserver {
- public:
-   rgw::sal::RadosStore* const store;
-  const BucketTrimConfig config;
-
-  const rgw_raw_obj status_obj;
-
-  /// count frequency of bucket instance entries in the data changes log
-  BucketChangeCounter counter;
-
-  using RecentlyTrimmedBucketList = RecentEventList<std::string>;
-  using clock_type = RecentlyTrimmedBucketList::clock_type;
-  /// track recently trimmed buckets to focus trim activity elsewhere
-  RecentlyTrimmedBucketList trimmed;
-
-  /// serve the bucket trim watch/notify api
-  BucketTrimWatcher watcher;
-
-  /// protect data shared between data sync, trim, and watch/notify threads
-  std::mutex mutex;
-
-  Impl(rgw::sal::RadosStore* store, const BucketTrimConfig& config)
-    : store(store), config(config),
-      status_obj(store->svc()->zone->get_zone_params().log_pool, BucketTrimStatus::oid),
-      counter(config.counter_size),
-      trimmed(config.recent_size, config.recent_duration),
-      watcher(store, status_obj, this)
-  {}
-
-  /// TrimCounters::Server interface for watch/notify api
-  void get_bucket_counters(int count, TrimCounters::Vector& buckets) {
-    buckets.reserve(count);
-    std::lock_guard<std::mutex> lock(mutex);
-    counter.get_highest(count, [&buckets] (const std::string& key, int count) {
-                          buckets.emplace_back(key, count);
-                        });
-    ldout(store->ctx(), 20) << "get_bucket_counters: " << buckets << dendl;
-  }
-
-  void reset_bucket_counters() override {
-    ldout(store->ctx(), 20) << "bucket trim completed" << dendl;
-    std::lock_guard<std::mutex> lock(mutex);
-    counter.clear();
-    trimmed.expire_old(clock_type::now());
-  }
-
-  /// BucketTrimObserver interface to remember successfully-trimmed buckets
-  void on_bucket_trimmed(std::string&& bucket_instance) override {
-    ldout(store->ctx(), 20) << "trimmed bucket instance " << bucket_instance << dendl;
-    std::lock_guard<std::mutex> lock(mutex);
-    trimmed.insert(std::move(bucket_instance), clock_type::now());
-  }
-
-  bool trimmed_recently(const std::string_view& bucket_instance) override {
-    std::lock_guard<std::mutex> lock(mutex);
-    return trimmed.lookup(bucket_instance);
-  }
-};
-
-BucketTrimManager::BucketTrimManager(rgw::sal::RadosStore* store,
-                                     const BucketTrimConfig& config)
-  : impl(new Impl(store, config))
-{
-}
-BucketTrimManager::~BucketTrimManager() = default;
-
-int BucketTrimManager::init()
-{
-  return impl->watcher.start(this);
-}
-
-void BucketTrimManager::on_bucket_changed(const std::string_view& bucket)
-{
-  std::lock_guard<std::mutex> lock(impl->mutex);
-  // filter recently trimmed bucket instances out of bucket change counter
-  if (impl->trimmed.lookup(bucket)) {
-    return;
-  }
-  impl->counter.insert(std::string(bucket));
-}
-
-RGWCoroutine* BucketTrimManager::create_bucket_trim_cr(RGWHTTPManager *http)
-{
-  return new BucketTrimPollCR(impl->store, http, impl->config,
-                              impl.get(), impl->status_obj, this);
-}
-
-RGWCoroutine* BucketTrimManager::create_admin_bucket_trim_cr(RGWHTTPManager *http)
-{
-  // return the trim coroutine without any polling
-  return new BucketTrimCR(impl->store, http, impl->config,
-                          impl.get(), impl->status_obj, this);
-}
-
-CephContext* BucketTrimManager::get_cct() const
-{
-  return impl->store->ctx();
-}
-
-unsigned BucketTrimManager::get_subsys() const
-{
-  return dout_subsys;
-}
-
-std::ostream& BucketTrimManager::gen_prefix(std::ostream& out) const
-{
-  return out << "rgw bucket trim manager: ";
-}
-
-} // namespace rgw
-
-int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store,
-              RGWBucketInfo& bucket_info, uint64_t gen, int shard_id,
-              std::string_view start_marker, std::string_view end_marker)
-{
-  auto& logs = bucket_info.layout.logs;
-  auto log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(gen));
-  if (log == logs.end()) {
-    ldpp_dout(p, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                   << "ERROR: no log layout with gen=" << gen << dendl;
-    return -ENOENT;
-  }
-
-  auto log_layout = *log;
-
-  auto r = store->svc()->bilog_rados->log_trim(p, bucket_info, log_layout, shard_id, start_marker, end_marker);
-  if (r < 0) {
-    ldpp_dout(p, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
-                   << "ERROR: bilog_rados->log_trim returned r=" << r << dendl;
-  }
-  return r;
-}
diff --git a/src/rgw/store/rados/rgw_trim_bilog.h b/src/rgw/store/rados/rgw_trim_bilog.h
deleted file mode 100644 (file)
index 5b9c4cd..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2017 Red Hat, Inc
- *
- * Author: Casey Bodley <cbodley@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- */
-
-#ifndef RGW_SYNC_LOG_TRIM_H
-#define RGW_SYNC_LOG_TRIM_H
-
-#include <memory>
-#include <string_view>
-
-#include "include/common_fwd.h"
-#include "include/encoding.h"
-#include "common/ceph_time.h"
-#include "common/dout.h"
-#include "rgw_common.h"
-
-class RGWCoroutine;
-class RGWHTTPManager;
-
-namespace rgw {
-
-namespace sal {
-  class RadosStore;
-}
-
-/// Interface to inform the trim process about which buckets are most active
-struct BucketChangeObserver {
-  virtual ~BucketChangeObserver() = default;
-
-  virtual void on_bucket_changed(const std::string_view& bucket_instance) = 0;
-};
-
-/// Configuration for BucketTrimManager
-struct BucketTrimConfig {
-  /// time interval in seconds between bucket trim attempts
-  uint32_t trim_interval_sec{0};
-  /// maximum number of buckets to track with BucketChangeObserver
-  size_t counter_size{0};
-  /// maximum number of buckets to process each trim interval
-  uint32_t buckets_per_interval{0};
-  /// minimum number of buckets to choose from the global bucket instance list
-  uint32_t min_cold_buckets_per_interval{0};
-  /// maximum number of buckets to process in parallel
-  uint32_t concurrent_buckets{0};
-  /// timeout in ms for bucket trim notify replies
-  uint64_t notify_timeout_ms{0};
-  /// maximum number of recently trimmed buckets to remember (should be small
-  /// enough for a linear search)
-  size_t recent_size{0};
-  /// maximum duration to consider a trim as 'recent' (should be some multiple
-  /// of the trim interval, at least)
-  ceph::timespan recent_duration{0};
-};
-
-/// fill out the BucketTrimConfig from the ceph context
-void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config);
-
-/// Determines the buckets on which to focus trim activity, using two sources of
-/// input: the frequency of entries read from the data changes log, and a global
-/// listing of the bucket.instance metadata. This allows us to trim active
-/// buckets quickly, while also ensuring that all buckets will eventually trim
-class BucketTrimManager : public BucketChangeObserver, public DoutPrefixProvider {
-  class Impl;
-  std::unique_ptr<Impl> impl;
- public:
-  BucketTrimManager(sal::RadosStore *store, const BucketTrimConfig& config);
-  ~BucketTrimManager();
-
-  int init();
-
-  /// increment a counter for the given bucket instance
-  void on_bucket_changed(const std::string_view& bucket_instance) override;
-
-  /// create a coroutine to run the bucket trim process every trim interval
-  RGWCoroutine* create_bucket_trim_cr(RGWHTTPManager *http);
-
-  /// create a coroutine to trim buckets directly via radosgw-admin
-  RGWCoroutine* create_admin_bucket_trim_cr(RGWHTTPManager *http);
-
-  CephContext *get_cct() const override;
-  unsigned get_subsys() const;
-  std::ostream& gen_prefix(std::ostream& out) const;
-};
-
-/// provides persistent storage for the trim manager's current position in the
-/// list of bucket instance metadata
-struct BucketTrimStatus {
-  std::string marker; //< metadata key of current bucket instance
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(marker, bl);
-    ENCODE_FINISH(bl);
-  }
-  void decode(bufferlist::const_iterator& p) {
-    DECODE_START(1, p);
-    decode(marker, p);
-    DECODE_FINISH(p);
-  }
-
-  static const std::string oid;
-};
-
-} // namespace rgw
-
-WRITE_CLASS_ENCODER(rgw::BucketTrimStatus);
-
-int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store,
-              RGWBucketInfo& bucket_info, uint64_t gen, int shard_id,
-              std::string_view start_marker, std::string_view end_marker);
-
-#endif // RGW_SYNC_LOG_TRIM_H
diff --git a/src/rgw/store/rados/rgw_trim_datalog.cc b/src/rgw/store/rados/rgw_trim_datalog.cc
deleted file mode 100644 (file)
index 72a1600..0000000
+++ /dev/null
@@ -1,252 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include <vector>
-#include <string>
-
-#include "common/errno.h"
-
-#include "rgw_trim_datalog.h"
-#include "rgw_cr_rados.h"
-#include "rgw_cr_rest.h"
-#include "rgw_datalog.h"
-#include "rgw_data_sync.h"
-#include "rgw_zone.h"
-#include "rgw_bucket.h"
-
-#include "services/svc_zone.h"
-
-#include <boost/asio/yield.hpp>
-
-#define dout_subsys ceph_subsys_rgw
-
-#undef dout_prefix
-#define dout_prefix (*_dout << "data trim: ")
-
-namespace {
-
-class DatalogTrimImplCR : public RGWSimpleCoroutine {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* store;
-  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
-  int shard;
-  std::string marker;
-  std::string* last_trim_marker;
-
- public:
-  DatalogTrimImplCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, int shard,
-                   const std::string& marker, std::string* last_trim_marker)
-  : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), shard(shard),
-    marker(marker), last_trim_marker(last_trim_marker) {
-    set_description() << "Datalog trim shard=" << shard
-                     << " marker=" << marker;
-  }
-
-  int send_request(const DoutPrefixProvider *dpp) override {
-    set_status() << "sending request";
-    cn = stack->create_completion_notifier();
-    return store->svc()->datalog_rados->trim_entries(dpp, shard, marker,
-                                                    cn->completion());
-  }
-  int request_complete() override {
-    int r = cn->completion()->get_return_value();
-    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << "(): trim of shard=" << shard
-                 << " marker=" << marker << " returned r=" << r << dendl;
-
-    set_status() << "request complete; ret=" << r;
-    if (r != -ENODATA) {
-      return r;
-    }
-    // nothing left to trim, update last_trim_marker
-    if (*last_trim_marker < marker &&
-       marker != store->svc()->datalog_rados->max_marker()) {
-      *last_trim_marker = marker;
-    }
-    return 0;
-  }
-};
-
-/// return the marker that it's safe to trim up to
-const std::string& get_stable_marker(const rgw_data_sync_marker& m)
-{
-  return m.state == m.FullSync ? m.next_step_marker : m.marker;
-}
-
-/// populate the container starting with 'dest' with the minimum stable marker
-/// of each shard for all of the peers in [first, last)
-template <typename IterIn, typename IterOut>
-void take_min_markers(IterIn first, IterIn last, IterOut dest)
-{
-  if (first == last) {
-    return;
-  }
-  for (auto p = first; p != last; ++p) {
-    auto m = dest;
-    for (auto &shard : p->sync_markers) {
-      const auto& stable = get_stable_marker(shard.second);
-      if (*m > stable) {
-        *m = stable;
-      }
-      ++m;
-    }
-  }
-}
-
-} // anonymous namespace
-
-class DataLogTrimCR : public RGWCoroutine {
-  using TrimCR = DatalogTrimImplCR;
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* store;
-  RGWHTTPManager *http;
-  const int num_shards;
-  const std::string& zone_id; //< my zone id
-  std::vector<rgw_data_sync_status> peer_status; //< sync status for each peer
-  std::vector<std::string> min_shard_markers; //< min marker per shard
-  std::vector<std::string>& last_trim; //< last trimmed marker per shard
-  int ret{0};
-
- public:
-  DataLogTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
-                   int num_shards, std::vector<std::string>& last_trim)
-    : RGWCoroutine(store->ctx()), dpp(dpp), store(store), http(http),
-      num_shards(num_shards),
-      zone_id(store->svc()->zone->get_zone().id),
-      peer_status(store->svc()->zone->get_zone_data_notify_to_map().size()),
-      min_shard_markers(num_shards,
-                       std::string(store->svc()->datalog_rados->max_marker())),
-      last_trim(last_trim)
-  {}
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int DataLogTrimCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    ldpp_dout(dpp, 10) << "fetching sync status for zone " << zone_id << dendl;
-    set_status("fetching sync status");
-    yield {
-      // query data sync status from each sync peer
-      rgw_http_param_pair params[] = {
-        { "type", "data" },
-        { "status", nullptr },
-        { "source-zone", zone_id.c_str() },
-        { nullptr, nullptr }
-      };
-
-      auto p = peer_status.begin();
-      for (auto& c : store->svc()->zone->get_zone_data_notify_to_map()) {
-        ldpp_dout(dpp, 20) << "query sync status from " << c.first << dendl;
-        using StatusCR = RGWReadRESTResourceCR<rgw_data_sync_status>;
-        spawn(new StatusCR(cct, c.second, http, "/admin/log/", params, &*p),
-              false);
-        ++p;
-      }
-    }
-
-    // must get a successful reply from all peers to consider trimming
-    ret = 0;
-    while (ret == 0 && num_spawned() > 0) {
-      yield wait_for_child();
-      collect_next(&ret);
-    }
-    drain_all();
-
-    if (ret < 0) {
-      ldpp_dout(dpp, 4) << "failed to fetch sync status from all peers" << dendl;
-      return set_cr_error(ret);
-    }
-
-    ldpp_dout(dpp, 10) << "trimming log shards" << dendl;
-    set_status("trimming log shards");
-    yield {
-      // determine the minimum marker for each shard
-      take_min_markers(peer_status.begin(), peer_status.end(),
-                       min_shard_markers.begin());
-
-      for (int i = 0; i < num_shards; i++) {
-        const auto& m = min_shard_markers[i];
-        if (m <= last_trim[i]) {
-          continue;
-        }
-        ldpp_dout(dpp, 10) << "trimming log shard " << i
-            << " at marker=" << m
-            << " last_trim=" << last_trim[i] << dendl;
-        spawn(new TrimCR(dpp, store, i, m, &last_trim[i]),
-              true);
-      }
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
-
-RGWCoroutine* create_admin_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
-                                            RGWHTTPManager *http,
-                                            int num_shards,
-                                            std::vector<std::string>& markers)
-{
-  return new DataLogTrimCR(dpp, store, http, num_shards, markers);
-}
-
-class DataLogTrimPollCR : public RGWCoroutine {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* store;
-  RGWHTTPManager *http;
-  const int num_shards;
-  const utime_t interval; //< polling interval
-  const std::string lock_oid; //< use first data log shard for lock
-  const std::string lock_cookie;
-  std::vector<std::string> last_trim; //< last trimmed marker per shard
-
- public:
-  DataLogTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
-                    int num_shards, utime_t interval)
-    : RGWCoroutine(store->ctx()), dpp(dpp), store(store), http(http),
-      num_shards(num_shards), interval(interval),
-      lock_oid(store->svc()->datalog_rados->get_oid(0, 0)),
-      lock_cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
-      last_trim(num_shards)
-  {}
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int DataLogTrimPollCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    for (;;) {
-      set_status("sleeping");
-      wait(interval);
-
-      // request a 'data_trim' lock that covers the entire wait interval to
-      // prevent other gateways from attempting to trim for the duration
-      set_status("acquiring trim lock");
-      yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
-                                          rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, lock_oid),
-                                          "data_trim", lock_cookie,
-                                          interval.sec()));
-      if (retcode < 0) {
-        // if the lock is already held, go back to sleep and try again later
-        ldpp_dout(dpp, 4) << "failed to lock " << lock_oid << ", trying again in "
-            << interval.sec() << "s" << dendl;
-        continue;
-      }
-
-      set_status("trimming");
-      yield call(new DataLogTrimCR(dpp, store, http, num_shards, last_trim));
-
-      // note that the lock is not released. this is intentional, as it avoids
-      // duplicating this work in other gateways
-    }
-  }
-  return 0;
-}
-
-RGWCoroutine* create_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
-                                      RGWHTTPManager *http,
-                                      int num_shards, utime_t interval)
-{
-  return new DataLogTrimPollCR(dpp, store, http, num_shards, interval);
-}
diff --git a/src/rgw/store/rados/rgw_trim_datalog.h b/src/rgw/store/rados/rgw_trim_datalog.h
deleted file mode 100644 (file)
index 9f5bf72..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include "common/dout.h"
-
-class RGWCoroutine;
-class RGWRados;
-class RGWHTTPManager;
-class utime_t;
-namespace rgw { namespace sal {
-  class RadosStore;
-} }
-
-// DataLogTrimCR factory function
-extern RGWCoroutine* create_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
-                                             RGWHTTPManager *http,
-                                             int num_shards, utime_t interval);
-
-// factory function for datalog trim via radosgw-admin
-RGWCoroutine* create_admin_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
-                                            RGWHTTPManager *http,
-                                            int num_shards,
-                                            std::vector<std::string>& markers);
diff --git a/src/rgw/store/rados/rgw_trim_mdlog.cc b/src/rgw/store/rados/rgw_trim_mdlog.cc
deleted file mode 100644 (file)
index d8e1959..0000000
+++ /dev/null
@@ -1,795 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "common/errno.h"
-
-#include "rgw_trim_mdlog.h"
-#include "rgw_sync.h"
-#include "rgw_cr_rados.h"
-#include "rgw_cr_rest.h"
-#include "rgw_zone.h"
-#include "services/svc_zone.h"
-#include "services/svc_meta.h"
-#include "services/svc_mdlog.h"
-#include "services/svc_cls.h"
-
-#include <boost/asio/yield.hpp>
-
-#define dout_subsys ceph_subsys_rgw
-
-#undef dout_prefix
-#define dout_prefix (*_dout << "meta trim: ")
-
-/// purge all log shards for the given mdlog
-class PurgeLogShardsCR : public RGWShardCollectCR {
-  rgw::sal::RadosStore* const store;
-  const RGWMetadataLog* mdlog;
-  const int num_shards;
-  rgw_raw_obj obj;
-  int i{0};
-
-  static constexpr int max_concurrent = 16;
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to remove mdlog shard: " << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  PurgeLogShardsCR(rgw::sal::RadosStore* store, const RGWMetadataLog* mdlog,
-                   const rgw_pool& pool, int num_shards)
-    : RGWShardCollectCR(store->ctx(), max_concurrent),
-      store(store), mdlog(mdlog), num_shards(num_shards), obj(pool, "")
-  {}
-
-  bool spawn_next() override {
-    if (i == num_shards) {
-      return false;
-    }
-    mdlog->get_shard_oid(i++, obj.oid);
-    spawn(new RGWRadosRemoveCR(store, obj), false);
-    return true;
-  }
-};
-
-using Cursor = RGWPeriodHistory::Cursor;
-
-/// purge mdlogs from the oldest up to (but not including) the given realm_epoch
-class PurgePeriodLogsCR : public RGWCoroutine {
-  struct Svc {
-    RGWSI_Zone *zone;
-    RGWSI_MDLog *mdlog;
-  } svc;
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* const store;
-  RGWMetadataManager *const metadata;
-  RGWObjVersionTracker objv;
-  Cursor cursor;
-  epoch_t realm_epoch;
-  epoch_t *last_trim_epoch; //< update last trim on success
-
- public:
-  PurgePeriodLogsCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, epoch_t realm_epoch, epoch_t *last_trim)
-    : RGWCoroutine(store->ctx()), dpp(dpp), store(store), metadata(store->ctl()->meta.mgr),
-      realm_epoch(realm_epoch), last_trim_epoch(last_trim) {
-    svc.zone = store->svc()->zone;
-    svc.mdlog = store->svc()->mdlog;
-  }
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int PurgePeriodLogsCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    // read our current oldest log period
-    yield call(svc.mdlog->read_oldest_log_period_cr(dpp, &cursor, &objv));
-    if (retcode < 0) {
-      return set_cr_error(retcode);
-    }
-    ceph_assert(cursor);
-    ldpp_dout(dpp, 20) << "oldest log realm_epoch=" << cursor.get_epoch()
-        << " period=" << cursor.get_period().get_id() << dendl;
-
-    // trim -up to- the given realm_epoch
-    while (cursor.get_epoch() < realm_epoch) {
-      ldpp_dout(dpp, 4) << "purging log shards for realm_epoch=" << cursor.get_epoch()
-          << " period=" << cursor.get_period().get_id() << dendl;
-      yield {
-        const auto mdlog = svc.mdlog->get_log(cursor.get_period().get_id());
-        const auto& pool = svc.zone->get_zone_params().log_pool;
-        auto num_shards = cct->_conf->rgw_md_log_max_shards;
-        call(new PurgeLogShardsCR(store, mdlog, pool, num_shards));
-      }
-      if (retcode < 0) {
-        ldpp_dout(dpp, 1) << "failed to remove log shards: "
-            << cpp_strerror(retcode) << dendl;
-        return set_cr_error(retcode);
-      }
-      ldpp_dout(dpp, 10) << "removed log shards for realm_epoch=" << cursor.get_epoch()
-          << " period=" << cursor.get_period().get_id() << dendl;
-
-      // update our mdlog history
-      yield call(svc.mdlog->trim_log_period_cr(dpp, cursor, &objv));
-      if (retcode == -ENOENT) {
-        // must have raced to update mdlog history. return success and allow the
-        // winner to continue purging
-        ldpp_dout(dpp, 10) << "already removed log shards for realm_epoch=" << cursor.get_epoch()
-            << " period=" << cursor.get_period().get_id() << dendl;
-        return set_cr_done();
-      } else if (retcode < 0) {
-        ldpp_dout(dpp, 1) << "failed to remove log shards for realm_epoch="
-            << cursor.get_epoch() << " period=" << cursor.get_period().get_id()
-            << " with: " << cpp_strerror(retcode) << dendl;
-        return set_cr_error(retcode);
-      }
-
-      if (*last_trim_epoch < cursor.get_epoch()) {
-        *last_trim_epoch = cursor.get_epoch();
-      }
-
-      ceph_assert(cursor.has_next()); // get_current() should always come after
-      cursor.next();
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
-
-namespace {
-
-using connection_map = std::map<std::string, std::unique_ptr<RGWRESTConn>>;
-
-/// construct a RGWRESTConn for each zone in the realm
-template <typename Zonegroups>
-connection_map make_peer_connections(rgw::sal::RadosStore* store,
-                                     const Zonegroups& zonegroups)
-{
-  connection_map connections;
-  for (auto& g : zonegroups) {
-    for (auto& z : g.second.zones) {
-      std::unique_ptr<RGWRESTConn> conn{
-        new RGWRESTConn(store->ctx(), store, z.first.id, z.second.endpoints, g.second.api_name)};
-      connections.emplace(z.first.id, std::move(conn));
-    }
-  }
-  return connections;
-}
-
-/// return the marker that it's safe to trim up to
-const std::string& get_stable_marker(const rgw_meta_sync_marker& m)
-{
-  return m.state == m.FullSync ? m.next_step_marker : m.marker;
-}
-
-/// comparison operator for take_min_status()
-bool operator<(const rgw_meta_sync_marker& lhs, const rgw_meta_sync_marker& rhs)
-{
-  // sort by stable marker
-  return get_stable_marker(lhs) < get_stable_marker(rhs);
-}
-
-/// populate the status with the minimum stable marker of each shard for any
-/// peer whose realm_epoch matches the minimum realm_epoch in the input
-template <typename Iter>
-int take_min_status(CephContext *cct, Iter first, Iter last,
-                    rgw_meta_sync_status *status)
-{
-  if (first == last) {
-    return -EINVAL;
-  }
-  const size_t num_shards = cct->_conf->rgw_md_log_max_shards;
-
-  status->sync_info.realm_epoch = std::numeric_limits<epoch_t>::max();
-  for (auto p = first; p != last; ++p) {
-    // validate peer's shard count
-    if (p->sync_markers.size() != num_shards) {
-      ldout(cct, 1) << "take_min_status got peer status with "
-          << p->sync_markers.size() << " shards, expected "
-          << num_shards << dendl;
-      return -EINVAL;
-    }
-    if (p->sync_info.realm_epoch < status->sync_info.realm_epoch) {
-      // earlier epoch, take its entire status
-      *status = std::move(*p);
-    } else if (p->sync_info.realm_epoch == status->sync_info.realm_epoch) {
-      // same epoch, take any earlier markers
-      auto m = status->sync_markers.begin();
-      for (auto& shard : p->sync_markers) {
-        if (shard.second < m->second) {
-          m->second = std::move(shard.second);
-        }
-        ++m;
-      }
-    }
-  }
-  return 0;
-}
-
-struct TrimEnv {
-  const DoutPrefixProvider *dpp;
-  rgw::sal::RadosStore* const store;
-  RGWHTTPManager *const http;
-  int num_shards;
-  const rgw_zone_id& zone;
-  Cursor current; //< cursor to current period
-  epoch_t last_trim_epoch{0}; //< epoch of last mdlog that was purged
-
-  TrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
-    : dpp(dpp), store(store), http(http), num_shards(num_shards),
-      zone(store->svc()->zone->zone_id()),
-      current(store->svc()->mdlog->get_period_history()->get_current())
-  {}
-};
-
-struct MasterTrimEnv : public TrimEnv {
-  connection_map connections; //< peer connections
-  std::vector<rgw_meta_sync_status> peer_status; //< sync status for each peer
-  /// last trim marker for each shard, only applies to current period's mdlog
-  std::vector<std::string> last_trim_markers;
-
-  MasterTrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
-    : TrimEnv(dpp, store, http, num_shards),
-      last_trim_markers(num_shards)
-  {
-    auto& period = current.get_period();
-    connections = make_peer_connections(store, period.get_map().zonegroups);
-    connections.erase(zone.id);
-    peer_status.resize(connections.size());
-  }
-};
-
-struct PeerTrimEnv : public TrimEnv {
-  /// last trim timestamp for each shard, only applies to current period's mdlog
-  std::vector<ceph::real_time> last_trim_timestamps;
-
-  PeerTrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
-    : TrimEnv(dpp, store, http, num_shards),
-      last_trim_timestamps(num_shards)
-  {}
-
-  void set_num_shards(int num_shards) {
-    this->num_shards = num_shards;
-    last_trim_timestamps.resize(num_shards);
-  }
-};
-
-} // anonymous namespace
-
-
-/// spawn a trim cr for each shard that needs it, while limiting the number
-/// of concurrent shards
-class MetaMasterTrimShardCollectCR : public RGWShardCollectCR {
- private:
-  static constexpr int MAX_CONCURRENT_SHARDS = 16;
-
-  MasterTrimEnv& env;
-  RGWMetadataLog *mdlog;
-  int shard_id{0};
-  std::string oid;
-  const rgw_meta_sync_status& sync_status;
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to trim mdlog shard: " << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  MetaMasterTrimShardCollectCR(MasterTrimEnv& env, RGWMetadataLog *mdlog,
-                               const rgw_meta_sync_status& sync_status)
-    : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
-      env(env), mdlog(mdlog), sync_status(sync_status)
-  {}
-
-  bool spawn_next() override;
-};
-
-bool MetaMasterTrimShardCollectCR::spawn_next()
-{
-  while (shard_id < env.num_shards) {
-    auto m = sync_status.sync_markers.find(shard_id);
-    if (m == sync_status.sync_markers.end()) {
-      shard_id++;
-      continue;
-    }
-    auto& stable = get_stable_marker(m->second);
-    auto& last_trim = env.last_trim_markers[shard_id];
-
-    if (stable <= last_trim) {
-      // already trimmed
-      ldpp_dout(env.dpp, 20) << "skipping log shard " << shard_id
-          << " at marker=" << stable
-          << " last_trim=" << last_trim
-          << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
-      shard_id++;
-      continue;
-    }
-
-    mdlog->get_shard_oid(shard_id, oid);
-
-    ldpp_dout(env.dpp, 10) << "trimming log shard " << shard_id
-        << " at marker=" << stable
-        << " last_trim=" << last_trim
-        << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
-    spawn(new RGWSyncLogTrimCR(env.dpp, env.store, oid, stable, &last_trim), false);
-    shard_id++;
-    return true;
-  }
-  return false;
-}
-
-/// spawn rest requests to read each peer's sync status
-class MetaMasterStatusCollectCR : public RGWShardCollectCR {
-  static constexpr int MAX_CONCURRENT_SHARDS = 16;
-
-  MasterTrimEnv& env;
-  connection_map::iterator c;
-  std::vector<rgw_meta_sync_status>::iterator s;
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to fetch metadata sync status: "
-          << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  explicit MetaMasterStatusCollectCR(MasterTrimEnv& env)
-    : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
-      env(env), c(env.connections.begin()), s(env.peer_status.begin())
-  {}
-
-  bool spawn_next() override {
-    if (c == env.connections.end()) {
-      return false;
-    }
-    static rgw_http_param_pair params[] = {
-      { "type", "metadata" },
-      { "status", nullptr },
-      { nullptr, nullptr }
-    };
-
-    ldout(cct, 20) << "query sync status from " << c->first << dendl;
-    auto conn = c->second.get();
-    using StatusCR = RGWReadRESTResourceCR<rgw_meta_sync_status>;
-    spawn(new StatusCR(cct, conn, env.http, "/admin/log/", params, &*s),
-          false);
-    ++c;
-    ++s;
-    return true;
-  }
-};
-
-class MetaMasterTrimCR : public RGWCoroutine {
-  MasterTrimEnv& env;
-  rgw_meta_sync_status min_status; //< minimum sync status of all peers
-  int ret{0};
-
- public:
-  explicit MetaMasterTrimCR(MasterTrimEnv& env)
-    : RGWCoroutine(env.store->ctx()), env(env)
-  {}
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int MetaMasterTrimCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    // TODO: detect this and fail before we spawn the trim thread?
-    if (env.connections.empty()) {
-      ldpp_dout(dpp, 4) << "no peers, exiting" << dendl;
-      return set_cr_done();
-    }
-
-    ldpp_dout(dpp, 10) << "fetching sync status for zone " << env.zone << dendl;
-    // query mdlog sync status from peers
-    yield call(new MetaMasterStatusCollectCR(env));
-
-    // must get a successful reply from all peers to consider trimming
-    if (ret < 0) {
-      ldpp_dout(dpp, 4) << "failed to fetch sync status from all peers" << dendl;
-      return set_cr_error(ret);
-    }
-
-    // determine the minimum epoch and markers
-    ret = take_min_status(env.store->ctx(), env.peer_status.begin(),
-                          env.peer_status.end(), &min_status);
-    if (ret < 0) {
-      ldpp_dout(dpp, 4) << "failed to calculate min sync status from peers" << dendl;
-      return set_cr_error(ret);
-    }
-    yield {
-      auto store = env.store;
-      auto epoch = min_status.sync_info.realm_epoch;
-      ldpp_dout(dpp, 4) << "realm epoch min=" << epoch
-          << " current=" << env.current.get_epoch()<< dendl;
-      if (epoch > env.last_trim_epoch + 1) {
-        // delete any prior mdlog periods
-        spawn(new PurgePeriodLogsCR(dpp, store, epoch, &env.last_trim_epoch), true);
-      } else {
-        ldpp_dout(dpp, 10) << "mdlogs already purged up to realm_epoch "
-            << env.last_trim_epoch << dendl;
-      }
-
-      // if realm_epoch == current, trim mdlog based on markers
-      if (epoch == env.current.get_epoch()) {
-        auto mdlog = store->svc()->mdlog->get_log(env.current.get_period().get_id());
-        spawn(new MetaMasterTrimShardCollectCR(env, mdlog, min_status), true);
-      }
-    }
-    // ignore any errors during purge/trim because we want to hold the lock open
-    return set_cr_done();
-  }
-  return 0;
-}
-
-
-/// read the first entry of the master's mdlog shard and trim to that position
-class MetaPeerTrimShardCR : public RGWCoroutine {
-  RGWMetaSyncEnv& env;
-  RGWMetadataLog *mdlog;
-  const std::string& period_id;
-  const int shard_id;
-  RGWMetadataLogInfo info;
-  ceph::real_time stable; //< safe timestamp to trim, according to master
-  ceph::real_time *last_trim; //< last trimmed timestamp, updated on trim
-  rgw_mdlog_shard_data result; //< result from master's mdlog listing
-
- public:
-  MetaPeerTrimShardCR(RGWMetaSyncEnv& env, RGWMetadataLog *mdlog,
-                      const std::string& period_id, int shard_id,
-                      ceph::real_time *last_trim)
-    : RGWCoroutine(env.store->ctx()), env(env), mdlog(mdlog),
-      period_id(period_id), shard_id(shard_id), last_trim(last_trim)
-  {}
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int MetaPeerTrimShardCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    // query master's first mdlog entry for this shard
-    yield call(create_list_remote_mdlog_shard_cr(&env, period_id, shard_id,
-                                                 "", 1, &result));
-    if (retcode < 0) {
-      ldpp_dout(dpp, 5) << "failed to read first entry from master's mdlog shard "
-          << shard_id << " for period " << period_id
-          << ": " << cpp_strerror(retcode) << dendl;
-      return set_cr_error(retcode);
-    }
-    if (result.entries.empty()) {
-      // if there are no mdlog entries, we don't have a timestamp to compare. we
-      // can't just trim everything, because there could be racing updates since
-      // this empty reply. query the mdlog shard info to read its max timestamp,
-      // then retry the listing to make sure it's still empty before trimming to
-      // that
-      ldpp_dout(dpp, 10) << "empty master mdlog shard " << shard_id
-          << ", reading last timestamp from shard info" << dendl;
-      // read the mdlog shard info for the last timestamp
-      yield call(create_read_remote_mdlog_shard_info_cr(&env, period_id, shard_id, &info));
-      if (retcode < 0) {
-        ldpp_dout(dpp, 5) << "failed to read info from master's mdlog shard "
-            << shard_id << " for period " << period_id
-            << ": " << cpp_strerror(retcode) << dendl;
-        return set_cr_error(retcode);
-      }
-      if (ceph::real_clock::is_zero(info.last_update)) {
-        return set_cr_done(); // nothing to trim
-      }
-      ldpp_dout(dpp, 10) << "got mdlog shard info with last update="
-          << info.last_update << dendl;
-      // re-read the master's first mdlog entry to make sure it hasn't changed
-      yield call(create_list_remote_mdlog_shard_cr(&env, period_id, shard_id,
-                                                   "", 1, &result));
-      if (retcode < 0) {
-        ldpp_dout(dpp, 5) << "failed to read first entry from master's mdlog shard "
-            << shard_id << " for period " << period_id
-            << ": " << cpp_strerror(retcode) << dendl;
-        return set_cr_error(retcode);
-      }
-      // if the mdlog is still empty, trim to max marker
-      if (result.entries.empty()) {
-        stable = info.last_update;
-      } else {
-        stable = result.entries.front().timestamp;
-
-        // can only trim -up to- master's first timestamp, so subtract a second.
-        // (this is why we use timestamps instead of markers for the peers)
-        stable -= std::chrono::seconds(1);
-      }
-    } else {
-      stable = result.entries.front().timestamp;
-      stable -= std::chrono::seconds(1);
-    }
-
-    if (stable <= *last_trim) {
-      ldpp_dout(dpp, 10) << "skipping log shard " << shard_id
-          << " at timestamp=" << stable
-          << " last_trim=" << *last_trim << dendl;
-      return set_cr_done();
-    }
-
-    ldpp_dout(dpp, 10) << "trimming log shard " << shard_id
-        << " at timestamp=" << stable
-        << " last_trim=" << *last_trim << dendl;
-    yield {
-      std::string oid;
-      mdlog->get_shard_oid(shard_id, oid);
-      call(new RGWRadosTimelogTrimCR(dpp, env.store, oid, real_time{}, stable, "", ""));
-    }
-    if (retcode < 0 && retcode != -ENODATA) {
-      ldpp_dout(dpp, 1) << "failed to trim mdlog shard " << shard_id
-          << ": " << cpp_strerror(retcode) << dendl;
-      return set_cr_error(retcode);
-    }
-    *last_trim = stable;
-    return set_cr_done();
-  }
-  return 0;
-}
-
-class MetaPeerTrimShardCollectCR : public RGWShardCollectCR {
-  static constexpr int MAX_CONCURRENT_SHARDS = 16;
-
-  PeerTrimEnv& env;
-  RGWMetadataLog *mdlog;
-  const std::string& period_id;
-  RGWMetaSyncEnv meta_env; //< for RGWListRemoteMDLogShardCR
-  int shard_id{0};
-
-  int handle_result(int r) override {
-    if (r == -ENOENT) { // ENOENT is not a fatal error
-      return 0;
-    }
-    if (r < 0) {
-      ldout(cct, 4) << "failed to trim mdlog shard: " << cpp_strerror(r) << dendl;
-    }
-    return r;
-  }
- public:
-  MetaPeerTrimShardCollectCR(PeerTrimEnv& env, RGWMetadataLog *mdlog)
-    : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
-      env(env), mdlog(mdlog), period_id(env.current.get_period().get_id())
-  {
-    meta_env.init(env.dpp, cct, env.store, env.store->svc()->zone->get_master_conn(),
-                  env.store->svc()->rados->get_async_processor(), env.http, nullptr,
-                  env.store->getRados()->get_sync_tracer());
-  }
-
-  bool spawn_next() override;
-};
-
-bool MetaPeerTrimShardCollectCR::spawn_next()
-{
-  if (shard_id >= env.num_shards) {
-    return false;
-  }
-  auto& last_trim = env.last_trim_timestamps[shard_id];
-  spawn(new MetaPeerTrimShardCR(meta_env, mdlog, period_id, shard_id, &last_trim),
-        false);
-  shard_id++;
-  return true;
-}
-
-class MetaPeerTrimCR : public RGWCoroutine {
-  PeerTrimEnv& env;
-  rgw_mdlog_info mdlog_info; //< master's mdlog info
-
- public:
-  explicit MetaPeerTrimCR(PeerTrimEnv& env) : RGWCoroutine(env.store->ctx()), env(env) {}
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int MetaPeerTrimCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    ldpp_dout(dpp, 10) << "fetching master mdlog info" << dendl;
-    yield {
-      // query mdlog_info from master for oldest_log_period
-      rgw_http_param_pair params[] = {
-        { "type", "metadata" },
-        { nullptr, nullptr }
-      };
-
-      using LogInfoCR = RGWReadRESTResourceCR<rgw_mdlog_info>;
-      call(new LogInfoCR(cct, env.store->svc()->zone->get_master_conn(), env.http,
-                         "/admin/log/", params, &mdlog_info));
-    }
-    if (retcode < 0) {
-      ldpp_dout(dpp, 4) << "failed to read mdlog info from master" << dendl;
-      return set_cr_error(retcode);
-    }
-    // use master's shard count instead
-    env.set_num_shards(mdlog_info.num_shards);
-
-    if (mdlog_info.realm_epoch > env.last_trim_epoch + 1) {
-      // delete any prior mdlog periods
-      yield call(new PurgePeriodLogsCR(dpp, env.store, mdlog_info.realm_epoch,
-                                       &env.last_trim_epoch));
-    } else {
-      ldpp_dout(dpp, 10) << "mdlogs already purged through realm_epoch "
-          << env.last_trim_epoch << dendl;
-    }
-
-    // if realm_epoch == current, trim mdlog based on master's markers
-    if (mdlog_info.realm_epoch == env.current.get_epoch()) {
-      yield {
-        auto mdlog = env.store->svc()->mdlog->get_log(env.current.get_period().get_id());
-        call(new MetaPeerTrimShardCollectCR(env, mdlog));
-        // ignore any errors during purge/trim because we want to hold the lock open
-      }
-    }
-    return set_cr_done();
-  }
-  return 0;
-}
-
-class MetaTrimPollCR : public RGWCoroutine {
-  rgw::sal::RadosStore* const store;
-  const utime_t interval; //< polling interval
-  const rgw_raw_obj obj;
-  const std::string name{"meta_trim"}; //< lock name
-  const std::string cookie;
-
- protected:
-  /// allocate the coroutine to run within the lease
-  virtual RGWCoroutine* alloc_cr() = 0;
-
- public:
-  MetaTrimPollCR(rgw::sal::RadosStore* store, utime_t interval)
-    : RGWCoroutine(store->ctx()), store(store), interval(interval),
-      obj(store->svc()->zone->get_zone_params().log_pool, RGWMetadataLogHistory::oid),
-      cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct))
-  {}
-
-  int operate(const DoutPrefixProvider *dpp) override;
-};
-
-int MetaTrimPollCR::operate(const DoutPrefixProvider *dpp)
-{
-  reenter(this) {
-    for (;;) {
-      set_status("sleeping");
-      wait(interval);
-
-      // prevent others from trimming for our entire wait interval
-      set_status("acquiring trim lock");
-      yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
-                                          obj, name, cookie, interval.sec()));
-      if (retcode < 0) {
-        ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
-        continue;
-      }
-
-      set_status("trimming");
-      yield call(alloc_cr());
-
-      if (retcode < 0) {
-        // on errors, unlock so other gateways can try
-        set_status("unlocking");
-        yield call(new RGWSimpleRadosUnlockCR(store->svc()->rados->get_async_processor(), store,
-                                              obj, name, cookie));
-      }
-    }
-  }
-  return 0;
-}
-
-class MetaMasterTrimPollCR : public MetaTrimPollCR  {
-  MasterTrimEnv env; //< trim state to share between calls
-  RGWCoroutine* alloc_cr() override {
-    return new MetaMasterTrimCR(env);
-  }
- public:
-  MetaMasterTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
-                       int num_shards, utime_t interval)
-    : MetaTrimPollCR(store, interval),
-      env(dpp, store, http, num_shards)
-  {}
-};
-
-class MetaPeerTrimPollCR : public MetaTrimPollCR {
-  PeerTrimEnv env; //< trim state to share between calls
-  RGWCoroutine* alloc_cr() override {
-    return new MetaPeerTrimCR(env);
-  }
- public:
-  MetaPeerTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
-                     int num_shards, utime_t interval)
-    : MetaTrimPollCR(store, interval),
-      env(dpp, store, http, num_shards)
-  {}
-};
-
-namespace {
-bool sanity_check_endpoints(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store) {
-  bool retval = true;
-  auto current = store->svc()->mdlog->get_period_history()->get_current();
-  const auto& period = current.get_period();
-  for (const auto& [_, zonegroup] : period.get_map().zonegroups) {
-    if (zonegroup.endpoints.empty()) {
-      ldpp_dout(dpp, -1)
-       << __PRETTY_FUNCTION__ << ":" << __LINE__
-       << " WARNING: Cluster is is misconfigured! "
-       << " Zonegroup " << zonegroup.get_name()
-       << " (" << zonegroup.get_id() << ") in Realm "
-       << period.get_realm_name() << " ( " << period.get_realm() << ") "
-       << " has no endpoints!" << dendl;
-    }
-    for (const auto& [_, zone] : zonegroup.zones) {
-      if (zone.endpoints.empty()) {
-       ldpp_dout(dpp, -1)
-         << __PRETTY_FUNCTION__ << ":" << __LINE__
-         << " ERROR: Cluster is is misconfigured! "
-         << " Zone " << zone.name << " (" << zone.id << ") in Zonegroup "
-         << zonegroup.get_name() << " ( " << zonegroup.get_id()
-         << ") in Realm " << period.get_realm_name()
-         << " ( " << period.get_realm() << ") "
-         << " has no endpoints! Trimming is impossible." << dendl;
-       retval = false;
-      }
-    }
-  }
-  return retval;
-}
-}
-
-RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
-                                      int num_shards, utime_t interval)
-{
-  if (!sanity_check_endpoints(dpp, store)) {
-    ldpp_dout(dpp, -1)
-      << __PRETTY_FUNCTION__ << ":" << __LINE__
-      << " ERROR: Cluster is is misconfigured! Refusing to trim." << dendl;
-      return nullptr;
-  }
-  if (store->svc()->zone->is_meta_master()) {
-    return new MetaMasterTrimPollCR(dpp, store, http, num_shards, interval);
-  }
-  return new MetaPeerTrimPollCR(dpp, store, http, num_shards, interval);
-}
-
-
-struct MetaMasterAdminTrimCR : private MasterTrimEnv, public MetaMasterTrimCR {
-  MetaMasterAdminTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
-    : MasterTrimEnv(dpp, store, http, num_shards),
-      MetaMasterTrimCR(*static_cast<MasterTrimEnv*>(this))
-  {}
-};
-
-struct MetaPeerAdminTrimCR : private PeerTrimEnv, public MetaPeerTrimCR {
-  MetaPeerAdminTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
-    : PeerTrimEnv(dpp, store, http, num_shards),
-      MetaPeerTrimCR(*static_cast<PeerTrimEnv*>(this))
-  {}
-};
-
-RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
-                                            RGWHTTPManager *http,
-                                            int num_shards)
-{
-  if (!sanity_check_endpoints(dpp, store)) {
-    ldpp_dout(dpp, -1)
-      << __PRETTY_FUNCTION__ << ":" << __LINE__
-      << " ERROR: Cluster is is misconfigured! Refusing to trim." << dendl;
-      return nullptr;
-  }
-  if (store->svc()->zone->is_meta_master()) {
-    return new MetaMasterAdminTrimCR(dpp, store, http, num_shards);
-  }
-  return new MetaPeerAdminTrimCR(dpp, store, http, num_shards);
-}
diff --git a/src/rgw/store/rados/rgw_trim_mdlog.h b/src/rgw/store/rados/rgw_trim_mdlog.h
deleted file mode 100644 (file)
index 1dba861..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#pragma once
-
-class RGWCoroutine;
-class DoutPrefixProvider;
-class RGWRados;
-class RGWHTTPManager;
-class utime_t;
-namespace rgw { namespace sal {
-  class RadosStore;
-} }
-
-// MetaLogTrimCR factory function
-RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp,
-                                      rgw::sal::RadosStore* store,
-                                      RGWHTTPManager *http,
-                                      int num_shards, utime_t interval);
-
-// factory function for mdlog trim via radosgw-admin
-RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp,
-                                            rgw::sal::RadosStore* store,
-                                            RGWHTTPManager *http,
-                                            int num_shards);
diff --git a/src/rgw/store/rados/rgw_user.cc b/src/rgw/store/rados/rgw_user.cc
deleted file mode 100644 (file)
index 7c36a52..0000000
+++ /dev/null
@@ -1,2768 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "common/errno.h"
-
-#include "rgw_user.h"
-
-#include "rgw_bucket.h"
-
-#include "services/svc_user.h"
-#include "services/svc_meta.h"
-
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-
-extern void op_type_to_str(uint32_t mask, char *buf, int len);
-
-static string key_type_to_str(int key_type) {
-  switch (key_type) {
-    case KEY_TYPE_SWIFT:
-      return "swift";
-      break;
-
-    default:
-      return "s3";
-      break;
-  }
-}
-
-static bool char_is_unreserved_url(char c)
-{
-  if (isalnum(c))
-    return true;
-
-  switch (c) {
-  case '-':
-  case '.':
-  case '_':
-  case '~':
-    return true;
-  default:
-    return false;
-  }
-}
-
-static bool validate_access_key(string& key)
-{
-  const char *p = key.c_str();
-  while (*p) {
-    if (!char_is_unreserved_url(*p))
-      return false;
-    p++;
-  }
-  return true;
-}
-
-static void set_err_msg(std::string *sink, std::string msg)
-{
-  if (sink && !msg.empty())
-    *sink = msg;
-}
-
-/*
- * Dump either the full user info or a subset to a formatter.
- *
- * NOTE: It is the caller's responsibility to ensure that the
- * formatter is flushed at the correct time.
- */
-
-static void dump_subusers_info(Formatter *f, RGWUserInfo &info)
-{
-  map<string, RGWSubUser>::iterator uiter;
-
-  f->open_array_section("subusers");
-  for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) {
-    RGWSubUser& u = uiter->second;
-    f->open_object_section("user");
-    string s;
-    info.user_id.to_str(s);
-    f->dump_format("id", "%s:%s", s.c_str(), u.name.c_str());
-    char buf[256];
-    rgw_perm_to_str(u.perm_mask, buf, sizeof(buf));
-    f->dump_string("permissions", buf);
-    f->close_section();
-  }
-  f->close_section();
-}
-
-static void dump_access_keys_info(Formatter *f, RGWUserInfo &info)
-{
-  map<string, RGWAccessKey>::iterator kiter;
-  f->open_array_section("keys");
-  for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) {
-    RGWAccessKey& k = kiter->second;
-    const char *sep = (k.subuser.empty() ? "" : ":");
-    const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
-    f->open_object_section("key");
-    string s;
-    info.user_id.to_str(s);
-    f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser);
-    f->dump_string("access_key", k.id);
-    f->dump_string("secret_key", k.key);
-    f->close_section();
-  }
-  f->close_section();
-}
-
-static void dump_swift_keys_info(Formatter *f, RGWUserInfo &info)
-{
-  map<string, RGWAccessKey>::iterator kiter;
-  f->open_array_section("swift_keys");
-  for (kiter = info.swift_keys.begin(); kiter != info.swift_keys.end(); ++kiter) {
-    RGWAccessKey& k = kiter->second;
-    const char *sep = (k.subuser.empty() ? "" : ":");
-    const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
-    f->open_object_section("key");
-    string s;
-    info.user_id.to_str(s);
-    f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser);
-    f->dump_string("secret_key", k.key);
-    f->close_section();
-  }
-  f->close_section();
-}
-
-static void dump_user_info(Formatter *f, RGWUserInfo &info,
-                           RGWStorageStats *stats = NULL)
-{
-  f->open_object_section("user_info");
-  encode_json("tenant", info.user_id.tenant, f);
-  encode_json("user_id", info.user_id.id, f);
-  encode_json("display_name", info.display_name, f);
-  encode_json("email", info.user_email, f);
-  encode_json("suspended", (int)info.suspended, f);
-  encode_json("max_buckets", (int)info.max_buckets, f);
-
-  dump_subusers_info(f, info);
-  dump_access_keys_info(f, info);
-  dump_swift_keys_info(f, info);
-
-  encode_json("caps", info.caps, f);
-
-  char buf[256];
-  op_type_to_str(info.op_mask, buf, sizeof(buf));
-  encode_json("op_mask", (const char *)buf, f);
-  encode_json("system", (bool)info.system, f);
-  encode_json("admin", (bool)info.admin, f);
-  encode_json("default_placement", info.default_placement.name, f);
-  encode_json("default_storage_class", info.default_placement.storage_class, f);
-  encode_json("placement_tags", info.placement_tags, f);
-  encode_json("bucket_quota", info.quota.bucket_quota, f);
-  encode_json("user_quota", info.quota.user_quota, f);
-  encode_json("temp_url_keys", info.temp_url_keys, f);
-
-  string user_source_type;
-  switch ((RGWIdentityType)info.type) {
-  case TYPE_RGW:
-    user_source_type = "rgw";
-    break;
-  case TYPE_KEYSTONE:
-    user_source_type = "keystone";
-    break;
-  case TYPE_LDAP:
-    user_source_type = "ldap";
-    break;
-  case TYPE_NONE:
-    user_source_type = "none";
-    break;
-  default:
-    user_source_type = "none";
-    break;
-  }
-  encode_json("type", user_source_type, f);
-  encode_json("mfa_ids", info.mfa_ids, f);
-  if (stats) {
-    encode_json("stats", *stats, f);
-  }
-  f->close_section();
-}
-
-static int user_add_helper(RGWUserAdminOpState& op_state, std::string *err_msg)
-{
-  int ret = 0;
-  const rgw_user& uid = op_state.get_user_id();
-  std::string user_email = op_state.get_user_email();
-  std::string display_name = op_state.get_display_name();
-
-  // fail if the user exists already
-  if (op_state.has_existing_user()) {
-    if (op_state.found_by_email) {
-      set_err_msg(err_msg, "email: " + user_email +
-          " is the email address of an existing user");
-      ret = -ERR_EMAIL_EXIST;
-    } else if (op_state.found_by_key) {
-      set_err_msg(err_msg, "duplicate key provided");
-      ret = -ERR_KEY_EXIST;
-    } else {
-      set_err_msg(err_msg, "user: " + uid.to_str() + " exists");
-      ret = -EEXIST;
-    }
-    return ret;
-  }
-
-  // fail if the user_info has already been populated
-  if (op_state.is_populated()) {
-    set_err_msg(err_msg, "cannot overwrite already populated user");
-    return -EEXIST;
-  }
-
-  // fail if the display name was not included
-  if (display_name.empty()) {
-    set_err_msg(err_msg, "no display name specified");
-    return -EINVAL;
-  }
-
-  return ret;
-}
-
-RGWAccessKeyPool::RGWAccessKeyPool(RGWUser* usr)
-{
-  if (!usr) {
-    return;
-  }
-
-  user = usr;
-
-  driver = user->get_driver();
-}
-
-int RGWAccessKeyPool::init(RGWUserAdminOpState& op_state)
-{
-  if (!op_state.is_initialized()) {
-    keys_allowed = false;
-    return -EINVAL;
-  }
-
-  const rgw_user& uid = op_state.get_user_id();
-  if (uid.compare(RGW_USER_ANON_ID) == 0) {
-    keys_allowed = false;
-    return -EINVAL;
-  }
-
-  swift_keys = op_state.get_swift_keys();
-  access_keys = op_state.get_access_keys();
-
-  keys_allowed = true;
-
-  return 0;
-}
-
-RGWUserAdminOpState::RGWUserAdminOpState(rgw::sal::Driver* driver)
-{
-  user = driver->get_user(rgw_user(RGW_USER_ANON_ID));
-}
-
-void RGWUserAdminOpState::set_user_id(const rgw_user& id)
-{
-  if (id.empty())
-    return;
-
-  user->get_info().user_id = id;
-}
-
-void RGWUserAdminOpState::set_subuser(std::string& _subuser)
-{
-  if (_subuser.empty())
-    return;
-
-  size_t pos = _subuser.find(":");
-  if (pos != string::npos) {
-    rgw_user tmp_id;
-    tmp_id.from_str(_subuser.substr(0, pos));
-    if (tmp_id.tenant.empty()) {
-      user->get_info().user_id.id = tmp_id.id;
-    } else {
-      user->get_info().user_id = tmp_id;
-    }
-    subuser = _subuser.substr(pos+1);
-  } else {
-    subuser = _subuser;
-  }
-
-  subuser_specified = true;
-}
-
-void RGWUserAdminOpState::set_user_info(RGWUserInfo& user_info)
-{
-  user->get_info() = user_info;
-}
-
-void RGWUserAdminOpState::set_user_version_tracker(RGWObjVersionTracker& objv_tracker)
-{
-  user->get_version_tracker() = objv_tracker;
-}
-
-const rgw_user& RGWUserAdminOpState::get_user_id()
-{
-  return user->get_id();
-}
-
-RGWUserInfo& RGWUserAdminOpState::get_user_info()
-{
-  return user->get_info();
-}
-
-map<std::string, RGWAccessKey>* RGWUserAdminOpState::get_swift_keys()
-{
-  return &user->get_info().swift_keys;
-}
-
-map<std::string, RGWAccessKey>* RGWUserAdminOpState::get_access_keys()
-{
-  return &user->get_info().access_keys;
-}
-
-map<std::string, RGWSubUser>* RGWUserAdminOpState::get_subusers()
-{
-  return &user->get_info().subusers;
-}
-
-RGWUserCaps *RGWUserAdminOpState::get_caps_obj()
-{
-  return &user->get_info().caps;
-}
-
-std::string RGWUserAdminOpState::build_default_swift_kid()
-{
-  if (user->get_id().empty() || subuser.empty())
-    return "";
-
-  std::string kid;
-  user->get_id().to_str(kid);
-  kid.append(":");
-  kid.append(subuser);
-
-  return kid;
-}
-
-std::string RGWUserAdminOpState::generate_subuser() {
-  if (user->get_id().empty())
-    return "";
-
-  std::string generated_subuser;
-  user->get_id().to_str(generated_subuser);
-  std::string rand_suffix;
-
-  int sub_buf_size = RAND_SUBUSER_LEN + 1;
-  char sub_buf[RAND_SUBUSER_LEN + 1];
-
-  gen_rand_alphanumeric_upper(g_ceph_context, sub_buf, sub_buf_size);
-
-  rand_suffix = sub_buf;
-  if (rand_suffix.empty())
-    return "";
-
-  generated_subuser.append(rand_suffix);
-  subuser = generated_subuser;
-
-  return generated_subuser;
-}
-
-/*
- * Do a fairly exhaustive search for an existing key matching the parameters
- * given. Also handles the case where no key type was specified and updates
- * the operation state if needed.
- */
-
-bool RGWAccessKeyPool::check_existing_key(RGWUserAdminOpState& op_state)
-{
-  bool existing_key = false;
-
-  int key_type = op_state.get_key_type();
-  std::string kid = op_state.get_access_key();
-  std::map<std::string, RGWAccessKey>::iterator kiter;
-  std::string swift_kid = op_state.build_default_swift_kid();
-
-  RGWUserInfo dup_info;
-
-  if (kid.empty() && swift_kid.empty())
-    return false;
-
-  switch (key_type) {
-  case KEY_TYPE_SWIFT:
-    kiter = swift_keys->find(swift_kid);
-
-    existing_key = (kiter != swift_keys->end());
-    if (existing_key)
-      op_state.set_access_key(swift_kid);
-
-    break;
-  case KEY_TYPE_S3:
-    kiter = access_keys->find(kid);
-    existing_key = (kiter != access_keys->end());
-
-    break;
-  default:
-    kiter = access_keys->find(kid);
-
-    existing_key = (kiter != access_keys->end());
-    if (existing_key) {
-      op_state.set_key_type(KEY_TYPE_S3);
-      break;
-    }
-
-    kiter = swift_keys->find(kid);
-
-    existing_key = (kiter != swift_keys->end());
-    if (existing_key) {
-      op_state.set_key_type(KEY_TYPE_SWIFT);
-      break;
-    }
-
-    // handle the case where the access key was not provided in user:key format
-    if (swift_kid.empty())
-      return false;
-
-    kiter = swift_keys->find(swift_kid);
-
-    existing_key = (kiter != swift_keys->end());
-    if (existing_key) {
-      op_state.set_access_key(swift_kid);
-      op_state.set_key_type(KEY_TYPE_SWIFT);
-    }
-  }
-
-  op_state.set_existing_key(existing_key);
-
-  return existing_key;
-}
-
-int RGWAccessKeyPool::check_op(RGWUserAdminOpState& op_state,
-     std::string *err_msg)
-{
-  RGWUserInfo dup_info;
-
-  if (!op_state.is_populated()) {
-    set_err_msg(err_msg, "user info was not populated");
-    return -EINVAL;
-  }
-
-  if (!keys_allowed) {
-    set_err_msg(err_msg, "keys not allowed for this user");
-    return -EACCES;
-  }
-
-  int32_t key_type = op_state.get_key_type();
-
-  // if a key type wasn't specified
-  if (key_type < 0) {
-      if (op_state.has_subuser()) {
-        key_type = KEY_TYPE_SWIFT;
-      } else {
-        key_type = KEY_TYPE_S3;
-      }
-  }
-
-  op_state.set_key_type(key_type);
-
-  /* see if the access key was specified */
-  if (key_type == KEY_TYPE_S3 && !op_state.will_gen_access() && 
-      op_state.get_access_key().empty()) {
-    set_err_msg(err_msg, "empty access key");
-    return -ERR_INVALID_ACCESS_KEY;
-  }
-
-  // don't check for secret key because we may be doing a removal
-
-  if (check_existing_key(op_state)) {
-    op_state.set_access_key_exist();
-  }
-  return 0;
-}
-
-// Generate a new random key
-int RGWAccessKeyPool::generate_key(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state,
-                                  optional_yield y, std::string *err_msg)
-{
-  std::string id;
-  std::string key;
-
-  std::pair<std::string, RGWAccessKey> key_pair;
-  RGWAccessKey new_key;
-  std::unique_ptr<rgw::sal::User> duplicate_check;
-
-  int key_type = op_state.get_key_type();
-  bool gen_access = op_state.will_gen_access();
-  bool gen_secret = op_state.will_gen_secret();
-
-  if (!keys_allowed) {
-    set_err_msg(err_msg, "access keys not allowed for this user");
-    return -EACCES;
-  }
-
-  if (op_state.has_existing_key()) {
-    set_err_msg(err_msg, "cannot create existing key");
-    return -ERR_KEY_EXIST;
-  }
-
-  if (!gen_access) {
-    id = op_state.get_access_key();
-  }
-
-  if (!id.empty()) {
-    switch (key_type) {
-    case KEY_TYPE_SWIFT:
-      if (driver->get_user_by_swift(dpp, id, y, &duplicate_check) >= 0) {
-        set_err_msg(err_msg, "existing swift key in RGW system:" + id);
-        return -ERR_KEY_EXIST;
-      }
-      break;
-    case KEY_TYPE_S3:
-      if (driver->get_user_by_access_key(dpp, id, y, &duplicate_check) >= 0) {
-        set_err_msg(err_msg, "existing S3 key in RGW system:" + id);
-        return -ERR_KEY_EXIST;
-      }
-    }
-  }
-
-  //key's subuser
-  if (op_state.has_subuser()) {
-    //create user and subuser at the same time, user's s3 key should not be set this
-    if (!op_state.key_type_setbycontext || (key_type == KEY_TYPE_SWIFT)) {
-      new_key.subuser = op_state.get_subuser();
-    }
-  }
-
-  //Secret key
-  if (!gen_secret) {
-    if (op_state.get_secret_key().empty()) {
-      set_err_msg(err_msg, "empty secret key");
-      return -ERR_INVALID_SECRET_KEY;
-    }
-
-    key = op_state.get_secret_key();
-  } else {
-    char secret_key_buf[SECRET_KEY_LEN + 1];
-    gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, sizeof(secret_key_buf));
-    key = secret_key_buf;
-  }
-
-  // Generate the access key
-  if (key_type == KEY_TYPE_S3 && gen_access) {
-    char public_id_buf[PUBLIC_ID_LEN + 1];
-
-    do {
-      int id_buf_size = sizeof(public_id_buf);
-      gen_rand_alphanumeric_upper(g_ceph_context, public_id_buf, id_buf_size);
-      id = public_id_buf;
-      if (!validate_access_key(id))
-        continue;
-
-    } while (!driver->get_user_by_access_key(dpp, id, y, &duplicate_check));
-  }
-
-  if (key_type == KEY_TYPE_SWIFT) {
-    id = op_state.build_default_swift_kid();
-    if (id.empty()) {
-      set_err_msg(err_msg, "empty swift access key");
-      return -ERR_INVALID_ACCESS_KEY;
-    }
-
-    // check that the access key doesn't exist
-    if (driver->get_user_by_swift(dpp, id, y, &duplicate_check) >= 0) {
-      set_err_msg(err_msg, "cannot create existing swift key");
-      return -ERR_KEY_EXIST;
-    }
-  }
-
-  // finally create the new key
-  new_key.id = id;
-  new_key.key = key;
-
-  key_pair.first = id;
-  key_pair.second = new_key;
-
-  if (key_type == KEY_TYPE_S3) {
-    access_keys->insert(key_pair);
-  } else if (key_type == KEY_TYPE_SWIFT) {
-    swift_keys->insert(key_pair);
-  }
-
-  return 0;
-}
-
-// modify an existing key
-int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err_msg)
-{
-  std::string id;
-  std::string key = op_state.get_secret_key();
-  int key_type = op_state.get_key_type();
-
-  RGWAccessKey modify_key;
-
-  pair<string, RGWAccessKey> key_pair;
-  map<std::string, RGWAccessKey>::iterator kiter;
-
-  switch (key_type) {
-  case KEY_TYPE_S3:
-    id = op_state.get_access_key();
-    if (id.empty()) {
-      set_err_msg(err_msg, "no access key specified");
-      return -ERR_INVALID_ACCESS_KEY;
-    }
-    break;
-  case KEY_TYPE_SWIFT:
-    id = op_state.build_default_swift_kid();
-    if (id.empty()) {
-      set_err_msg(err_msg, "no subuser specified");
-      return -EINVAL;
-    }
-    break;
-  default:
-    set_err_msg(err_msg, "invalid key type");
-    return -ERR_INVALID_KEY_TYPE;
-  }
-
-  if (!op_state.has_existing_key()) {
-    set_err_msg(err_msg, "key does not exist");
-    return -ERR_INVALID_ACCESS_KEY;
-  }
-
-  key_pair.first = id;
-
-  if (key_type == KEY_TYPE_SWIFT) {
-    modify_key.id = id;
-    modify_key.subuser = op_state.get_subuser();
-  } else if (key_type == KEY_TYPE_S3) {
-    kiter = access_keys->find(id);
-    if (kiter != access_keys->end()) {
-      modify_key = kiter->second;
-    }
-  }
-
-  if (op_state.will_gen_secret()) {
-    char secret_key_buf[SECRET_KEY_LEN + 1];
-    int key_buf_size = sizeof(secret_key_buf);
-    gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, key_buf_size);
-    key = secret_key_buf;
-  }
-
-  if (key.empty()) {
-      set_err_msg(err_msg, "empty secret key");
-      return -ERR_INVALID_SECRET_KEY;
-  }
-
-  // update the access key with the new secret key
-  modify_key.key = key;
-
-  key_pair.second = modify_key;
-
-
-  if (key_type == KEY_TYPE_S3) {
-    (*access_keys)[id] = modify_key;
-  } else if (key_type == KEY_TYPE_SWIFT) {
-    (*swift_keys)[id] = modify_key;
-  }
-
-  return 0;
-}
-
-int RGWAccessKeyPool::execute_add(const DoutPrefixProvider *dpp, 
-                                  RGWUserAdminOpState& op_state,
-                                 std::string *err_msg, bool defer_user_update,
-                                 optional_yield y)
-{
-  int ret = 0;
-
-  std::string subprocess_msg;
-  int key_op = GENERATE_KEY;
-
-  // set the op
-  if (op_state.has_existing_key())
-    key_op = MODIFY_KEY;
-
-  switch (key_op) {
-  case GENERATE_KEY:
-    ret = generate_key(dpp, op_state, y, &subprocess_msg);
-    break;
-  case MODIFY_KEY:
-    ret = modify_key(op_state, &subprocess_msg);
-    break;
-  }
-
-  if (ret < 0) {
-    set_err_msg(err_msg, subprocess_msg);
-    return ret;
-  }
-
-  // store the updated info
-  if (!defer_user_update)
-    ret = user->update(dpp, op_state, err_msg, y);
-
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWAccessKeyPool::add(const DoutPrefixProvider *dpp, 
-                          RGWUserAdminOpState& op_state, optional_yield y,
-                         std::string *err_msg)
-{
-  return add(dpp, op_state, err_msg, false, y);
-}
-
-int RGWAccessKeyPool::add(const DoutPrefixProvider *dpp, 
-                          RGWUserAdminOpState& op_state, std::string *err_msg,
-                         bool defer_user_update, optional_yield y)
-{
-  int ret;
-  std::string subprocess_msg;
-
-  ret = check_op(op_state, &subprocess_msg);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
-    return ret;
-  }
-
-  ret = execute_add(dpp, op_state, &subprocess_msg, defer_user_update, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to add access key, " + subprocess_msg);
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWAccessKeyPool::execute_remove(const DoutPrefixProvider *dpp, 
-                                     RGWUserAdminOpState& op_state,
-                                    std::string *err_msg,
-                                    bool defer_user_update,
-                                    optional_yield y)
-{
-  int ret = 0;
-
-  int key_type = op_state.get_key_type();
-  std::string id = op_state.get_access_key();
-  map<std::string, RGWAccessKey>::iterator kiter;
-  map<std::string, RGWAccessKey> *keys_map;
-
-  if (!op_state.has_existing_key()) {
-    set_err_msg(err_msg, "unable to find access key,  with key type: " +
-                             key_type_to_str(key_type));
-    return -ERR_INVALID_ACCESS_KEY;
-  }
-
-  if (key_type == KEY_TYPE_S3) {
-    keys_map = access_keys;
-  } else if (key_type == KEY_TYPE_SWIFT) {
-    keys_map = swift_keys;
-  } else {
-    keys_map = NULL;
-    set_err_msg(err_msg, "invalid access key");
-    return -ERR_INVALID_ACCESS_KEY;
-  }
-
-  kiter = keys_map->find(id);
-  if (kiter == keys_map->end()) {
-    set_err_msg(err_msg, "key not found");
-    return -ERR_INVALID_ACCESS_KEY;
-  }
-
-  keys_map->erase(kiter);
-
-  if (!defer_user_update)
-    ret = user->update(dpp, op_state, err_msg, y);
-
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWAccessKeyPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
-                            std::string *err_msg)
-{
-  return remove(dpp, op_state, err_msg, false, y);
-}
-
-int RGWAccessKeyPool::remove(const DoutPrefixProvider *dpp, 
-                             RGWUserAdminOpState& op_state,
-                            std::string *err_msg, bool defer_user_update,
-                            optional_yield y)
-{
-  int ret;
-
-  std::string subprocess_msg;
-
-  ret = check_op(op_state, &subprocess_msg);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
-    return ret;
-  }
-
-  ret = execute_remove(dpp, op_state, &subprocess_msg, defer_user_update, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to remove access key, " + subprocess_msg);
-    return ret;
-  }
-
-  return 0;
-}
-
-// remove all keys associated with a subuser
-int RGWAccessKeyPool::remove_subuser_keys(const DoutPrefixProvider *dpp, 
-                                          RGWUserAdminOpState& op_state,
-                                         std::string *err_msg,
-                                         bool defer_user_update,
-                                         optional_yield y)
-{
-  int ret = 0;
-
-  if (!op_state.is_populated()) {
-    set_err_msg(err_msg, "user info was not populated");
-    return -EINVAL;
-  }
-
-  if (!op_state.has_subuser()) {
-    set_err_msg(err_msg, "no subuser specified");
-    return -EINVAL;
-  }
-
-  std::string swift_kid = op_state.build_default_swift_kid();
-  if (swift_kid.empty()) {
-    set_err_msg(err_msg, "empty swift access key");
-    return -EINVAL;
-  }
-
-  map<std::string, RGWAccessKey>::iterator kiter;
-  map<std::string, RGWAccessKey> *keys_map;
-
-  // a subuser can have at most one swift key
-  keys_map = swift_keys;
-  kiter = keys_map->find(swift_kid);
-  if (kiter != keys_map->end()) {
-    keys_map->erase(kiter);
-  }
-
-  // a subuser may have multiple s3 key pairs
-  std::string subuser_str = op_state.get_subuser();
-  keys_map = access_keys;
-  RGWUserInfo user_info = op_state.get_user_info();
-  auto user_kiter = user_info.access_keys.begin();
-  for (; user_kiter != user_info.access_keys.end(); ++user_kiter) {
-    if (user_kiter->second.subuser == subuser_str) {
-      kiter = keys_map->find(user_kiter->first);
-      if (kiter != keys_map->end()) {
-        keys_map->erase(kiter);
-      }
-    }
-  }
-
-  if (!defer_user_update)
-    ret = user->update(dpp, op_state, err_msg, y);
-
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-RGWSubUserPool::RGWSubUserPool(RGWUser *usr)
-{
-  if (!usr) {
-    return;
-  }
-
-  user = usr;
-
-  subusers_allowed = true;
-  driver = user->get_driver();
-}
-
-int RGWSubUserPool::init(RGWUserAdminOpState& op_state)
-{
-  if (!op_state.is_initialized()) {
-    subusers_allowed = false;
-    return -EINVAL;
-  }
-
-  const rgw_user& uid = op_state.get_user_id();
-  if (uid.compare(RGW_USER_ANON_ID) == 0) {
-    subusers_allowed = false;
-    return -EACCES;
-  }
-
-  subuser_map = op_state.get_subusers();
-  if (subuser_map == NULL) {
-    subusers_allowed = false;
-    return -EINVAL;
-  }
-
-  subusers_allowed = true;
-
-  return 0;
-}
-
-bool RGWSubUserPool::exists(std::string subuser)
-{
-  if (subuser.empty())
-    return false;
-
-  if (!subuser_map)
-    return false;
-
-  if (subuser_map->count(subuser))
-    return true;
-
-  return false;
-}
-
-int RGWSubUserPool::check_op(RGWUserAdminOpState& op_state,
-        std::string *err_msg)
-{
-  bool existing = false;
-  std::string subuser = op_state.get_subuser();
-
-  if (!op_state.is_populated()) {
-    set_err_msg(err_msg, "user info was not populated");
-    return -EINVAL;
-  }
-
-  if (!subusers_allowed) {
-    set_err_msg(err_msg, "subusers not allowed for this user");
-    return -EACCES;
-  }
-
-  if (subuser.empty() && !op_state.will_gen_subuser()) {
-    set_err_msg(err_msg, "empty subuser name");
-    return -EINVAL;
-  }
-
-  if (op_state.get_subuser_perm() == RGW_PERM_INVALID) {
-    set_err_msg(err_msg, "invalid subuser access");
-    return -EINVAL;
-  }
-
-  //set key type when it not set or set by context
-  if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) {
-    op_state.set_key_type(KEY_TYPE_SWIFT);
-    op_state.key_type_setbycontext = true;
-  }
-
-  // check if the subuser exists
-  if (!subuser.empty())
-    existing = exists(subuser);
-
-  op_state.set_existing_subuser(existing);
-
-  return 0;
-}
-
-int RGWSubUserPool::execute_add(const DoutPrefixProvider *dpp, 
-                                RGWUserAdminOpState& op_state,
-                               std::string *err_msg, bool defer_user_update,
-                               optional_yield y)
-{
-  int ret = 0;
-  std::string subprocess_msg;
-
-  RGWSubUser subuser;
-  std::pair<std::string, RGWSubUser> subuser_pair;
-  std::string subuser_str = op_state.get_subuser();
-
-  subuser_pair.first = subuser_str;
-
-  // assumes key should be created
-  if (op_state.has_key_op()) {
-    ret = user->keys.add(dpp, op_state, &subprocess_msg, true, y);
-    if (ret < 0) {
-      set_err_msg(err_msg, "unable to create subuser key, " + subprocess_msg);
-      return ret;
-    }
-  }
-
-  // create the subuser
-  subuser.name = subuser_str;
-
-  if (op_state.has_subuser_perm())
-    subuser.perm_mask = op_state.get_subuser_perm();
-
-  // insert the subuser into user info
-  subuser_pair.second = subuser;
-  subuser_map->insert(subuser_pair);
-
-  // attempt to save the subuser
-  if (!defer_user_update)
-    ret = user->update(dpp, op_state, err_msg, y);
-
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWSubUserPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
-                       std::string *err_msg)
-{
-  return add(dpp, op_state, err_msg, false, y);
-}
-
-int RGWSubUserPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update, optional_yield y)
-{
-  std::string subprocess_msg;
-  int ret;
-  int32_t key_type = op_state.get_key_type();
-
-  ret = check_op(op_state, &subprocess_msg);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
-    return ret;
-  }
-
-  if (op_state.get_access_key_exist()) {
-    set_err_msg(err_msg, "cannot create existing key");
-    return -ERR_KEY_EXIST;
-  }
-
-  if (key_type == KEY_TYPE_S3 && op_state.get_access_key().empty()) {
-    op_state.set_gen_access();
-  }
-
-  if (op_state.get_secret_key().empty()) {
-    op_state.set_gen_secret();
-  }
-
-  ret = execute_add(dpp, op_state, &subprocess_msg, defer_user_update, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to create subuser, " + subprocess_msg);
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWSubUserPool::execute_remove(const DoutPrefixProvider *dpp, 
-                                   RGWUserAdminOpState& op_state,
-                                  std::string *err_msg, bool defer_user_update,
-                                  optional_yield y)
-{
-  int ret = 0;
-  std::string subprocess_msg;
-
-  std::string subuser_str = op_state.get_subuser();
-
-  map<std::string, RGWSubUser>::iterator siter;
-  siter = subuser_map->find(subuser_str);
-  if (siter == subuser_map->end()){
-    set_err_msg(err_msg, "subuser not found: " + subuser_str);
-    return -ERR_NO_SUCH_SUBUSER;
-  }
-  if (!op_state.has_existing_subuser()) {
-    set_err_msg(err_msg, "subuser not found: " + subuser_str);
-    return -ERR_NO_SUCH_SUBUSER;
-  }
-
-  // always purge all associate keys
-  user->keys.remove_subuser_keys(dpp, op_state, &subprocess_msg, true, y);
-
-  // remove the subuser from the user info
-  subuser_map->erase(siter);
-
-  // attempt to save the subuser
-  if (!defer_user_update)
-    ret = user->update(dpp, op_state, err_msg, y);
-
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWSubUserPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
-                          std::string *err_msg)
-{
-  return remove(dpp, op_state, err_msg, false, y);
-}
-
-int RGWSubUserPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
-                          bool defer_user_update, optional_yield y)
-{
-  std::string subprocess_msg;
-  int ret;
-
-  ret = check_op(op_state, &subprocess_msg);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
-    return ret;
-  }
-
-  ret = execute_remove(dpp, op_state, &subprocess_msg, defer_user_update, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to remove subuser, " + subprocess_msg);
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWSubUserPool::execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update, optional_yield y)
-{
-  int ret = 0;
-  std::string subprocess_msg;
-  std::map<std::string, RGWSubUser>::iterator siter;
-  std::pair<std::string, RGWSubUser> subuser_pair;
-
-  std::string subuser_str = op_state.get_subuser();
-  RGWSubUser subuser;
-
-  if (!op_state.has_existing_subuser()) {
-    set_err_msg(err_msg, "subuser does not exist");
-    return -ERR_NO_SUCH_SUBUSER;
-  }
-
-  subuser_pair.first = subuser_str;
-
-  siter = subuser_map->find(subuser_str);
-  subuser = siter->second;
-
-  if (op_state.has_key_op()) {
-    ret = user->keys.add(dpp, op_state, &subprocess_msg, true, y);
-    if (ret < 0) {
-      set_err_msg(err_msg, "unable to create subuser keys, " + subprocess_msg);
-      return ret;
-    }
-  }
-
-  if (op_state.has_subuser_perm())
-    subuser.perm_mask = op_state.get_subuser_perm();
-
-  subuser_pair.second = subuser;
-
-  subuser_map->erase(siter);
-  subuser_map->insert(subuser_pair);
-
-  // attempt to save the subuser
-  if (!defer_user_update)
-    ret = user->update(dpp, op_state, err_msg, y);
-
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWSubUserPool::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
-{
-  return RGWSubUserPool::modify(dpp, op_state, y, err_msg, false);
-}
-
-int RGWSubUserPool::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg, bool defer_user_update)
-{
-  std::string subprocess_msg;
-  int ret;
-
-  RGWSubUser subuser;
-
-  ret = check_op(op_state, &subprocess_msg);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
-    return ret;
-  }
-
-  ret = execute_modify(dpp, op_state, &subprocess_msg, defer_user_update, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to modify subuser, " + subprocess_msg);
-    return ret;
-  }
-
-  return 0;
-}
-
-RGWUserCapPool::RGWUserCapPool(RGWUser *usr)
-{
-  if (!usr) {
-    return;
-  }
-  user = usr;
-  caps_allowed = true;
-}
-
-int RGWUserCapPool::init(RGWUserAdminOpState& op_state)
-{
-  if (!op_state.is_initialized()) {
-    caps_allowed = false;
-    return -EINVAL;
-  }
-
-  const rgw_user& uid = op_state.get_user_id();
-  if (uid.compare(RGW_USER_ANON_ID) == 0) {
-    caps_allowed = false;
-    return -EACCES;
-  }
-
-  caps = op_state.get_caps_obj();
-  if (!caps) {
-    caps_allowed = false;
-    return -ERR_INVALID_CAP;
-  }
-
-  caps_allowed = true;
-
-  return 0;
-}
-
-int RGWUserCapPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
-                       std::string *err_msg)
-{
-  return add(dpp, op_state, err_msg, false, y);
-}
-
-int RGWUserCapPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
-                       bool defer_save, optional_yield y)
-{
-  int ret = 0;
-  std::string caps_str = op_state.get_caps();
-
-  if (!op_state.is_populated()) {
-    set_err_msg(err_msg, "user info was not populated");
-    return -EINVAL;
-  }
-
-  if (!caps_allowed) {
-    set_err_msg(err_msg, "caps not allowed for this user");
-    return -EACCES;
-  }
-
-  if (caps_str.empty()) {
-    set_err_msg(err_msg, "empty user caps");
-    return -ERR_INVALID_CAP;
-  }
-
-  int r = caps->add_from_string(caps_str);
-  if (r < 0) {
-    set_err_msg(err_msg, "unable to add caps: " + caps_str);
-    return r;
-  }
-
-  if (!defer_save)
-    ret = user->update(dpp, op_state, err_msg, y);
-
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWUserCapPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
-                          std::string *err_msg)
-{
-  return remove(dpp, op_state, err_msg, false, y);
-}
-
-int RGWUserCapPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
-                          bool defer_save, optional_yield y)
-{
-  int ret = 0;
-
-  std::string caps_str = op_state.get_caps();
-
-  if (!op_state.is_populated()) {
-    set_err_msg(err_msg, "user info was not populated");
-    return -EINVAL;
-  }
-
-  if (!caps_allowed) {
-    set_err_msg(err_msg, "caps not allowed for this user");
-    return -EACCES;
-  }
-
-  if (caps_str.empty()) {
-    set_err_msg(err_msg, "empty user caps");
-    return -ERR_INVALID_CAP;
-  }
-
-  int r = caps->remove_from_string(caps_str);
-  if (r < 0) {
-    set_err_msg(err_msg, "unable to remove caps: " + caps_str);
-    return r;
-  }
-
-  if (!defer_save)
-    ret = user->update(dpp, op_state, err_msg, y);
-
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-RGWUser::RGWUser() : caps(this), keys(this), subusers(this)
-{
-  init_default();
-}
-
-int RGWUser::init(const DoutPrefixProvider *dpp, rgw::sal::Driver* _driver,
-                 RGWUserAdminOpState& op_state, optional_yield y)
-{
-  init_default();
-  int ret = init_storage(_driver);
-  if (ret < 0)
-    return ret;
-
-  ret = init(dpp, op_state, y);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-void RGWUser::init_default()
-{
-  // use anonymous user info as a placeholder
-  rgw_get_anon_user(old_info);
-  user_id = RGW_USER_ANON_ID;
-
-  clear_populated();
-}
-
-int RGWUser::init_storage(rgw::sal::Driver* _driver)
-{
-  if (!_driver) {
-    return -EINVAL;
-  }
-
-  driver = _driver;
-
-  clear_populated();
-
-  /* API wrappers */
-  keys = RGWAccessKeyPool(this);
-  caps = RGWUserCapPool(this);
-  subusers = RGWSubUserPool(this);
-
-  return 0;
-}
-
-int RGWUser::init(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y)
-{
-  bool found = false;
-  std::string swift_user;
-  user_id = op_state.get_user_id();
-  std::string user_email = op_state.get_user_email();
-  std::string access_key = op_state.get_access_key();
-  std::string subuser = op_state.get_subuser();
-
-  int key_type = op_state.get_key_type();
-  if (key_type == KEY_TYPE_SWIFT) {
-    swift_user = op_state.get_access_key();
-    access_key.clear();
-  }
-
-  std::unique_ptr<rgw::sal::User> user;
-
-  clear_populated();
-
-  if (user_id.empty() && !subuser.empty()) {
-    size_t pos = subuser.find(':');
-    if (pos != string::npos) {
-      user_id = subuser.substr(0, pos);
-      op_state.set_user_id(user_id);
-    }
-  }
-
-  if (!user_id.empty() && (user_id.compare(RGW_USER_ANON_ID) != 0)) {
-    user = driver->get_user(user_id);
-    found = (user->load_user(dpp, y) >= 0);
-    op_state.found_by_uid = found;
-  }
-  if (driver->ctx()->_conf.get_val<bool>("rgw_user_unique_email")) {
-    if (!user_email.empty() && !found) {
-      found = (driver->get_user_by_email(dpp, user_email, y, &user) >= 0);
-      op_state.found_by_email = found;
-    }
-  }
-  if (!swift_user.empty() && !found) {
-    found = (driver->get_user_by_swift(dpp, swift_user, y, &user) >= 0);
-    op_state.found_by_key = found;
-  }
-  if (!access_key.empty() && !found) {
-    found = (driver->get_user_by_access_key(dpp, access_key, y, &user) >= 0);
-    op_state.found_by_key = found;
-  }
-  
-  op_state.set_existing_user(found);
-  if (found) {
-    op_state.set_user_info(user->get_info());
-    op_state.set_populated();
-    op_state.objv = user->get_version_tracker();
-    op_state.set_user_version_tracker(user->get_version_tracker());
-
-    old_info = user->get_info();
-    set_populated();
-  }
-
-  if (user_id.empty()) {
-    user_id = user->get_id();
-  }
-  op_state.set_initialized();
-
-  // this may have been called by a helper object
-  int ret = init_members(op_state);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWUser::init_members(RGWUserAdminOpState& op_state)
-{
-  int ret = 0;
-
-  ret = keys.init(op_state);
-  if (ret < 0)
-    return ret;
-
-  ret = subusers.init(op_state);
-  if (ret < 0)
-    return ret;
-
-  ret = caps.init(op_state);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWUser::update(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
-                   optional_yield y)
-{
-  int ret;
-  std::string subprocess_msg;
-  rgw::sal::User* user = op_state.get_user();
-
-  if (!driver) {
-    set_err_msg(err_msg, "couldn't initialize storage");
-    return -EINVAL;
-  }
-
-  RGWUserInfo *pold_info = (is_populated() ? &old_info : nullptr);
-
-  ret = user->store_user(dpp, y, false, pold_info);
-  op_state.objv = user->get_version_tracker();
-  op_state.set_user_version_tracker(user->get_version_tracker());
-
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to store user info");
-    return ret;
-  }
-
-  old_info = user->get_info();
-  set_populated();
-
-  return 0;
-}
-
-int RGWUser::check_op(RGWUserAdminOpState& op_state, std::string *err_msg)
-{
-  int ret = 0;
-  const rgw_user& uid = op_state.get_user_id();
-
-  if (uid.compare(RGW_USER_ANON_ID) == 0) {
-    set_err_msg(err_msg, "unable to perform operations on the anonymous user");
-    return -EINVAL;
-  }
-
-  if (is_populated() && user_id.compare(uid) != 0) {
-    set_err_msg(err_msg, "user id mismatch, operation id: " + uid.to_str()
-            + " does not match: " + user_id.to_str());
-
-    return -EINVAL;
-  }
-
-  ret = rgw_validate_tenant_name(uid.tenant);
-  if (ret) {
-    set_err_msg(err_msg,
-               "invalid tenant only alphanumeric and _ characters are allowed");
-    return ret;
-  }
-
-  //set key type when it not set or set by context
-  if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) {
-    op_state.set_key_type(KEY_TYPE_S3);
-    op_state.key_type_setbycontext = true;
-  }
-
-  return 0;
-}
-
-// update swift_keys with new user id
-static void rename_swift_keys(const rgw_user& user,
-                              std::map<std::string, RGWAccessKey>& keys)
-{
-  std::string user_id;
-  user.to_str(user_id);
-
-  auto modify_keys = std::move(keys);
-  for ([[maybe_unused]] auto& [k, key] : modify_keys) {
-    std::string id = user_id + ":" + key.subuser;
-    key.id = id;
-    keys[id] = std::move(key);
-  }
-}
-
-int RGWUser::execute_rename(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
-{
-  int ret;
-  bool populated = op_state.is_populated();
-
-  if (!op_state.has_existing_user() && !populated) {
-    set_err_msg(err_msg, "user not found");
-    return -ENOENT;
-  }
-
-  if (!populated) {
-    ret = init(dpp, op_state, y);
-    if (ret < 0) {
-      set_err_msg(err_msg, "unable to retrieve user info");
-      return ret;
-    }
-  }
-
-  std::unique_ptr<rgw::sal::User> old_user = driver->get_user(op_state.get_user_info().user_id);
-  std::unique_ptr<rgw::sal::User> new_user = driver->get_user(op_state.get_new_uid());
-  if (old_user->get_tenant() != new_user->get_tenant()) {
-    set_err_msg(err_msg, "users have to be under the same tenant namespace "
-                + old_user->get_tenant() + " != " + new_user->get_tenant());
-    return -EINVAL;
-  }
-
-  // create a stub user and write only the uid index and buckets object
-  std::unique_ptr<rgw::sal::User> user;
-  user = driver->get_user(new_user->get_id());
-
-  const bool exclusive = !op_state.get_overwrite_new_user(); // overwrite if requested
-
-  ret = user->store_user(dpp, y, exclusive);
-  if (ret == -EEXIST) {
-    set_err_msg(err_msg, "user name given by --new-uid already exists");
-    return ret;
-  }
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to store new user info");
-    return ret;
-  }
-
-  RGWAccessControlPolicy policy_instance;
-  policy_instance.create_default(new_user->get_id(), old_user->get_display_name());
-
-  //unlink and link buckets to new user
-  string marker;
-  CephContext *cct = driver->ctx();
-  size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
-  rgw::sal::BucketList buckets;
-
-  do {
-    ret = old_user->list_buckets(dpp, marker, "", max_buckets, false, buckets, y);
-    if (ret < 0) {
-      set_err_msg(err_msg, "unable to list user buckets");
-      return ret;
-    }
-
-    auto& m = buckets.get_buckets();
-
-    for (auto it = m.begin(); it != m.end(); ++it) {
-      auto& bucket = it->second;
-      marker = it->first;
-
-      ret = bucket->load_bucket(dpp, y);
-      if (ret < 0) {
-        set_err_msg(err_msg, "failed to fetch bucket info for bucket=" + bucket->get_name());
-        return ret;
-      }
-
-      ret = bucket->set_acl(dpp, policy_instance, y);
-      if (ret < 0) {
-        set_err_msg(err_msg, "failed to set acl on bucket " + bucket->get_name());
-        return ret;
-      }
-
-      ret = bucket->chown(dpp, new_user.get(), old_user.get(), y);
-      if (ret < 0) {
-        set_err_msg(err_msg, "failed to run bucket chown" + cpp_strerror(-ret));
-        return ret;
-      }
-    }
-
-  } while (buckets.is_truncated());
-
-  // update the 'stub user' with all of the other fields and rewrite all of the
-  // associated index objects
-  RGWUserInfo& user_info = op_state.get_user_info();
-  user_info.user_id = new_user->get_id();
-  op_state.objv = user->get_version_tracker();
-  op_state.set_user_version_tracker(user->get_version_tracker());
-
-  rename_swift_keys(new_user->get_id(), user_info.swift_keys);
-
-  return update(dpp, op_state, err_msg, y);
-}
-
-int RGWUser::execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
-                        optional_yield y)
-{
-  const rgw_user& uid = op_state.get_user_id();
-  std::string user_email = op_state.get_user_email();
-  std::string display_name = op_state.get_display_name();
-
-  // set the user info
-  RGWUserInfo user_info;
-  user_id = uid;
-  user_info.user_id = user_id;
-  user_info.display_name = display_name;
-  user_info.type = TYPE_RGW;
-
-  if (!user_email.empty())
-    user_info.user_email = user_email;
-
-  CephContext *cct = driver->ctx();
-  if (op_state.max_buckets_specified) {
-    user_info.max_buckets = op_state.get_max_buckets();
-  } else {
-    user_info.max_buckets =
-      cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
-  }
-
-  user_info.suspended = op_state.get_suspension_status();
-  user_info.admin = op_state.admin;
-  user_info.system = op_state.system;
-
-  if (op_state.op_mask_specified)
-    user_info.op_mask = op_state.get_op_mask();
-
-  if (op_state.has_bucket_quota()) {
-    user_info.quota.bucket_quota = op_state.get_bucket_quota();
-  } else {
-    rgw_apply_default_bucket_quota(user_info.quota.bucket_quota, cct->_conf);
-  }
-
-  if (op_state.temp_url_key_specified) {
-    map<int, string>::iterator iter;
-    for (iter = op_state.temp_url_keys.begin();
-         iter != op_state.temp_url_keys.end(); ++iter) {
-      user_info.temp_url_keys[iter->first] = iter->second;
-    }
-  }
-
-  if (op_state.has_user_quota()) {
-    user_info.quota.user_quota = op_state.get_user_quota();
-  } else {
-    rgw_apply_default_user_quota(user_info.quota.user_quota, cct->_conf);
-  }
-
-  if (op_state.default_placement_specified) {
-    user_info.default_placement = op_state.default_placement;
-  }
-
-  if (op_state.placement_tags_specified) {
-    user_info.placement_tags = op_state.placement_tags;
-  }
-
-  // update the request
-  op_state.set_user_info(user_info);
-  op_state.set_populated();
-
-  // update the helper objects
-  int ret = init_members(op_state);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to initialize user");
-    return ret;
-  }
-
-  // see if we need to add an access key
-  std::string subprocess_msg;
-  bool defer_user_update = true;
-  if (op_state.has_key_op()) {
-    ret = keys.add(dpp, op_state, &subprocess_msg, defer_user_update, y);
-    if (ret < 0) {
-      set_err_msg(err_msg, "unable to create access key, " + subprocess_msg);
-      return ret;
-    }
-  }
-
-  // see if we need to add some caps
-  if (op_state.has_caps_op()) {
-    ret = caps.add(dpp, op_state, &subprocess_msg, defer_user_update, y);
-    if (ret < 0) {
-      set_err_msg(err_msg, "unable to add user capabilities, " + subprocess_msg);
-      return ret;
-    }
-  }
-
-  ret = update(dpp, op_state, err_msg, y);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWUser::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
-{
-  std::string subprocess_msg;
-  int ret = user_add_helper(op_state, &subprocess_msg);
-  if (ret != 0) {
-    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
-    return ret;
-  }
-
-  ret = check_op(op_state, &subprocess_msg);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
-    return ret;
-  }
-
-  ret = execute_add(dpp, op_state, &subprocess_msg, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to create user, " + subprocess_msg);
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWUser::rename(RGWUserAdminOpState& op_state, optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
-{
-  std::string subprocess_msg;
-  int ret;
-
-  ret = check_op(op_state, &subprocess_msg);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
-    return ret;
-  }
-
-  ret = execute_rename(dpp, op_state, &subprocess_msg, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to rename user, " + subprocess_msg);
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWUser::execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
-{
-  int ret;
-
-  bool purge_data = op_state.will_purge_data();
-  rgw::sal::User* user = op_state.get_user();
-
-  if (!op_state.has_existing_user()) {
-    set_err_msg(err_msg, "user does not exist");
-    return -ENOENT;
-  }
-
-  rgw::sal::BucketList buckets;
-  string marker;
-  CephContext *cct = driver->ctx();
-  size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
-  do {
-    ret = user->list_buckets(dpp, marker, string(), max_buckets, false, buckets, y);
-    if (ret < 0) {
-      set_err_msg(err_msg, "unable to read user bucket info");
-      return ret;
-    }
-
-    auto& m = buckets.get_buckets();
-    if (!m.empty() && !purge_data) {
-      set_err_msg(err_msg, "must specify purge data to remove user with buckets");
-      return -EEXIST; // change to code that maps to 409: conflict
-    }
-
-    for (auto it = m.begin(); it != m.end(); ++it) {
-      ret = it->second->remove_bucket(dpp, true, false, nullptr, y);
-      if (ret < 0) {
-        set_err_msg(err_msg, "unable to delete user data");
-        return ret;
-      }
-
-      marker = it->first;
-    }
-
-  } while (buckets.is_truncated());
-
-  ret = user->remove_user(dpp, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to remove user from RADOS");
-    return ret;
-  }
-
-  op_state.clear_populated();
-  clear_populated();
-
-  return 0;
-}
-
-int RGWUser::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
-{
-  std::string subprocess_msg;
-  int ret;
-
-  ret = check_op(op_state, &subprocess_msg);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
-    return ret;
-  }
-
-  ret = execute_remove(dpp, op_state, &subprocess_msg, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to remove user, " + subprocess_msg);
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWUser::execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
-{
-  bool populated = op_state.is_populated();
-  int ret = 0;
-  std::string subprocess_msg;
-  std::string op_email = op_state.get_user_email();
-  std::string display_name = op_state.get_display_name();
-
-  RGWUserInfo user_info;
-  std::unique_ptr<rgw::sal::User> duplicate_check;
-
-  // ensure that the user info has been populated or is populate-able
-  if (!op_state.has_existing_user() && !populated) {
-    set_err_msg(err_msg, "user not found");
-    return -ENOENT;
-  }
-
-  // if the user hasn't already been populated...attempt to
-  if (!populated) {
-    ret = init(dpp, op_state, y);
-    if (ret < 0) {
-      set_err_msg(err_msg, "unable to retrieve user info");
-      return ret;
-    }
-  }
-
-  // ensure that we can modify the user's attributes
-  if (user_id.compare(RGW_USER_ANON_ID) == 0) {
-    set_err_msg(err_msg, "unable to modify anonymous user's info");
-    return -EACCES;
-  }
-
-  user_info = old_info;
-
-  std::string old_email = old_info.user_email;
-  if (!op_email.empty()) {
-    // make sure we are not adding a duplicate email
-    if (old_email != op_email) {
-      ret = driver->get_user_by_email(dpp, op_email, y, &duplicate_check);
-      if (ret >= 0 && duplicate_check->get_id().compare(user_id) != 0) {
-        set_err_msg(err_msg, "cannot add duplicate email");
-        return -ERR_EMAIL_EXIST;
-      }
-    }
-    user_info.user_email = op_email;
-  } else if (op_email.empty() && op_state.user_email_specified) {
-    ldpp_dout(dpp, 10) << "removing email index: " << user_info.user_email << dendl;
-    /* will be physically removed later when calling update() */
-    user_info.user_email.clear();
-  }
-
-  // update the remaining user info
-  if (!display_name.empty())
-    user_info.display_name = display_name;
-
-  if (op_state.max_buckets_specified)
-    user_info.max_buckets = op_state.get_max_buckets();
-
-  if (op_state.admin_specified)
-    user_info.admin = op_state.admin;
-
-  if (op_state.system_specified)
-    user_info.system = op_state.system;
-
-  if (op_state.temp_url_key_specified) {
-    map<int, string>::iterator iter;
-    for (iter = op_state.temp_url_keys.begin();
-         iter != op_state.temp_url_keys.end(); ++iter) {
-      user_info.temp_url_keys[iter->first] = iter->second;
-    }
-  }
-
-  if (op_state.op_mask_specified)
-    user_info.op_mask = op_state.get_op_mask();
-
-  if (op_state.has_bucket_quota())
-    user_info.quota.bucket_quota = op_state.get_bucket_quota();
-
-  if (op_state.has_user_quota())
-    user_info.quota.user_quota = op_state.get_user_quota();
-
-  if (op_state.has_suspension_op()) {
-    __u8 suspended = op_state.get_suspension_status();
-    user_info.suspended = suspended;
-
-    rgw::sal::BucketList buckets;
-
-    if (user_id.empty()) {
-      set_err_msg(err_msg, "empty user id passed...aborting");
-      return -EINVAL;
-    }
-
-    string marker;
-    CephContext *cct = driver->ctx();
-    size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
-    std::unique_ptr<rgw::sal::User> user = driver->get_user(user_id);
-    do {
-      ret = user->list_buckets(dpp, marker, string(), max_buckets, false, buckets, y);
-      if (ret < 0) {
-        set_err_msg(err_msg, "could not get buckets for uid:  " + user_id.to_str());
-        return ret;
-      }
-
-      auto& m = buckets.get_buckets();
-
-      vector<rgw_bucket> bucket_names;
-      for (auto iter = m.begin(); iter != m.end(); ++iter) {
-       auto& bucket = iter->second;
-        bucket_names.push_back(bucket->get_key());
-
-        marker = iter->first;
-      }
-
-      ret = driver->set_buckets_enabled(dpp, bucket_names, !suspended);
-      if (ret < 0) {
-        set_err_msg(err_msg, "failed to modify bucket");
-        return ret;
-      }
-
-    } while (buckets.is_truncated());
-  }
-
-  if (op_state.mfa_ids_specified) {
-    user_info.mfa_ids = op_state.mfa_ids;
-  }
-
-  if (op_state.default_placement_specified) {
-    user_info.default_placement = op_state.default_placement;
-  }
-
-  if (op_state.placement_tags_specified) {
-    user_info.placement_tags = op_state.placement_tags;
-  }
-
-  op_state.set_user_info(user_info);
-
-  // if we're supposed to modify keys, do so
-  if (op_state.has_key_op()) {
-    ret = keys.add(dpp, op_state, &subprocess_msg, true, y);
-    if (ret < 0) {
-      set_err_msg(err_msg, "unable to create or modify keys, " + subprocess_msg);
-      return ret;
-    }
-  }
-
-  ret = update(dpp, op_state, err_msg, y);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWUser::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
-{
-  std::string subprocess_msg;
-  int ret;
-
-  ret = check_op(op_state, &subprocess_msg);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
-    return ret;
-  }
-
-  ret = execute_modify(dpp, op_state, &subprocess_msg, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to modify user, " + subprocess_msg);
-    return ret;
-  }
-
-  return 0;
-}
-
-int RGWUser::info(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info,
-                 optional_yield y, std::string *err_msg)
-{
-  int ret = init(dpp, op_state, y);
-  if (ret < 0) {
-    set_err_msg(err_msg, "unable to fetch user info");
-    return ret;
-  }
-
-  fetched_info = op_state.get_user_info();
-
-  return 0;
-}
-
-int RGWUser::info(RGWUserInfo& fetched_info, std::string *err_msg)
-{
-  if (!is_populated()) {
-    set_err_msg(err_msg, "no user info saved");
-    return -EINVAL;
-  }
-
-  fetched_info = old_info;
-
-  return 0;
-}
-
-int RGWUser::list(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher)
-{
-  Formatter *formatter = flusher.get_formatter();
-  void *handle = nullptr;
-  std::string metadata_key = "user";
-  if (op_state.max_entries > 1000) {
-    op_state.max_entries = 1000;
-  }
-
-  int ret = driver->meta_list_keys_init(dpp, metadata_key, op_state.marker, &handle);
-  if (ret < 0) {
-    return ret;
-  }
-
-  bool truncated = false;
-  uint64_t count = 0;
-  uint64_t left = 0;
-  flusher.start(0);
-
-  // open the result object section
-  formatter->open_object_section("result");
-
-  // open the user id list array section
-  formatter->open_array_section("keys");
-  do {
-    std::list<std::string> keys;
-    left = op_state.max_entries - count;
-    ret = driver->meta_list_keys_next(dpp, handle, left, keys, &truncated);
-    if (ret < 0 && ret != -ENOENT) {
-      return ret;
-    } if (ret != -ENOENT) {
-      for (std::list<std::string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
-      formatter->dump_string("key", *iter);
-        ++count;
-      }
-    }
-  } while (truncated && left > 0);
-  // close user id list section
-  formatter->close_section();
-
-  formatter->dump_bool("truncated", truncated);
-  formatter->dump_int("count", count);
-  if (truncated) {
-    formatter->dump_string("marker", driver->meta_get_marker(handle));
-  }
-
-  // close result object section
-  formatter->close_section();
-
-  driver->meta_list_keys_complete(handle);
-
-  flusher.flush();
-  return 0;
-}
-
-int RGWUserAdminOp_User::list(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
-                  RGWFormatterFlusher& flusher)
-{
-  RGWUser user;
-
-  int ret = user.init_storage(driver);
-  if (ret < 0)
-    return ret;
-
-  ret = user.list(dpp, op_state, flusher);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWUserAdminOp_User::info(const DoutPrefixProvider *dpp,
-                             rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
-                             RGWFormatterFlusher& flusher,
-                             optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  std::unique_ptr<rgw::sal::User> ruser;
-
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-
-  if (!op_state.has_existing_user())
-    return -ERR_NO_SUCH_USER;
-
-  Formatter *formatter = flusher.get_formatter();
-
-  ret = user.info(info, NULL);
-  if (ret < 0)
-    return ret;
-
-  ruser = driver->get_user(info.user_id);
-
-  if (op_state.sync_stats) {
-    ret = rgw_user_sync_all_stats(dpp, driver, ruser.get(), y);
-    if (ret < 0) {
-      return ret;
-    }
-  }
-
-  RGWStorageStats stats;
-  RGWStorageStats *arg_stats = NULL;
-  if (op_state.fetch_stats) {
-    int ret = ruser->read_stats(dpp, y, &stats);
-    if (ret < 0 && ret != -ENOENT) {
-      return ret;
-    }
-
-    arg_stats = &stats;
-  }
-
-  if (formatter) {
-    flusher.start(0);
-
-    dump_user_info(formatter, info, arg_stats);
-    flusher.flush();
-  }
-
-  return 0;
-}
-
-int RGWUserAdminOp_User::create(const DoutPrefixProvider *dpp,
-                               rgw::sal::Driver* driver,
-                               RGWUserAdminOpState& op_state,
-                               RGWFormatterFlusher& flusher, optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-
-  Formatter *formatter = flusher.get_formatter();
-
-  ret = user.add(dpp, op_state, y, NULL);
-  if (ret < 0) {
-    if (ret == -EEXIST)
-      ret = -ERR_USER_EXIST;
-    return ret;
-  }
-
-  ret = user.info(info, NULL);
-  if (ret < 0)
-    return ret;
-
-  if (formatter) {
-    flusher.start(0);
-
-    dump_user_info(formatter, info);
-    flusher.flush();
-  }
-
-  return 0;
-}
-
-int RGWUserAdminOp_User::modify(const DoutPrefixProvider *dpp,
-                               rgw::sal::Driver* driver,
-                               RGWUserAdminOpState& op_state,
-                               RGWFormatterFlusher& flusher, optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-  Formatter *formatter = flusher.get_formatter();
-
-  ret = user.modify(dpp, op_state, y, NULL);
-  if (ret < 0) {
-    if (ret == -ENOENT)
-      ret = -ERR_NO_SUCH_USER;
-    return ret;
-  }
-
-  ret = user.info(info, NULL);
-  if (ret < 0)
-    return ret;
-
-  if (formatter) {
-    flusher.start(0);
-
-    dump_user_info(formatter, info);
-    flusher.flush();
-  }
-
-  return 0;
-}
-
-int RGWUserAdminOp_User::remove(const DoutPrefixProvider *dpp,
-                               rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
-                               RGWFormatterFlusher& flusher, optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-
-
-  ret = user.remove(dpp, op_state, y, NULL);
-
-  if (ret == -ENOENT)
-    ret = -ERR_NO_SUCH_USER;
-  return ret;
-}
-
-int RGWUserAdminOp_Subuser::create(const DoutPrefixProvider *dpp,
-                                  rgw::sal::Driver* driver,
-                                  RGWUserAdminOpState& op_state,
-                                  RGWFormatterFlusher& flusher,
-                                  optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-
-  if (!op_state.has_existing_user())
-    return -ERR_NO_SUCH_USER;
-
-  Formatter *formatter = flusher.get_formatter();
-
-  ret = user.subusers.add(dpp, op_state, y, NULL);
-  if (ret < 0)
-    return ret;
-
-  ret = user.info(info, NULL);
-  if (ret < 0)
-    return ret;
-
-  if (formatter) {
-    flusher.start(0);
-
-    dump_subusers_info(formatter, info);
-    flusher.flush();
-  }
-
-  return 0;
-}
-
-int RGWUserAdminOp_Subuser::modify(const DoutPrefixProvider *dpp,
-                                  rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
-                                  RGWFormatterFlusher& flusher, optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-
-  if (!op_state.has_existing_user())
-    return -ERR_NO_SUCH_USER;
-
-  Formatter *formatter = flusher.get_formatter();
-
-  ret = user.subusers.modify(dpp, op_state, y, NULL);
-  if (ret < 0)
-    return ret;
-
-  ret = user.info(info, NULL);
-  if (ret < 0)
-    return ret;
-  if (formatter) {
-    flusher.start(0);
-
-    dump_subusers_info(formatter, info);
-    flusher.flush();
-  }
-
-  return 0;
-}
-
-int RGWUserAdminOp_Subuser::remove(const DoutPrefixProvider *dpp,
-                                  rgw::sal::Driver* driver,
-                                  RGWUserAdminOpState& op_state,
-                                  RGWFormatterFlusher& flusher,
-                                  optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-
-
-  if (!op_state.has_existing_user())
-    return -ERR_NO_SUCH_USER;
-
-  ret = user.subusers.remove(dpp, op_state, y, NULL);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWUserAdminOp_Key::create(const DoutPrefixProvider *dpp,
-                              rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
-                              RGWFormatterFlusher& flusher,
-                              optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-
-  if (!op_state.has_existing_user())
-    return -ERR_NO_SUCH_USER;
-
-  Formatter *formatter = flusher.get_formatter();
-
-  ret = user.keys.add(dpp, op_state, y, NULL);
-  if (ret < 0)
-    return ret;
-
-  ret = user.info(info, NULL);
-  if (ret < 0)
-    return ret;
-
-  if (formatter) {
-    flusher.start(0);
-
-    int key_type = op_state.get_key_type();
-
-    if (key_type == KEY_TYPE_SWIFT)
-      dump_swift_keys_info(formatter, info);
-
-    else if (key_type == KEY_TYPE_S3)
-      dump_access_keys_info(formatter, info);
-
-    flusher.flush();
-  }
-
-  return 0;
-}
-
-int RGWUserAdminOp_Key::remove(const DoutPrefixProvider *dpp,
-                              rgw::sal::Driver* driver,
-                              RGWUserAdminOpState& op_state,
-                              RGWFormatterFlusher& flusher,
-                              optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-
-  if (!op_state.has_existing_user())
-    return -ERR_NO_SUCH_USER;
-
-
-  ret = user.keys.remove(dpp, op_state, y, NULL);
-  if (ret < 0)
-    return ret;
-
-  return 0;
-}
-
-int RGWUserAdminOp_Caps::add(const DoutPrefixProvider *dpp,
-                            rgw::sal::Driver* driver,
-                            RGWUserAdminOpState& op_state,
-                            RGWFormatterFlusher& flusher, optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-
-  if (!op_state.has_existing_user())
-    return -ERR_NO_SUCH_USER;
-
-  Formatter *formatter = flusher.get_formatter();
-
-  ret = user.caps.add(dpp, op_state, y, NULL);
-  if (ret < 0)
-    return ret;
-
-  ret = user.info(info, NULL);
-  if (ret < 0)
-    return ret;
-
-  if (formatter) {
-    flusher.start(0);
-
-    info.caps.dump(formatter);
-    flusher.flush();
-  }
-
-  return 0;
-}
-
-
-int RGWUserAdminOp_Caps::remove(const DoutPrefixProvider *dpp,
-                               rgw::sal::Driver* driver,
-                               RGWUserAdminOpState& op_state,
-                               RGWFormatterFlusher& flusher, optional_yield y)
-{
-  RGWUserInfo info;
-  RGWUser user;
-  int ret = user.init(dpp, driver, op_state, y);
-  if (ret < 0)
-    return ret;
-
-  if (!op_state.has_existing_user())
-    return -ERR_NO_SUCH_USER;
-
-  Formatter *formatter = flusher.get_formatter();
-
-  ret = user.caps.remove(dpp, op_state, y, NULL);
-  if (ret < 0)
-    return ret;
-
-  ret = user.info(info, NULL);
-  if (ret < 0)
-    return ret;
-
-  if (formatter) {
-    flusher.start(0);
-
-    info.caps.dump(formatter);
-    flusher.flush();
-  }
-
-  return 0;
-}
-
-class RGWUserMetadataHandler : public RGWMetadataHandler_GenericMetaBE {
-public:
-  struct Svc {
-    RGWSI_User *user{nullptr};
-  } svc;
-
-  RGWUserMetadataHandler(RGWSI_User *user_svc) {
-    base_init(user_svc->ctx(), user_svc->get_be_handler());
-    svc.user = user_svc;
-  }
-
-  ~RGWUserMetadataHandler() {}
-
-  string get_type() override { return "user"; }
-
-  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
-    RGWUserCompleteInfo uci;
-    RGWObjVersionTracker objv_tracker;
-    real_time mtime;
-
-    rgw_user user = RGWSI_User::user_from_meta_key(entry);
-
-    int ret = svc.user->read_user_info(op->ctx(), user, &uci.info, &objv_tracker,
-                                       &mtime, nullptr, &uci.attrs,
-                                       y, dpp);
-    if (ret < 0) {
-      return ret;
-    }
-
-    RGWUserMetadataObject *mdo = new RGWUserMetadataObject(uci, objv_tracker.read_version, mtime);
-    *obj = mdo;
-
-    return 0;
-  }
-
-  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
-    RGWUserCompleteInfo uci;
-
-    try {
-      decode_json_obj(uci, jo);
-    } catch (JSONDecoder::err& e) {
-      return nullptr;
-    }
-
-    return new RGWUserMetadataObject(uci, objv, mtime);
-  }
-
-  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
-             RGWMetadataObject *obj,
-             RGWObjVersionTracker& objv_tracker,
-             optional_yield y, const DoutPrefixProvider *dpp,
-             RGWMDLogSyncType type, bool from_remote_zone) override;
-
-  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
-                optional_yield y, const DoutPrefixProvider *dpp) override {
-    RGWUserInfo info;
-
-    rgw_user user = RGWSI_User::user_from_meta_key(entry);
-
-    int ret = svc.user->read_user_info(op->ctx(), user, &info, nullptr,
-                                       nullptr, nullptr, nullptr,
-                                       y, dpp);
-    if (ret < 0) {
-      return ret;
-    }
-
-    return svc.user->remove_user_info(op->ctx(), info, &objv_tracker,
-                                      y, dpp);
-  }
-};
-
-class RGWMetadataHandlerPut_User : public RGWMetadataHandlerPut_SObj
-{
-  RGWUserMetadataHandler *uhandler;
-  RGWUserMetadataObject *uobj;
-public:
-  RGWMetadataHandlerPut_User(RGWUserMetadataHandler *_handler,
-                             RGWSI_MetaBackend_Handler::Op *op, string& entry,
-                             RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
-                             optional_yield y,
-                             RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, op, entry, obj, objv_tracker, y, type, from_remote_zone),
-                                                                uhandler(_handler) {
-    uobj = static_cast<RGWUserMetadataObject *>(obj);
-  }
-
-  int put_checked(const DoutPrefixProvider *dpp) override;
-};
-
-int RGWUserMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
-                                   RGWMetadataObject *obj,
-                                   RGWObjVersionTracker& objv_tracker,
-                                   optional_yield y, const DoutPrefixProvider *dpp,
-                                   RGWMDLogSyncType type, bool from_remote_zone)
-{
-  RGWMetadataHandlerPut_User put_op(this, op, entry, obj, objv_tracker, y, type, from_remote_zone);
-  return do_put_operate(&put_op, dpp);
-}
-
-int RGWMetadataHandlerPut_User::put_checked(const DoutPrefixProvider *dpp)
-{
-  RGWUserMetadataObject *orig_obj = static_cast<RGWUserMetadataObject *>(old_obj);
-  RGWUserCompleteInfo& uci = uobj->get_uci();
-
-  map<string, bufferlist> *pattrs{nullptr};
-  if (uci.has_attrs) {
-    pattrs = &uci.attrs;
-  }
-
-  RGWUserInfo *pold_info = (orig_obj ? &orig_obj->get_uci().info : nullptr);
-
-  auto mtime = obj->get_mtime();
-
-  int ret = uhandler->svc.user->store_user_info(op->ctx(), uci.info, pold_info,
-                                               &objv_tracker, mtime,
-                                               false, pattrs, y, dpp);
-  if (ret < 0) {
-    return ret;
-  }
-
-  return STATUS_APPLIED;
-}
-
-
-RGWUserCtl::RGWUserCtl(RGWSI_Zone *zone_svc,
-                       RGWSI_User *user_svc,
-                       RGWUserMetadataHandler *_umhandler) : umhandler(_umhandler) {
-  svc.zone = zone_svc;
-  svc.user = user_svc;
-  be_handler = umhandler->get_be_handler();
-}
-
-template <class T>
-class optional_default
-{
-  const std::optional<T>& opt;
-  std::optional<T> def;
-  const T *p;
-public:
-  optional_default(const std::optional<T>& _o) : opt(_o) {
-    if (opt) {
-      p = &(*opt);
-    } else {
-      def = T();
-      p = &(*def);
-    }
-  }
-
-  const T *operator->() {
-    return p;
-  }
-
-  const T& operator*() {
-    return *p;
-  }
-};
-
-int RGWUserCtl::get_info_by_uid(const DoutPrefixProvider *dpp, 
-                                const rgw_user& uid,
-                                RGWUserInfo *info,
-                                optional_yield y,
-                                const GetParams& params)
-
-{
-  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
-    return svc.user->read_user_info(op->ctx(),
-                                    uid,
-                                    info,
-                                    params.objv_tracker,
-                                    params.mtime,
-                                    params.cache_info,
-                                    params.attrs,
-                                    y,
-                                    dpp);
-  });
-}
-
-int RGWUserCtl::get_info_by_email(const DoutPrefixProvider *dpp, 
-                                  const string& email,
-                                  RGWUserInfo *info,
-                                  optional_yield y,
-                                  const GetParams& params)
-{
-  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
-    return svc.user->get_user_info_by_email(op->ctx(), email,
-                                            info,
-                                            params.objv_tracker,
-                                            params.mtime,
-                                            y,
-                                            dpp);
-  });
-}
-
-int RGWUserCtl::get_info_by_swift(const DoutPrefixProvider *dpp, 
-                                  const string& swift_name,
-                                  RGWUserInfo *info,
-                                  optional_yield y,
-                                  const GetParams& params)
-{
-  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
-    return svc.user->get_user_info_by_swift(op->ctx(), swift_name,
-                                            info,
-                                            params.objv_tracker,
-                                            params.mtime,
-                                            y,
-                                            dpp);
-  });
-}
-
-int RGWUserCtl::get_info_by_access_key(const DoutPrefixProvider *dpp, 
-                                       const string& access_key,
-                                       RGWUserInfo *info,
-                                       optional_yield y,
-                                       const GetParams& params)
-{
-  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
-    return svc.user->get_user_info_by_access_key(op->ctx(), access_key,
-                                                 info,
-                                                 params.objv_tracker,
-                                                 params.mtime,
-                                                 y,
-                                                 dpp);
-  });
-}
-
-int RGWUserCtl::get_attrs_by_uid(const DoutPrefixProvider *dpp, 
-                                 const rgw_user& user_id,
-                                 map<string, bufferlist> *pattrs,
-                                 optional_yield y,
-                                 RGWObjVersionTracker *objv_tracker)
-{
-  RGWUserInfo user_info;
-
-  return get_info_by_uid(dpp, user_id, &user_info, y, RGWUserCtl::GetParams()
-                         .set_attrs(pattrs)
-                         .set_objv_tracker(objv_tracker));
-}
-
-int RGWUserCtl::store_info(const DoutPrefixProvider *dpp, 
-                           const RGWUserInfo& info, optional_yield y,
-                           const PutParams& params)
-{
-  string key = RGWSI_User::get_meta_key(info.user_id);
-
-  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
-    return svc.user->store_user_info(op->ctx(), info,
-                                     params.old_info,
-                                     params.objv_tracker,
-                                     params.mtime,
-                                     params.exclusive,
-                                     params.attrs,
-                                     y,
-                                     dpp);
-  });
-}
-
-int RGWUserCtl::remove_info(const DoutPrefixProvider *dpp, 
-                            const RGWUserInfo& info, optional_yield y,
-                            const RemoveParams& params)
-
-{
-  string key = RGWSI_User::get_meta_key(info.user_id);
-
-  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
-    return svc.user->remove_user_info(op->ctx(), info,
-                                      params.objv_tracker,
-                                      y, dpp);
-  });
-}
-
-int RGWUserCtl::list_buckets(const DoutPrefixProvider *dpp, 
-                             const rgw_user& user,
-                             const string& marker,
-                             const string& end_marker,
-                             uint64_t max,
-                             bool need_stats,
-                             RGWUserBuckets *buckets,
-                             bool *is_truncated,
-                            optional_yield y,
-                             uint64_t default_max)
-{
-  if (!max) {
-    max = default_max;
-  }
-
-  int ret = svc.user->list_buckets(dpp, user, marker, end_marker,
-                                   max, buckets, is_truncated, y);
-  if (ret < 0) {
-    return ret;
-  }
-  if (need_stats) {
-    map<string, RGWBucketEnt>& m = buckets->get_buckets();
-    ret = ctl.bucket->read_buckets_stats(m, y, dpp);
-    if (ret < 0 && ret != -ENOENT) {
-      ldpp_dout(dpp, 0) << "ERROR: could not get stats for buckets" << dendl;
-      return ret;
-    }
-  }
-  return 0;
-}
-
-int RGWUserCtl::read_stats(const DoutPrefixProvider *dpp, 
-                           const rgw_user& user, RGWStorageStats *stats,
-                          optional_yield y,
-                          ceph::real_time *last_stats_sync,
-                          ceph::real_time *last_stats_update)
-{
-  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
-    return svc.user->read_stats(dpp, op->ctx(), user, stats,
-                               last_stats_sync, last_stats_update, y);
-  });
-}
-
-RGWMetadataHandler *RGWUserMetaHandlerAllocator::alloc(RGWSI_User *user_svc) {
-  return new RGWUserMetadataHandler(user_svc);
-}
-
-void rgw_user::dump(Formatter *f) const
-{
-  ::encode_json("user", *this, f);
-}
-
diff --git a/src/rgw/store/rados/rgw_user.h b/src/rgw/store/rados/rgw_user.h
deleted file mode 100644 (file)
index 110124c..0000000
+++ /dev/null
@@ -1,887 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_USER_H
-#define CEPH_RGW_USER_H
-
-#include <string>
-#include <boost/algorithm/string.hpp>
-#include "include/ceph_assert.h"
-
-#include "include/types.h"
-#include "rgw_common.h"
-#include "rgw_tools.h"
-
-#include "rgw_string.h"
-
-#include "common/Formatter.h"
-#include "rgw_formats.h"
-#include "rgw_metadata.h"
-#include "rgw_sal_fwd.h"
-
-#define RGW_USER_ANON_ID "anonymous"
-
-#define SECRET_KEY_LEN 40
-#define PUBLIC_ID_LEN 20
-#define RAND_SUBUSER_LEN 5
-
-#define XMLNS_AWS_S3 "http://s3.amazonaws.com/doc/2006-03-01/"
-
-class RGWUserCtl;
-class RGWBucketCtl;
-class RGWUserBuckets;
-
-class RGWGetUserStats_CB;
-
-/**
- * A string wrapper that includes encode/decode functions
- * for easily accessing a UID in all forms
- */
-struct RGWUID
-{
-  rgw_user user_id;
-  void encode(bufferlist& bl) const {
-    std::string s;
-    user_id.to_str(s);
-    using ceph::encode;
-    encode(s, bl);
-  }
-  void decode(bufferlist::const_iterator& bl) {
-    std::string s;
-    using ceph::decode;
-    decode(s, bl);
-    user_id.from_str(s);
-  }
-};
-WRITE_CLASS_ENCODER(RGWUID)
-
-/** Entry for bucket metadata collection */
-struct bucket_meta_entry {
-  size_t size;
-  size_t size_rounded;
-  ceph::real_time creation_time;
-  uint64_t count;
-};
-
-extern int rgw_user_sync_all_stats(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::User* user, optional_yield y);
-extern int rgw_user_get_all_buckets_stats(const DoutPrefixProvider *dpp,
-  rgw::sal::Driver* driver, rgw::sal::User* user,
-  std::map<std::string, bucket_meta_entry>& buckets_usage_map, optional_yield y);
-
-/**
- * Get the anonymous (ie, unauthenticated) user info.
- */
-extern void rgw_get_anon_user(RGWUserInfo& info);
-
-extern void rgw_perm_to_str(uint32_t mask, char *buf, int len);
-extern uint32_t rgw_str_to_perm(const char *str);
-
-extern int rgw_validate_tenant_name(const std::string& t);
-
-enum ObjectKeyType {
-  KEY_TYPE_SWIFT,
-  KEY_TYPE_S3,
-  KEY_TYPE_UNDEFINED
-};
-
-enum RGWKeyPoolOp {
-  GENERATE_KEY,
-  MODIFY_KEY
-};
-
-enum RGWUserId {
-  RGW_USER_ID,
-  RGW_SWIFT_USERNAME,
-  RGW_USER_EMAIL,
-  RGW_ACCESS_KEY,
-};
-
-/*
- * An RGWUser class along with supporting classes created
- * to support the creation of an RESTful administrative API
- */
-struct RGWUserAdminOpState {
-  // user attributes
-  std::unique_ptr<rgw::sal::User> user;
-  std::string user_email;
-  std::string display_name;
-  rgw_user new_user_id;
-  bool overwrite_new_user = false;
-  int32_t max_buckets{RGW_DEFAULT_MAX_BUCKETS};
-  __u8 suspended{0};
-  __u8 admin{0};
-  __u8 system{0};
-  __u8 exclusive{0};
-  __u8 fetch_stats{0};
-  __u8 sync_stats{0};
-  std::string caps;
-  RGWObjVersionTracker objv;
-  uint32_t op_mask{0};
-  std::map<int, std::string> temp_url_keys;
-
-  // subuser attributes
-  std::string subuser;
-  uint32_t perm_mask{RGW_PERM_NONE};
-
-  // key_attributes
-  std::string id; // access key
-  std::string key; // secret key
-  int32_t key_type{-1};
-  bool access_key_exist = false;
-
-  std::set<std::string> mfa_ids;
-
-  // operation attributes
-  bool existing_user{false};
-  bool existing_key{false};
-  bool existing_subuser{false};
-  bool existing_email{false};
-  bool subuser_specified{false};
-  bool gen_secret{false};
-  bool gen_access{false};
-  bool gen_subuser{false};
-  bool id_specified{false};
-  bool key_specified{false};
-  bool type_specified{false};
-  bool key_type_setbycontext{false};   // key type set by user or subuser context
-  bool purge_data{false};
-  bool purge_keys{false};
-  bool display_name_specified{false};
-  bool user_email_specified{false};
-  bool max_buckets_specified{false};
-  bool perm_specified{false};
-  bool op_mask_specified{false};
-  bool caps_specified{false};
-  bool suspension_op{false};
-  bool admin_specified{false};
-  bool system_specified{false};
-  bool key_op{false};
-  bool temp_url_key_specified{false};
-  bool found_by_uid{false};
-  bool found_by_email{false};
-  bool found_by_key{false};
-  bool mfa_ids_specified{false};
-  // req parameters
-  bool populated{false};
-  bool initialized{false};
-  bool key_params_checked{false};
-  bool subuser_params_checked{false};
-  bool user_params_checked{false};
-
-  bool bucket_quota_specified{false};
-  bool user_quota_specified{false};
-  bool bucket_ratelimit_specified{false};
-  bool user_ratelimit_specified{false};
-
-  RGWQuota quota;
-  RGWRateLimitInfo user_ratelimit;
-  RGWRateLimitInfo bucket_ratelimit;
-
-  // req parameters for listing user
-  std::string marker{""};
-  uint32_t max_entries{1000};
-  rgw_placement_rule default_placement; // user default placement
-  bool default_placement_specified{false};
-
-  std::list<std::string> placement_tags;  // user default placement_tags
-  bool placement_tags_specified{false};
-
-  void set_access_key(const std::string& access_key) {
-    if (access_key.empty())
-      return;
-
-    id = access_key;
-    id_specified = true;
-    gen_access = false;
-    key_op = true;
-  }
-
-  void set_secret_key(const std::string& secret_key) {
-    if (secret_key.empty())
-      return;
-
-    key = secret_key;
-    key_specified = true;
-    gen_secret = false;
-    key_op = true;
-  }
-
-  void set_user_id(const rgw_user& id);
-
-  void set_new_user_id(const rgw_user& id) {
-    if (id.empty())
-      return;
-
-    new_user_id = id;
-  }
-  void set_overwrite_new_user(bool b) {
-    overwrite_new_user = b;
-  }
-
-  void set_user_email(std::string& email) {
-   /* always lowercase email address */
-    boost::algorithm::to_lower(email);
-    user_email = email;
-    user_email_specified = true;
-  }
-
-  void set_display_name(const std::string& name) {
-    if (name.empty())
-      return;
-
-    display_name = name;
-    display_name_specified = true;
-  }
-
-  void set_subuser(std::string& _subuser);
-
-  void set_caps(const std::string& _caps) {
-    if (_caps.empty())
-      return;
-
-    caps = _caps;
-    caps_specified = true;
-  }
-
-  void set_perm(uint32_t perm) {
-    perm_mask = perm;
-    perm_specified = true;
-  }
-
-  void set_op_mask(uint32_t mask) {
-    op_mask = mask;
-    op_mask_specified = true;
-  }
-
-  void set_temp_url_key(const std::string& key, int index) {
-    temp_url_keys[index] = key;
-    temp_url_key_specified = true;
-  }
-
-  void set_key_type(int32_t type) {
-    key_type = type;
-    type_specified = true;
-  }
-
-  void set_access_key_exist() {
-    access_key_exist = true;
-  }
-
-  void set_suspension(__u8 is_suspended) {
-    suspended = is_suspended;
-    suspension_op = true;
-  }
-
-  void set_admin(__u8 is_admin) {
-    admin = is_admin;
-    admin_specified = true;
-  }
-
-  void set_system(__u8 is_system) {
-    system = is_system;
-    system_specified = true;
-  }
-
-  void set_exclusive(__u8 is_exclusive) {
-    exclusive = is_exclusive;
-  }
-
-  void set_fetch_stats(__u8 is_fetch_stats) {
-    fetch_stats = is_fetch_stats;
-  }
-
-  void set_sync_stats(__u8 is_sync_stats) {
-    sync_stats = is_sync_stats;
-  }
-
-  void set_user_info(RGWUserInfo& user_info);
-
-  void set_user_version_tracker(RGWObjVersionTracker& objv_tracker);
-
-  void set_max_buckets(int32_t mb) {
-    max_buckets = mb;
-    max_buckets_specified = true;
-  }
-
-  void set_gen_access() {
-    gen_access = true;
-    key_op = true;
-  }
-
-  void set_gen_secret() {
-    gen_secret = true;
-    key_op = true;
-  }
-
-  void set_generate_key() {
-    if (id.empty())
-      gen_access = true;
-    if (key.empty())
-      gen_secret = true;
-    key_op = true;
-  }
-
-  void clear_generate_key() {
-    gen_access = false;
-    gen_secret = false;
-  }
-
-  void set_purge_keys() {
-    purge_keys = true;
-    key_op = true;
-  }
-
-  void set_bucket_quota(RGWQuotaInfo& quotas) {
-    quota.bucket_quota = quotas;
-    bucket_quota_specified = true;
-  }
-
-  void set_user_quota(RGWQuotaInfo& quotas) {
-    quota.user_quota = quotas;
-    user_quota_specified = true;
-  }
-
-  void set_bucket_ratelimit(RGWRateLimitInfo& ratelimit) {
-    bucket_ratelimit = ratelimit;
-    bucket_ratelimit_specified = true;
-  }
-
-  void set_user_ratelimit(RGWRateLimitInfo& ratelimit) {
-    user_ratelimit = ratelimit;
-    user_ratelimit_specified = true;
-  }
-
-  void set_mfa_ids(const std::set<std::string>& ids) {
-    mfa_ids = ids;
-    mfa_ids_specified = true;
-  }
-
-  void set_default_placement(const rgw_placement_rule& _placement) {
-    default_placement = _placement;
-    default_placement_specified = true;
-  }
-
-  void set_placement_tags(const std::list<std::string>& _tags) {
-    placement_tags = _tags;
-    placement_tags_specified = true;
-  }
-
-  bool is_populated() { return populated; }
-  bool is_initialized() { return initialized; }
-  bool has_existing_user() { return existing_user; }
-  bool has_existing_key() { return existing_key; }
-  bool has_existing_subuser() { return existing_subuser; }
-  bool has_existing_email() { return existing_email; }
-  bool has_subuser() { return subuser_specified; }
-  bool has_key_op() { return key_op; }
-  bool has_caps_op() { return caps_specified; }
-  bool has_suspension_op() { return suspension_op; }
-  bool has_subuser_perm() { return perm_specified; }
-  bool has_op_mask() { return op_mask_specified; }
-  bool will_gen_access() { return gen_access; }
-  bool will_gen_secret() { return gen_secret; }
-  bool will_gen_subuser() { return gen_subuser; }
-  bool will_purge_keys() { return purge_keys; }
-  bool will_purge_data() { return purge_data; }
-  bool will_generate_subuser() { return gen_subuser; }
-  bool has_bucket_quota() { return bucket_quota_specified; }
-  bool has_user_quota() { return user_quota_specified; }
-  void set_populated() { populated = true; }
-  void clear_populated() { populated = false; }
-  void set_initialized() { initialized = true; }
-  void set_existing_user(bool flag) { existing_user = flag; }
-  void set_existing_key(bool flag) { existing_key = flag; }
-  void set_existing_subuser(bool flag) { existing_subuser = flag; }
-  void set_existing_email(bool flag) { existing_email = flag; }
-  void set_purge_data(bool flag) { purge_data = flag; }
-  void set_generate_subuser(bool flag) { gen_subuser = flag; }
-  __u8 get_suspension_status() { return suspended; }
-  int32_t get_key_type() {return key_type; }
-  bool get_access_key_exist() {return access_key_exist; }
-  uint32_t get_subuser_perm() { return perm_mask; }
-  int32_t get_max_buckets() { return max_buckets; }
-  uint32_t get_op_mask() { return op_mask; }
-  RGWQuotaInfo& get_bucket_quota() { return quota.bucket_quota; }
-  RGWQuotaInfo& get_user_quota() { return quota.user_quota; }
-  std::set<std::string>& get_mfa_ids() { return mfa_ids; }
-
-  rgw::sal::User* get_user() { return user.get(); }
-  const rgw_user& get_user_id();
-  std::string get_subuser() { return subuser; }
-  std::string get_access_key() { return id; }
-  std::string get_secret_key() { return key; }
-  std::string get_caps() { return caps; }
-  std::string get_user_email() { return user_email; }
-  std::string get_display_name() { return display_name; }
-  rgw_user& get_new_uid() { return new_user_id; }
-  bool get_overwrite_new_user() const { return overwrite_new_user; }
-  std::map<int, std::string>& get_temp_url_keys() { return temp_url_keys; }
-
-  RGWUserInfo&  get_user_info();
-
-  std::map<std::string, RGWAccessKey>* get_swift_keys();
-  std::map<std::string, RGWAccessKey>* get_access_keys();
-  std::map<std::string, RGWSubUser>* get_subusers();
-
-  RGWUserCaps* get_caps_obj();
-
-  std::string build_default_swift_kid();
-
-  std::string generate_subuser();
-
-  RGWUserAdminOpState(rgw::sal::Driver* driver);
-};
-
-class RGWUser;
-
-class RGWAccessKeyPool
-{
-  RGWUser *user{nullptr};
-
-  std::map<std::string, int, ltstr_nocase> key_type_map;
-  rgw_user user_id;
-  rgw::sal::Driver* driver{nullptr};
-
-  std::map<std::string, RGWAccessKey> *swift_keys{nullptr};
-  std::map<std::string, RGWAccessKey> *access_keys{nullptr};
-
-  // we don't want to allow keys for the anonymous user or a null user
-  bool keys_allowed{false};
-
-private:
-  int create_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
-  int generate_key(const DoutPrefixProvider *dpp, 
-                   RGWUserAdminOpState& op_state, optional_yield y,
-                  std::string *err_msg = NULL);
-  int modify_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
-
-  int check_key_owner(RGWUserAdminOpState& op_state);
-  bool check_existing_key(RGWUserAdminOpState& op_state);
-  int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
-
-  /* API Contract Fulfilment */
-  int execute_add(const DoutPrefixProvider *dpp, 
-                  RGWUserAdminOpState& op_state, std::string *err_msg,
-                 bool defer_save, optional_yield y);
-  int execute_remove(const DoutPrefixProvider *dpp, 
-                     RGWUserAdminOpState& op_state, std::string *err_msg,
-                    bool defer_save, optional_yield y);
-  int remove_subuser_keys(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
-                         bool defer_save, optional_yield y);
-
-  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
-         optional_yield y);
-  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
-            bool defer_save, optional_yield y);
-public:
-  explicit RGWAccessKeyPool(RGWUser* usr);
-
-  int init(RGWUserAdminOpState& op_state);
-
-  /* API Contracted Methods */
-  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
-         std::string *err_msg = NULL);
-  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
-            std::string *err_msg = NULL);
-
-  friend class RGWUser;
-  friend class RGWSubUserPool;
-};
-
-class RGWSubUserPool
-{
-  RGWUser *user{nullptr};
-
-  rgw_user user_id;
-  rgw::sal::Driver* driver{nullptr};
-  bool subusers_allowed{false};
-
-  std::map<std::string, RGWSubUser> *subuser_map{nullptr};
-
-private:
-  int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
-
-  /* API Contract Fulfillment */
-  int execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
-  int execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
-  int execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
-
-  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
-         optional_yield y);
-  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
-  int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg, bool defer_save);
-public:
-  explicit RGWSubUserPool(RGWUser *user);
-
-  bool exists(std::string subuser);
-  int init(RGWUserAdminOpState& op_state);
-
-  /* API contracted methods */
-  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
-         std::string *err_msg = NULL);
-  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
-  int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
-
-  friend class RGWUser;
-};
-
-class RGWUserCapPool
-{
-  RGWUserCaps *caps{nullptr};
-  bool caps_allowed{false};
-  RGWUser *user{nullptr};
-
-private:
-  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
-         optional_yield y);
-  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
-            optional_yield y);
-
-public:
-  explicit RGWUserCapPool(RGWUser *user);
-
-  int init(RGWUserAdminOpState& op_state);
-
-  /* API contracted methods */
-  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
-         std::string *err_msg = NULL);
-  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
-
-  friend class RGWUser;
-};
-
-class RGWUser
-{
-
-private:
-  RGWUserInfo old_info;
-  rgw::sal::Driver* driver{nullptr};
-
-  rgw_user user_id;
-  bool info_stored{false};
-
-  void set_populated() { info_stored = true; }
-  void clear_populated() { info_stored = false; }
-  bool is_populated() { return info_stored; }
-
-  int check_op(RGWUserAdminOpState&  req, std::string *err_msg);
-  int update(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);
-
-  void clear_members();
-  void init_default();
-
-  /* API Contract Fulfillment */
-  int execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
-                 optional_yield y);
-  int execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state,
-                    std::string *err_msg, optional_yield y);
-  int execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);
-  int execute_rename(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);
-
-public:
-  RGWUser();
-
-  int init(const DoutPrefixProvider *dpp, rgw::sal::Driver* storage, RGWUserAdminOpState& op_state,
-          optional_yield y);
-
-  int init_storage(rgw::sal::Driver* storage);
-  int init(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y);
-  int init_members(RGWUserAdminOpState& op_state);
-
-  rgw::sal::Driver* get_driver() { return driver; }
-
-  /* API Contracted Members */
-  RGWUserCapPool caps;
-  RGWAccessKeyPool keys;
-  RGWSubUserPool subusers;
-
-  /* API Contracted Methods */
-  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
-
-  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
-
-  int rename(RGWUserAdminOpState& op_state, optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
-
-  /* remove an already populated RGWUser */
-  int remove(std::string *err_msg = NULL);
-
-  int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
-
-  /* retrieve info from an existing user in the RGW system */
-  int info(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info, optional_yield y,
-          std::string *err_msg = NULL);
-
-  /* info from an already populated RGWUser */
-  int info (RGWUserInfo& fetched_info, std::string *err_msg = NULL);
-
-  /* list the existing users */
-  int list(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher);
-
-  friend class RGWAccessKeyPool;
-  friend class RGWSubUserPool;
-  friend class RGWUserCapPool;
-};
-
-/* Wrappers for admin API functionality */
-
-class RGWUserAdminOp_User
-{
-public:
-  static int list(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
-                  RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher);
-
-  static int info(const DoutPrefixProvider *dpp,
-                 rgw::sal::Driver* driver,
-                  RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
-                 optional_yield y);
-
-  static int create(const DoutPrefixProvider *dpp,
-                   rgw::sal::Driver* driver,
-                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
-                   optional_yield y);
-
-  static int modify(const DoutPrefixProvider *dpp,
-                   rgw::sal::Driver* driver,
-                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y);
-
-  static int remove(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
-                  RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y);
-};
-
-class RGWUserAdminOp_Subuser
-{
-public:
-  static int create(const DoutPrefixProvider *dpp,
-                   rgw::sal::Driver* driver,
-                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
-                   optional_yield y);
-
-  static int modify(const DoutPrefixProvider *dpp,
-                   rgw::sal::Driver* driver,
-                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
-                   optional_yield y);
-
-  static int remove(const DoutPrefixProvider *dpp,
-                   rgw::sal::Driver* driver,
-                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
-                   optional_yield y);
-};
-
-class RGWUserAdminOp_Key
-{
-public:
-  static int create(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
-                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
-                   optional_yield y);
-
-  static int remove(const DoutPrefixProvider *dpp,
-                   rgw::sal::Driver* driver,
-                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
-                   optional_yield y);
-};
-
-class RGWUserAdminOp_Caps
-{
-public:
-  static int add(const DoutPrefixProvider *dpp,
-                rgw::sal::Driver* driver,
-                RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
-                optional_yield y);
-
-  static int remove(const DoutPrefixProvider *dpp,
-                   rgw::sal::Driver* driver,
-                   RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
-                   optional_yield y);
-};
-
-struct RGWUserCompleteInfo {
-  RGWUserInfo info;
-  std::map<std::string, bufferlist> attrs;
-  bool has_attrs{false};
-
-  void dump(Formatter * const f) const {
-    info.dump(f);
-    encode_json("attrs", attrs, f);
-  }
-
-  void decode_json(JSONObj *obj) {
-    decode_json_obj(info, obj);
-    has_attrs = JSONDecoder::decode_json("attrs", attrs, obj);
-  }
-};
-
-class RGWUserMetadataObject : public RGWMetadataObject {
-  RGWUserCompleteInfo uci;
-public:
-  RGWUserMetadataObject() {}
-  RGWUserMetadataObject(const RGWUserCompleteInfo& _uci, const obj_version& v, real_time m)
-      : uci(_uci) {
-    objv = v;
-    mtime = m;
-  }
-
-  void dump(Formatter *f) const override {
-    uci.dump(f);
-  }
-
-  RGWUserCompleteInfo& get_uci() {
-    return uci;
-  }
-};
-
-class RGWUserMetadataHandler;
-
-class RGWUserCtl
-{
-  struct Svc {
-    RGWSI_Zone *zone{nullptr};
-    RGWSI_User *user{nullptr};
-  } svc;
-
-  struct Ctl {
-    RGWBucketCtl *bucket{nullptr};
-  } ctl;
-
-  RGWUserMetadataHandler *umhandler;
-  RGWSI_MetaBackend_Handler *be_handler{nullptr};
-  
-public:
-  RGWUserCtl(RGWSI_Zone *zone_svc,
-             RGWSI_User *user_svc,
-             RGWUserMetadataHandler *_umhandler);
-
-  void init(RGWBucketCtl *bucket_ctl) {
-    ctl.bucket = bucket_ctl;
-  }
-
-  RGWBucketCtl *get_bucket_ctl() {
-    return ctl.bucket;
-  }
-
-  struct GetParams {
-    RGWObjVersionTracker *objv_tracker{nullptr};
-    ceph::real_time *mtime{nullptr};
-    rgw_cache_entry_info *cache_info{nullptr};
-    std::map<std::string, bufferlist> *attrs{nullptr};
-
-    GetParams() {}
-
-    GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-      objv_tracker = _objv_tracker;
-      return *this;
-    }
-
-    GetParams& set_mtime(ceph::real_time *_mtime) {
-      mtime = _mtime;
-      return *this;
-    }
-
-    GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
-      cache_info = _cache_info;
-      return *this;
-    }
-
-    GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
-      attrs = _attrs;
-      return *this;
-    }
-  };
-
-  struct PutParams {
-    RGWUserInfo *old_info{nullptr};
-    RGWObjVersionTracker *objv_tracker{nullptr};
-    ceph::real_time mtime;
-    bool exclusive{false};
-    std::map<std::string, bufferlist> *attrs{nullptr};
-
-    PutParams() {}
-
-    PutParams& set_old_info(RGWUserInfo *_info) {
-      old_info = _info;
-      return *this;
-    }
-
-    PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-      objv_tracker = _objv_tracker;
-      return *this;
-    }
-
-    PutParams& set_mtime(const ceph::real_time& _mtime) {
-      mtime = _mtime;
-      return *this;
-    }
-
-    PutParams& set_exclusive(bool _exclusive) {
-      exclusive = _exclusive;
-      return *this;
-    }
-
-    PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
-      attrs = _attrs;
-      return *this;
-    }
-  };
-
-  struct RemoveParams {
-    RGWObjVersionTracker *objv_tracker{nullptr};
-
-    RemoveParams() {}
-
-    RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
-      objv_tracker = _objv_tracker;
-      return *this;
-    }
-  };
-
-  int get_info_by_uid(const DoutPrefixProvider *dpp, 
-                      const rgw_user& uid, RGWUserInfo *info,
-                      optional_yield y, const GetParams& params = {});
-  int get_info_by_email(const DoutPrefixProvider *dpp, 
-                        const std::string& email, RGWUserInfo *info,
-                        optional_yield y, const GetParams& params = {});
-  int get_info_by_swift(const DoutPrefixProvider *dpp, 
-                        const std::string& swift_name, RGWUserInfo *info,
-                        optional_yield y, const GetParams& params = {});
-  int get_info_by_access_key(const DoutPrefixProvider *dpp, 
-                             const std::string& access_key, RGWUserInfo *info,
-                             optional_yield y, const GetParams& params = {});
-
-  int get_attrs_by_uid(const DoutPrefixProvider *dpp, 
-                       const rgw_user& user_id,
-                       std::map<std::string, bufferlist> *attrs,
-                       optional_yield y,
-                       RGWObjVersionTracker *objv_tracker = nullptr);
-
-  int store_info(const DoutPrefixProvider *dpp, 
-                 const RGWUserInfo& info, optional_yield y,
-                 const PutParams& params = {});
-  int remove_info(const DoutPrefixProvider *dpp, 
-                  const RGWUserInfo& info, optional_yield y,
-                  const RemoveParams& params = {});
-
-  int list_buckets(const DoutPrefixProvider *dpp, 
-                   const rgw_user& user,
-                   const std::string& marker,
-                   const std::string& end_marker,
-                   uint64_t max,
-                   bool need_stats,
-                   RGWUserBuckets *buckets,
-                   bool *is_truncated,
-                  optional_yield y,
-                   uint64_t default_max = 1000);
-
-  int read_stats(const DoutPrefixProvider *dpp, 
-                 const rgw_user& user, RGWStorageStats *stats,
-                optional_yield y,
-                ceph::real_time *last_stats_sync = nullptr,     /* last time a full stats sync completed */
-                ceph::real_time *last_stats_update = nullptr);   /* last time a stats update was done */
-};
-
-class RGWUserMetaHandlerAllocator {
-public:
-  static RGWMetadataHandler *alloc(RGWSI_User *user_svc);
-};
-
-
-#endif
diff --git a/src/rgw/store/rados/rgw_zone.cc b/src/rgw/store/rados/rgw_zone.cc
deleted file mode 100644 (file)
index d9e7505..0000000
+++ /dev/null
@@ -1,1287 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#include "rgw_zone.h"
-#include "rgw_realm_watcher.h"
-#include "rgw_sal_config.h"
-#include "rgw_sync.h"
-
-#include "services/svc_zone.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_rgw
-
-using namespace std;
-using namespace rgw_zone_defaults;
-
-RGWMetaSyncStatusManager::~RGWMetaSyncStatusManager(){}
-
-#define FIRST_EPOCH 1
-
-struct RGWAccessKey;
-
-/// Generate a random uuid for realm/period/zonegroup/zone ids
-static std::string gen_random_uuid()
-{
-  uuid_d uuid;
-  uuid.generate_random();
-  return uuid.to_string();
-}
-
-void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
-  encode_json("default_zonegroup", default_zonegroup, f);
-}
-
-void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
-
-  JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
-  /* backward compatability with region */
-  if (default_zonegroup.empty()) {
-    JSONDecoder::decode_json("default_region", default_zonegroup, obj);
-  }
-}
-
-int RGWZoneGroup::create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format)
-{
-  name = default_zonegroup_name;
-  api_name = default_zonegroup_name;
-  is_master = true;
-
-  RGWZoneGroupPlacementTarget placement_target;
-  placement_target.name = "default-placement";
-  placement_targets[placement_target.name] = placement_target;
-  default_placement.name = "default-placement";
-
-  RGWZoneParams zone_params(default_zone_name);
-
-  int r = zone_params.init(dpp, cct, sysobj_svc, y, false);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  r = zone_params.create_default(dpp, y);
-  if (r < 0 && r != -EEXIST) {
-    ldpp_dout(dpp, 0) << "create_default: error in create_default  zone params: " << cpp_strerror(-r) << dendl;
-    return r;
-  } else if (r == -EEXIST) {
-    ldpp_dout(dpp, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
-    zone_params.clear_id();
-    r = zone_params.init(dpp, cct, sysobj_svc, y);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
-      return r;
-    }
-    ldpp_dout(dpp, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
-                  << dendl;
-  }
-  
-  RGWZone& default_zone = zones[zone_params.get_id()];
-  default_zone.name = zone_params.get_name();
-  default_zone.id = zone_params.get_id();
-  master_zone = default_zone.id;
-
-  // enable all supported features
-  enabled_features.insert(rgw::zone_features::supported.begin(),
-                          rgw::zone_features::supported.end());
-  default_zone.supported_features = enabled_features;
-  
-  r = create(dpp, y);
-  if (r < 0 && r != -EEXIST) {
-    ldpp_dout(dpp, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
-    return r;
-  }
-
-  if (r == -EEXIST) {
-    ldpp_dout(dpp, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
-    id.clear();
-    r = init(dpp, cct, sysobj_svc, y);
-    if (r < 0) {
-      return r;
-    }
-  }
-
-  if (old_format) {
-    name = id;
-  }
-
-  post_process_params(dpp, y);
-
-  return 0;
-}
-
-int RGWZoneGroup::equals(const string& other_zonegroup) const
-{
-  if (is_master && other_zonegroup.empty())
-    return true;
-
-  return (id  == other_zonegroup);
-}
-
-int RGWZoneGroup::add_zone(const DoutPrefixProvider *dpp, 
-                           const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
-                           const list<string>& endpoints, const string *ptier_type,
-                           bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm,
-                           string *predirect_zone, std::optional<int> bucket_index_max_shards,
-                           RGWSyncModulesManager *sync_mgr,
-                           const rgw::zone_features::set& enable_features,
-                           const rgw::zone_features::set& disable_features,
-                          optional_yield y)
-{
-  auto& zone_id = zone_params.get_id();
-  auto& zone_name = zone_params.get_name();
-
-  // check for duplicate zone name on insert
-  if (!zones.count(zone_id)) {
-    for (const auto& zone : zones) {
-      if (zone.second.name == zone_name) {
-        ldpp_dout(dpp, 0) << "ERROR: found existing zone name " << zone_name
-            << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
-        return -EEXIST;
-      }
-    }
-  }
-
-  if (is_master) {
-    if (*is_master) {
-      if (!master_zone.empty() && master_zone != zone_id) {
-        ldpp_dout(dpp, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
-      }
-      master_zone = zone_id;
-    } else if (master_zone == zone_id) {
-      master_zone.clear();
-    }
-  }
-
-  RGWZone& zone = zones[zone_id];
-  zone.name = zone_name;
-  zone.id = zone_id;
-  if (!endpoints.empty()) {
-    zone.endpoints = endpoints;
-  }
-  if (read_only) {
-    zone.read_only = *read_only;
-  }
-  if (ptier_type) {
-    zone.tier_type = *ptier_type;
-    if (!sync_mgr->get_module(*ptier_type, nullptr)) {
-      ldpp_dout(dpp, 0) << "ERROR: could not found sync module: " << *ptier_type 
-                    << ",  valid sync modules: " 
-                    << sync_mgr->get_registered_module_names()
-                    << dendl;
-      return -ENOENT;
-    }
-  }
-
-  if (psync_from_all) {
-    zone.sync_from_all = *psync_from_all;
-  }
-
-  if (predirect_zone) {
-    zone.redirect_zone = *predirect_zone;
-  }
-
-  if (bucket_index_max_shards) {
-    zone.bucket_index_max_shards = *bucket_index_max_shards;
-  }
-
-  for (auto add : sync_from) {
-    zone.sync_from.insert(add);
-  }
-
-  for (auto rm : sync_from_rm) {
-    zone.sync_from.erase(rm);
-  }
-
-  zone.supported_features.insert(enable_features.begin(),
-                                 enable_features.end());
-
-  for (const auto& feature : disable_features) {
-    if (enabled_features.contains(feature)) {
-      lderr(cct) << "ERROR: Cannot disable zone feature \"" << feature
-          << "\" until it's been disabled in zonegroup " << name << dendl;
-      return -EINVAL;
-    }
-    auto i = zone.supported_features.find(feature);
-    if (i == zone.supported_features.end()) {
-      ldout(cct, 1) << "WARNING: zone feature \"" << feature
-          << "\" was not enabled in zone " << zone.name << dendl;
-      continue;
-    }
-    zone.supported_features.erase(i);
-  }
-
-  post_process_params(dpp, y);
-
-  return update(dpp,y);
-}
-
-
-int RGWZoneGroup::rename_zone(const DoutPrefixProvider *dpp, 
-                              const RGWZoneParams& zone_params,
-                             optional_yield y)
-{
-  RGWZone& zone = zones[zone_params.get_id()];
-  zone.name = zone_params.get_name();
-
-  return update(dpp, y);
-}
-
-void RGWZoneGroup::post_process_params(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  bool log_data = zones.size() > 1;
-
-  if (master_zone.empty()) {
-    auto iter = zones.begin();
-    if (iter != zones.end()) {
-      master_zone = iter->first;
-    }
-  }
-  
-  for (auto& item : zones) {
-    RGWZone& zone = item.second;
-    zone.log_data = log_data;
-
-    RGWZoneParams zone_params(zone.id, zone.name);
-    int ret = zone_params.init(dpp, cct, sysobj_svc, y);
-    if (ret < 0) {
-      ldpp_dout(dpp, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
-      continue;
-    }
-
-    for (auto& pitem : zone_params.placement_pools) {
-      const string& placement_name = pitem.first;
-      if (placement_targets.find(placement_name) == placement_targets.end()) {
-        RGWZoneGroupPlacementTarget placement_target;
-        placement_target.name = placement_name;
-        placement_targets[placement_name] = placement_target;
-      }
-    }
-  }
-
-  if (default_placement.empty() && !placement_targets.empty()) {
-    default_placement.init(placement_targets.begin()->first, RGW_STORAGE_CLASS_STANDARD);
-  }
-}
-
-int RGWZoneGroup::remove_zone(const DoutPrefixProvider *dpp, const std::string& zone_id, optional_yield y)
-{
-  auto iter = zones.find(zone_id);
-  if (iter == zones.end()) {
-    ldpp_dout(dpp, 0) << "zone id " << zone_id << " is not a part of zonegroup "
-        << name << dendl;
-    return -ENOENT;
-  }
-
-  zones.erase(iter);
-
-  post_process_params(dpp, y);
-
-  return update(dpp, y);
-}
-
-void RGWDefaultSystemMetaObjInfo::dump(Formatter *f) const {
-  encode_json("default_id", default_id, f);
-}
-
-void RGWDefaultSystemMetaObjInfo::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("default_id", default_id, obj);
-}
-
-int RGWSystemMetaObj::rename(const DoutPrefixProvider *dpp, const string& new_name, optional_yield y)
-{
-  string new_id;
-  int ret = read_id(dpp, new_name, new_id, y);
-  if (!ret) {
-    return -EEXIST;
-  }
-  if (ret < 0 && ret != -ENOENT) {
-    ldpp_dout(dpp, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-  string old_name = name;
-  name = new_name;
-  ret = update(dpp, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-  ret = store_name(dpp, true, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-  /* delete old name */
-  rgw_pool pool(get_pool(cct));
-  string oid = get_names_oid_prefix() + old_name;
-  rgw_raw_obj old_name_obj(pool, oid);
-  auto sysobj = sysobj_svc->get_obj(old_name_obj);
-  ret = sysobj.wop().remove(dpp, y);
-  if (ret < 0) {
-    ldpp_dout(dpp, 0) << "Error delete old obj name  " << old_name << ": " << cpp_strerror(-ret) << dendl;
-    return ret;
-  }
-
-  return ret;
-}
-
-int RGWSystemMetaObj::read(const DoutPrefixProvider *dpp, optional_yield y)
-{
-  int ret = read_id(dpp, name, id, y);
-  if (ret < 0) {
-    return ret;
-  }
-
-  return read_info(dpp, id, y);
-}
-
-int RGWZoneParams::create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format)
-{
-  name = default_zone_name;
-
-  int r = create(dpp, y);
-  if (r < 0) {
-    return r;
-  }
-
-  if (old_format) {
-    name = id;
-  }
-
-  return r;
-}
-
-const string& RGWZoneParams::get_compression_type(const rgw_placement_rule& placement_rule) const
-{
-  static const std::string NONE{"none"};
-  auto p = placement_pools.find(placement_rule.name);
-  if (p == placement_pools.end()) {
-    return NONE;
-  }
-  const auto& type = p->second.get_compression_type(placement_rule.get_storage_class());
-  return !type.empty() ? type : NONE;
-}
-
-// run an MD5 hash on the zone_id and return the first 32 bits
-static uint32_t gen_short_zone_id(const std::string zone_id)
-{
-  unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
-  MD5 hash;
-  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
-  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
-  hash.Update((const unsigned char *)zone_id.c_str(), zone_id.size());
-  hash.Final(md5);
-
-  uint32_t short_id;
-  memcpy((char *)&short_id, md5, sizeof(short_id));
-  return std::max(short_id, 1u);
-}
-
-int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
-{
-  if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
-    ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
-    ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and  " << zonegroup.get_id() <<dendl;
-    return -EINVAL;
-  }
-  map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
-  if (iter != zonegroups.end()) {
-    RGWZoneGroup& old_zonegroup = iter->second;
-    if (!old_zonegroup.api_name.empty()) {
-      zonegroups_by_api.erase(old_zonegroup.api_name);
-    }
-  }
-  zonegroups[zonegroup.get_id()] = zonegroup;
-
-  if (!zonegroup.api_name.empty()) {
-    zonegroups_by_api[zonegroup.api_name] = zonegroup;
-  }
-
-  if (zonegroup.is_master_zonegroup()) {
-    master_zonegroup = zonegroup.get_id();
-  } else if (master_zonegroup == zonegroup.get_id()) {
-    master_zonegroup = "";
-  }
-
-  for (auto& i : zonegroup.zones) {
-    auto& zone = i.second;
-    if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
-      continue;
-    }
-    // calculate the zone's short id
-    uint32_t short_id = gen_short_zone_id(zone.id);
-
-    // search for an existing zone with the same short id
-    for (auto& s : short_zone_ids) {
-      if (s.second == short_id) {
-        ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
-            << ") generates the same short_zone_id " << short_id
-            << " as existing zone id " << s.first << dendl;
-        return -EEXIST;
-      }
-    }
-
-    short_zone_ids[zone.id] = short_id;
-  }
-
-  return 0;
-}
-
-uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
-{
-  auto i = short_zone_ids.find(zone_id);
-  if (i == short_zone_ids.end()) {
-    return 0;
-  }
-  return i->second;
-}
-
-bool RGWPeriodMap::find_zone_by_name(const string& zone_name,
-                                     RGWZoneGroup *zonegroup,
-                                     RGWZone *zone) const
-{
-  for (auto& iter : zonegroups) {
-    auto& zg = iter.second;
-    for (auto& ziter : zg.zones) {
-      auto& z = ziter.second;
-
-      if (z.name == zone_name) {
-        *zonegroup = zg;
-        *zone = z;
-        return true;
-      }
-    }
-  }
-
-  return false;
-}
-
-namespace rgw {
-
-int read_realm(const DoutPrefixProvider* dpp, optional_yield y,
-               sal::ConfigStore* cfgstore,
-               std::string_view realm_id,
-               std::string_view realm_name,
-               RGWRealm& info,
-               std::unique_ptr<sal::RealmWriter>* writer)
-{
-  if (!realm_id.empty()) {
-    return cfgstore->read_realm_by_id(dpp, y, realm_id, info, writer);
-  }
-  if (!realm_name.empty()) {
-    return cfgstore->read_realm_by_name(dpp, y, realm_name, info, writer);
-  }
-  return cfgstore->read_default_realm(dpp, y, info, writer);
-}
-
-int create_realm(const DoutPrefixProvider* dpp, optional_yield y,
-                 sal::ConfigStore* cfgstore, bool exclusive,
-                 RGWRealm& info,
-                 std::unique_ptr<sal::RealmWriter>* writer_out)
-{
-  if (info.name.empty()) {
-    ldpp_dout(dpp, -1) << __func__ << " requires a realm name" << dendl;
-    return -EINVAL;
-  }
-  if (info.id.empty()) {
-    info.id = gen_random_uuid();
-  }
-
-  // if the realm already has a current_period, just make sure it exists
-  std::optional<RGWPeriod> period;
-  if (!info.current_period.empty()) {
-    period.emplace();
-    int r = cfgstore->read_period(dpp, y, info.current_period,
-                                  std::nullopt, *period);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __func__ << " failed to read realm's current_period="
-          << info.current_period << " with " << cpp_strerror(r) << dendl;
-      return r;
-    }
-  }
-
-  // create the realm
-  std::unique_ptr<sal::RealmWriter> writer;
-  int r = cfgstore->create_realm(dpp, y, exclusive, info, &writer);
-  if (r < 0) {
-    return r;
-  }
-
-  if (!period) {
-    // initialize and exclusive-create the initial period
-    period.emplace();
-    period->id = gen_random_uuid();
-    period->period_map.id = period->id;
-    period->epoch = FIRST_EPOCH;
-    period->realm_id = info.id;
-    period->realm_name = info.name;
-
-    r = cfgstore->create_period(dpp, y, true, *period);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __func__ << " failed to create the initial period id="
-          << period->id << " for realm " << info.name
-          << " with " << cpp_strerror(r) << dendl;
-      return r;
-    }
-  }
-
-  // update the realm's current_period
-  r = realm_set_current_period(dpp, y, cfgstore, *writer, info, *period);
-  if (r < 0) {
-    return r;
-  }
-
-  // try to set as default. may race with another create, so pass exclusive=true
-  // so we don't override an existing default
-  r = set_default_realm(dpp, y, cfgstore, info, true);
-  if (r < 0 && r != -EEXIST) {
-    ldpp_dout(dpp, 0) << "WARNING: failed to set realm as default: "
-        << cpp_strerror(r) << dendl;
-  }
-
-  if (writer_out) {
-    *writer_out = std::move(writer);
-  }
-  return 0;
-}
-
-int set_default_realm(const DoutPrefixProvider* dpp, optional_yield y,
-                      sal::ConfigStore* cfgstore, const RGWRealm& info,
-                      bool exclusive)
-{
-  return cfgstore->write_default_realm_id(dpp, y, exclusive, info.id);
-}
-
-int realm_set_current_period(const DoutPrefixProvider* dpp, optional_yield y,
-                             sal::ConfigStore* cfgstore,
-                             sal::RealmWriter& writer, RGWRealm& realm,
-                             const RGWPeriod& period)
-{
-  // update realm epoch to match the period's
-  if (realm.epoch > period.realm_epoch) {
-    ldpp_dout(dpp, -1) << __func__ << " with old realm epoch "
-        << period.realm_epoch << ", current epoch=" << realm.epoch << dendl;
-    return -EINVAL;
-  }
-  if (realm.epoch == period.realm_epoch && realm.current_period != period.id) {
-    ldpp_dout(dpp, -1) << __func__ << " with same realm epoch "
-        << period.realm_epoch << ", but different period id "
-        << period.id << " != " << realm.current_period << dendl;
-    return -EINVAL;
-  }
-
-  realm.epoch = period.realm_epoch;
-  realm.current_period = period.id;
-
-  // update the realm object
-  int r = writer.write(dpp, y, realm);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __func__ << " failed to overwrite realm "
-        << realm.name << " with " << cpp_strerror(r) << dendl;
-    return r;
-  }
-
-  // reflect the zonegroup and period config
-  (void) reflect_period(dpp, y, cfgstore, period);
-  return 0;
-}
-
-int reflect_period(const DoutPrefixProvider* dpp, optional_yield y,
-                   sal::ConfigStore* cfgstore, const RGWPeriod& info)
-{
-  // overwrite the local period config and zonegroup objects
-  constexpr bool exclusive = false;
-
-  int r = cfgstore->write_period_config(dpp, y, exclusive, info.realm_id,
-                                        info.period_config);
-  if (r < 0) {
-    ldpp_dout(dpp, -1) << __func__ << " failed to store period config for realm id="
-        << info.realm_id << " with " << cpp_strerror(r) << dendl;
-    return r;
-  }
-
-  for (auto& [zonegroup_id, zonegroup] : info.period_map.zonegroups) {
-    r = cfgstore->create_zonegroup(dpp, y, exclusive, zonegroup, nullptr);
-    if (r < 0) {
-      ldpp_dout(dpp, -1) << __func__ << " failed to store zonegroup id="
-          << zonegroup_id << " with " << cpp_strerror(r) << dendl;
-      return r;
-    }
-    if (zonegroup.is_master) {
-      // set master as default if no default exists
-      constexpr bool exclusive = true;
-      r = set_default_zonegroup(dpp, y, cfgstore, zonegroup, exclusive);
-      if (r == 0) {
-        ldpp_dout(dpp, 1) << "Set the period's master zonegroup "
-            << zonegroup.name << " as the default" << dendl;
-      }
-    }
-  }
-  return 0;
-}
-
-std::string get_staging_period_id(std::string_view realm_id)
-{
-  return string_cat_reserve(realm_id, ":staging");
-}
-
-void fork_period(const DoutPrefixProvider* dpp, RGWPeriod& info)
-{
-  ldpp_dout(dpp, 20) << __func__ << " realm id=" << info.realm_id
-      << " period id=" << info.id << dendl;
-
-  info.predecessor_uuid = std::move(info.id);
-  info.id = get_staging_period_id(info.realm_id);
-  info.period_map.reset();
-  info.realm_epoch++;
-}
-
-int update_period(const DoutPrefixProvider* dpp, optional_yield y,
-                  sal::ConfigStore* cfgstore, RGWPeriod& info)
-{
-  // clear zone short ids of removed zones. period_map.update() will add the
-  // remaining zones back
-  info.period_map.short_zone_ids.clear();
-
-  // list all zonegroups in the realm
-  rgw::sal::ListResult<std::string> listing;
-  std::array<std::string, 1000> zonegroup_names; // list in pages of 1000
-  do {
-    int ret = cfgstore->list_zonegroup_names(dpp, y, listing.next,
-                                             zonegroup_names, listing);
-    if (ret < 0) {
-      std::cerr << "failed to list zonegroups: " << cpp_strerror(-ret) << std::endl;
-      return -ret;
-    }
-    for (const auto& name : listing.entries) {
-      RGWZoneGroup zg;
-      ret = cfgstore->read_zonegroup_by_name(dpp, y, name, zg, nullptr);
-      if (ret < 0) {
-        ldpp_dout(dpp, 0) << "WARNING: failed to read zonegroup "
-            << name << ": " << cpp_strerror(-ret) << dendl;
-        continue;
-      }
-
-      if (zg.realm_id != info.realm_id) {
-        ldpp_dout(dpp, 20) << "skipping zonegroup " << zg.get_name()
-            << " with realm id " << zg.realm_id
-            << ", not on our realm " << info.realm_id << dendl;
-        continue;
-      }
-
-      if (zg.master_zone.empty()) {
-        ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
-        return -EINVAL;
-      }
-
-      if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
-        ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name()
-                     << " has a non existent master zone "<< dendl;
-        return -EINVAL;
-      }
-
-      if (zg.is_master_zonegroup()) {
-        info.master_zonegroup = zg.get_id();
-        info.master_zone = zg.master_zone;
-      }
-
-      ret = info.period_map.update(zg, dpp->get_cct());
-      if (ret < 0) {
-        return ret;
-      }
-    } // foreach name in listing.entries
-  } while (!listing.next.empty());
-
-  // read the realm's current period config
-  int ret = cfgstore->read_period_config(dpp, y, info.realm_id,
-                                         info.period_config);
-  if (ret < 0 && ret != -ENOENT) {
-    ldpp_dout(dpp, 0) << "ERROR: failed to read period config: "
-        << cpp_strerror(ret) << dendl;
-    return ret;
-  }
-
-  return 0;
-}
-
-int commit_period(const DoutPrefixProvider* dpp, optional_yield y,
-                  sal::ConfigStore* cfgstore, sal::Driver* driver,
-                  RGWRealm& realm, sal::RealmWriter& realm_writer,
-                  const RGWPeriod& current_period,
-                  RGWPeriod& info, std::ostream& error_stream,
-                  bool force_if_stale)
-{
-  auto zone_svc = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone; // XXX
-
-  ldpp_dout(dpp, 20) << __func__ << " realm " << realm.id
-      << " period " << current_period.id << dendl;
-  // gateway must be in the master zone to commit
-  if (info.master_zone != zone_svc->get_zone_params().id) {
-    error_stream << "Cannot commit period on zone "
-        << zone_svc->get_zone_params().id << ", it must be sent to "
-        "the period's master zone " << info.master_zone << '.' << std::endl;
-    return -EINVAL;
-  }
-  // period predecessor must match current period
-  if (info.predecessor_uuid != current_period.id) {
-    error_stream << "Period predecessor " << info.predecessor_uuid
-        << " does not match current period " << current_period.id
-        << ". Use 'period pull' to get the latest period from the master, "
-        "reapply your changes, and try again." << std::endl;
-    return -EINVAL;
-  }
-  // realm epoch must be 1 greater than current period
-  if (info.realm_epoch != current_period.realm_epoch + 1) {
-    error_stream << "Period's realm epoch " << info.realm_epoch
-        << " does not come directly after current realm epoch "
-        << current_period.realm_epoch << ". Use 'realm pull' to get the "
-        "latest realm and period from the master zone, reapply your changes, "
-        "and try again." << std::endl;
-    return -EINVAL;
-  }
-  // did the master zone change?
-  if (info.master_zone != current_period.master_zone) {
-    // store the current metadata sync status in the period
-    int r = info.update_sync_status(dpp, driver, current_period,
-                                    error_stream, force_if_stale);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "failed to update metadata sync status: "
-          << cpp_strerror(-r) << dendl;
-      return r;
-    }
-    // create an object with a new period id
-    info.period_map.id = info.id = gen_random_uuid();
-    info.epoch = FIRST_EPOCH;
-
-    constexpr bool exclusive = true;
-    r = cfgstore->create_period(dpp, y, exclusive, info);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
-      return r;
-    }
-    // set as current period
-    r = realm_set_current_period(dpp, y, cfgstore, realm_writer, realm, info);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "failed to update realm's current period: "
-          << cpp_strerror(-r) << dendl;
-      return r;
-    }
-    ldpp_dout(dpp, 4) << "Promoted to master zone and committed new period "
-        << info.id << dendl;
-    (void) cfgstore->realm_notify_new_period(dpp, y, info);
-    return 0;
-  }
-  // period must be based on current epoch
-  if (info.epoch != current_period.epoch) {
-    error_stream << "Period epoch " << info.epoch << " does not match "
-        "predecessor epoch " << current_period.epoch << ". Use "
-        "'period pull' to get the latest epoch from the master zone, "
-        "reapply your changes, and try again." << std::endl;
-    return -EINVAL;
-  }
-  // set period as next epoch
-  info.id = current_period.id;
-  info.epoch = current_period.epoch + 1;
-  info.predecessor_uuid = current_period.predecessor_uuid;
-  info.realm_epoch = current_period.realm_epoch;
-  // write the period
-  constexpr bool exclusive = true;
-  int r = cfgstore->create_period(dpp, y, exclusive, info);
-  if (r == -EEXIST) {
-    // already have this epoch (or a more recent one)
-    return 0;
-  }
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "failed to store period: " << cpp_strerror(r) << dendl;
-    return r;
-  }
-  r = reflect_period(dpp, y, cfgstore, info);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "failed to update local objects: " << cpp_strerror(r) << dendl;
-    return r;
-  }
-  ldpp_dout(dpp, 4) << "Committed new epoch " << info.epoch
-      << " for period " << info.id << dendl;
-  (void) cfgstore->realm_notify_new_period(dpp, y, info);
-  return 0;
-}
-
-
-int read_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
-                   sal::ConfigStore* cfgstore,
-                   std::string_view zonegroup_id,
-                   std::string_view zonegroup_name,
-                   RGWZoneGroup& info,
-                   std::unique_ptr<sal::ZoneGroupWriter>* writer)
-{
-  if (!zonegroup_id.empty()) {
-    return cfgstore->read_zonegroup_by_id(dpp, y, zonegroup_id, info, writer);
-  }
-  if (!zonegroup_name.empty()) {
-    return cfgstore->read_zonegroup_by_name(dpp, y, zonegroup_name, info, writer);
-  }
-
-  std::string realm_id;
-  int r = cfgstore->read_default_realm_id(dpp, y, realm_id);
-  if (r == -ENOENT) {
-    return cfgstore->read_zonegroup_by_name(dpp, y, default_zonegroup_name,
-                                            info, writer);
-  }
-  if (r < 0) {
-    return r;
-  }
-  return cfgstore->read_default_zonegroup(dpp, y, realm_id, info, writer);
-}
-
-int create_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
-                     sal::ConfigStore* cfgstore, bool exclusive,
-                     RGWZoneGroup& info)
-{
-  if (info.name.empty()) {
-    ldpp_dout(dpp, -1) << __func__ << " requires a zonegroup name" << dendl;
-    return -EINVAL;
-  }
-  if (info.id.empty()) {
-    info.id = gen_random_uuid();
-  }
-
-  // insert the default placement target if it doesn't exist
-  constexpr std::string_view default_placement_name = "default-placement";
-
-  RGWZoneGroupPlacementTarget placement_target;
-  placement_target.name = default_placement_name;
-
-  info.placement_targets.emplace(default_placement_name, placement_target);
-  if (info.default_placement.name.empty()) {
-    info.default_placement.name = default_placement_name;
-  }
-
-  int r = cfgstore->create_zonegroup(dpp, y, exclusive, info, nullptr);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "failed to create zonegroup with "
-        << cpp_strerror(r) << dendl;
-    return r;
-  }
-
-  // try to set as default. may race with another create, so pass exclusive=true
-  // so we don't override an existing default
-  r = set_default_zonegroup(dpp, y, cfgstore, info, true);
-  if (r < 0 && r != -EEXIST) {
-    ldpp_dout(dpp, 0) << "WARNING: failed to set zonegroup as default: "
-        << cpp_strerror(r) << dendl;
-  }
-
-  return 0;
-}
-
-int set_default_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
-                          sal::ConfigStore* cfgstore, const RGWZoneGroup& info,
-                          bool exclusive)
-{
-  return cfgstore->write_default_zonegroup_id(
-      dpp, y, exclusive, info.realm_id, info.id);
-}
-
-int remove_zone_from_group(const DoutPrefixProvider* dpp,
-                           RGWZoneGroup& zonegroup,
-                           const rgw_zone_id& zone_id)
-{
-  auto z = zonegroup.zones.find(zone_id);
-  if (z == zonegroup.zones.end()) {
-    return -ENOENT;
-  }
-  zonegroup.zones.erase(z);
-
-  if (zonegroup.master_zone == zone_id) {
-    // choose a new master zone
-    auto m = zonegroup.zones.begin();
-    if (m != zonegroup.zones.end()) {
-      zonegroup.master_zone = m->first;
-      ldpp_dout(dpp, 0) << "NOTICE: promoted " << m->second.name
-         << " as new master_zone of zonegroup " << zonegroup.name << dendl;
-    } else {
-      zonegroup.master_zone.clear();
-      ldpp_dout(dpp, 0) << "NOTICE: cleared master_zone of zonegroup "
-          << zonegroup.name << dendl;
-    }
-  }
-
-  const bool log_data = zonegroup.zones.size() > 1;
-  for (auto& [id, zone] : zonegroup.zones) {
-    zone.log_data = log_data;
-  }
-
-  return 0;
-}
-
-// try to remove the given zone id from every zonegroup in the cluster
-static int remove_zone_from_groups(const DoutPrefixProvider* dpp,
-                                   optional_yield y,
-                                   sal::ConfigStore* cfgstore,
-                                   const rgw_zone_id& zone_id)
-{
-  std::array<std::string, 128> zonegroup_names;
-  sal::ListResult<std::string> listing;
-  do {
-    int r = cfgstore->list_zonegroup_names(dpp, y, listing.next,
-                                           zonegroup_names, listing);
-    if (r < 0) {
-      ldpp_dout(dpp, 0) << "failed to list zonegroups with "
-          << cpp_strerror(r) << dendl;
-      return r;
-    }
-
-    for (const auto& name : listing.entries) {
-      RGWZoneGroup zonegroup;
-      std::unique_ptr<sal::ZoneGroupWriter> writer;
-      r = cfgstore->read_zonegroup_by_name(dpp, y, name, zonegroup, &writer);
-      if (r < 0) {
-        ldpp_dout(dpp, 0) << "WARNING: failed to load zonegroup " << name
-            << " with " << cpp_strerror(r) << dendl;
-        continue;
-      }
-
-      r = remove_zone_from_group(dpp, zonegroup, zone_id);
-      if (r < 0) {
-        continue;
-      }
-
-      // write the updated zonegroup
-      r = writer->write(dpp, y, zonegroup);
-      if (r < 0) {
-        ldpp_dout(dpp, 0) << "WARNING: failed to write zonegroup " << name
-            << " with " << cpp_strerror(r) << dendl;
-        continue;
-      }
-      ldpp_dout(dpp, 0) << "Removed zone from zonegroup " << name << dendl;
-    }
-  } while (!listing.next.empty());
-
-  return 0;
-}
-
-
-int read_zone(const DoutPrefixProvider* dpp, optional_yield y,
-              sal::ConfigStore* cfgstore,
-              std::string_view zone_id,
-              std::string_view zone_name,
-              RGWZoneParams& info,
-              std::unique_ptr<sal::ZoneWriter>* writer)
-{
-  if (!zone_id.empty()) {
-    return cfgstore->read_zone_by_id(dpp, y, zone_id, info, writer);
-  }
-  if (!zone_name.empty()) {
-    return cfgstore->read_zone_by_name(dpp, y, zone_name, info, writer);
-  }
-
-  std::string realm_id;
-  int r = cfgstore->read_default_realm_id(dpp, y, realm_id);
-  if (r == -ENOENT) {
-    return cfgstore->read_zone_by_name(dpp, y, default_zone_name, info, writer);
-  }
-  if (r < 0) {
-    return r;
-  }
-  return cfgstore->read_default_zone(dpp, y, realm_id, info, writer);
-}
-
-extern int get_zones_pool_set(const DoutPrefixProvider *dpp, optional_yield y,
-                              rgw::sal::ConfigStore* cfgstore,
-                              std::string_view my_zone_id,
-                              std::set<rgw_pool>& pools);
-
-int create_zone(const DoutPrefixProvider* dpp, optional_yield y,
-                sal::ConfigStore* cfgstore, bool exclusive,
-                RGWZoneParams& info, std::unique_ptr<sal::ZoneWriter>* writer)
-{
-  if (info.name.empty()) {
-    ldpp_dout(dpp, -1) << __func__ << " requires a zone name" << dendl;
-    return -EINVAL;
-  }
-  if (info.id.empty()) {
-    info.id = gen_random_uuid();
-  }
-
-  // add default placement with empty pool name
-  rgw_pool pool;
-  auto& placement = info.placement_pools["default-placement"];
-  placement.storage_classes.set_storage_class(
-      RGW_STORAGE_CLASS_STANDARD, &pool, nullptr);
-
-  // build a set of all pool names used by other zones
-  std::set<rgw_pool> pools;
-  int r = get_zones_pool_set(dpp, y, cfgstore, info.id, pools);
-  if (r < 0) {
-    return r;
-  }
-
-  // initialize pool names with the zone name prefix
-  r = init_zone_pool_names(dpp, y, pools, info);
-  if (r < 0) {
-    return r;
-  }
-
-  r = cfgstore->create_zone(dpp, y, exclusive, info, nullptr);
-  if (r < 0) {
-    ldpp_dout(dpp, 0) << "failed to create zone with "
-        << cpp_strerror(r) << dendl;
-    return r;
-  }
-
-  // try to set as default. may race with another create, so pass exclusive=true
-  // so we don't override an existing default
-  r = set_default_zone(dpp, y, cfgstore, info, true);
-  if (r < 0 && r != -EEXIST) {
-    ldpp_dout(dpp, 0) << "WARNING: failed to set zone as default: "
-        << cpp_strerror(r) << dendl;
-  }
-
-  return 0;
-
-}
-
-int set_default_zone(const DoutPrefixProvider* dpp, optional_yield y,
-                     sal::ConfigStore* cfgstore, const RGWZoneParams& info,
-                     bool exclusive)
-{
-  return cfgstore->write_default_zone_id(
-      dpp, y, exclusive, info.realm_id, info.id);
-}
-
-int delete_zone(const DoutPrefixProvider* dpp, optional_yield y,
-                sal::ConfigStore* cfgstore, const RGWZoneParams& info,
-                sal::ZoneWriter& writer)
-{
-  // remove this zone from any zonegroups that contain it
-  int r = remove_zone_from_groups(dpp, y, cfgstore, info.id);
-  if (r < 0) {
-    return r;
-  }
-
-  return writer.remove(dpp, y);
-}
-
-} // namespace rgw
-
-static inline int conf_to_uint64(const JSONFormattable& config, const string& key, uint64_t *pval)
-{
-  string sval;
-  if (config.find(key, &sval)) {
-    string err;
-    uint64_t val = strict_strtoll(sval.c_str(), 10, &err);
-    if (!err.empty()) {
-      return -EINVAL;
-    }
-    *pval = val;
-  }
-  return 0;
-}
-
-int RGWZoneGroupPlacementTier::update_params(const JSONFormattable& config)
-{
-  int r = -1;
-
-  if (config.exists("retain_head_object")) {
-    string s = config["retain_head_object"];
-    if (s == "true") {
-      retain_head_object = true;
-    } else {
-      retain_head_object = false;
-    }
-  }
-
-  if (tier_type == "cloud-s3") {
-    r = t.s3.update_params(config);
-  }
-
-  return r;
-}
-
-int RGWZoneGroupPlacementTier::clear_params(const JSONFormattable& config)
-{
-  if (config.exists("retain_head_object")) {
-    retain_head_object = false;
-  }
-
-  if (tier_type == "cloud-s3") {
-    t.s3.clear_params(config);
-  }
-
-  return 0;
-}
-
-int RGWZoneGroupPlacementTierS3::update_params(const JSONFormattable& config)
-{
-  int r = -1;
-
-  if (config.exists("endpoint")) {
-    endpoint = config["endpoint"];
-  }
-  if (config.exists("target_path")) {
-    target_path = config["target_path"];
-  }
-  if (config.exists("region")) {
-    region = config["region"];
-  }
-  if (config.exists("host_style")) {
-    string s;
-    s = config["host_style"];
-    if (s != "virtual") {
-      host_style = PathStyle;
-    } else {
-      host_style = VirtualStyle;
-    }
-  }
-  if (config.exists("target_storage_class")) {
-    target_storage_class = config["target_storage_class"];
-  }
-  if (config.exists("access_key")) {
-    key.id = config["access_key"];
-  }
-  if (config.exists("secret")) {
-    key.key = config["secret"];
-  }
-  if (config.exists("multipart_sync_threshold")) {
-    r = conf_to_uint64(config, "multipart_sync_threshold", &multipart_sync_threshold);
-    if (r < 0) {
-      multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE;
-    }
-  }
-
-  if (config.exists("multipart_min_part_size")) {
-    r = conf_to_uint64(config, "multipart_min_part_size", &multipart_min_part_size);
-    if (r < 0) {
-      multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE;
-    }
-  }
-
-  if (config.exists("acls")) {
-    const JSONFormattable& cc = config["acls"];
-    if (cc.is_array()) {
-      for (auto& c : cc.array()) {
-        RGWTierACLMapping m;
-        m.init(c);
-        if (!m.source_id.empty()) {
-          acl_mappings[m.source_id] = m;
-        }
-      }
-    } else {
-      RGWTierACLMapping m;
-      m.init(cc);
-      if (!m.source_id.empty()) {
-        acl_mappings[m.source_id] = m;
-      }
-    }
-  }
-  return 0;
-}
-
-int RGWZoneGroupPlacementTierS3::clear_params(const JSONFormattable& config)
-{
-  if (config.exists("endpoint")) {
-    endpoint.clear();
-  }
-  if (config.exists("target_path")) {
-    target_path.clear();
-  }
-  if (config.exists("region")) {
-    region.clear();
-  }
-  if (config.exists("host_style")) {
-    /* default */
-    host_style = PathStyle;
-  }
-  if (config.exists("target_storage_class")) {
-    target_storage_class.clear();
-  }
-  if (config.exists("access_key")) {
-    key.id.clear();
-  }
-  if (config.exists("secret")) {
-    key.key.clear();
-  }
-  if (config.exists("multipart_sync_threshold")) {
-    multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE;
-  }
-  if (config.exists("multipart_min_part_size")) {
-    multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE;
-  }
-  if (config.exists("acls")) {
-    const JSONFormattable& cc = config["acls"];
-    if (cc.is_array()) {
-      for (auto& c : cc.array()) {
-        RGWTierACLMapping m;
-        m.init(c);
-        acl_mappings.erase(m.source_id);
-      }
-    } else {
-      RGWTierACLMapping m;
-      m.init(cc);
-      acl_mappings.erase(m.source_id);
-    }
-  }
-  return 0;
-}
-
-void rgw_meta_sync_info::generate_test_instances(list<rgw_meta_sync_info*>& o)
-{
-  auto info = new rgw_meta_sync_info;
-  info->state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
-  info->period = "periodid";
-  info->realm_epoch = 5;
-  o.push_back(info);
-  o.push_back(new rgw_meta_sync_info);
-}
-
-void rgw_meta_sync_marker::generate_test_instances(list<rgw_meta_sync_marker*>& o)
-{
-  auto marker = new rgw_meta_sync_marker;
-  marker->state = rgw_meta_sync_marker::IncrementalSync;
-  marker->marker = "01234";
-  marker->realm_epoch = 5;
-  o.push_back(marker);
-  o.push_back(new rgw_meta_sync_marker);
-}
-
-void rgw_meta_sync_status::generate_test_instances(list<rgw_meta_sync_status*>& o)
-{
-  o.push_back(new rgw_meta_sync_status);
-}
-
-void RGWZoneParams::generate_test_instances(list<RGWZoneParams*> &o)
-{
-  o.push_back(new RGWZoneParams);
-  o.push_back(new RGWZoneParams);
-}
-
-void RGWPeriodLatestEpochInfo::generate_test_instances(list<RGWPeriodLatestEpochInfo*> &o)
-{
-  RGWPeriodLatestEpochInfo *z = new RGWPeriodLatestEpochInfo;
-  o.push_back(z);
-  o.push_back(new RGWPeriodLatestEpochInfo);
-}
-
-void RGWZoneGroup::generate_test_instances(list<RGWZoneGroup*>& o)
-{
-  RGWZoneGroup *r = new RGWZoneGroup;
-  o.push_back(r);
-  o.push_back(new RGWZoneGroup);
-}
-
-void RGWPeriodLatestEpochInfo::dump(Formatter *f) const {
-  encode_json("latest_epoch", epoch, f);
-}
-
-void RGWPeriodLatestEpochInfo::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("latest_epoch", epoch, obj);
-}
-
-void RGWNameToId::dump(Formatter *f) const {
-  encode_json("obj_id", obj_id, f);
-}
-
-void RGWNameToId::decode_json(JSONObj *obj) {
-  JSONDecoder::decode_json("obj_id", obj_id, obj);
-}
-
diff --git a/src/rgw/store/rados/rgw_zone.h b/src/rgw/store/rados/rgw_zone.h
deleted file mode 100644 (file)
index e1792a4..0000000
+++ /dev/null
@@ -1,1525 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab ft=cpp
-
-#ifndef CEPH_RGW_ZONE_H
-#define CEPH_RGW_ZONE_H
-
-#include <ostream>
-#include "rgw_common.h"
-#include "rgw_sal_fwd.h"
-#include "rgw_sync_policy.h"
-#include "rgw_zone_features.h"
-
-namespace rgw_zone_defaults {
-
-extern std::string zone_names_oid_prefix;
-extern std::string region_info_oid_prefix;
-extern std::string realm_names_oid_prefix;
-extern std::string zone_group_info_oid_prefix;
-extern std::string realm_info_oid_prefix;
-extern std::string default_region_info_oid;
-extern std::string default_zone_group_info_oid;
-extern std::string region_map_oid;
-extern std::string default_realm_info_oid;
-extern std::string default_zonegroup_name;
-extern std::string default_zone_name;
-extern std::string zonegroup_names_oid_prefix;
-extern std::string RGW_DEFAULT_ZONE_ROOT_POOL;
-extern std::string RGW_DEFAULT_ZONEGROUP_ROOT_POOL;
-extern std::string RGW_DEFAULT_REALM_ROOT_POOL;
-extern std::string RGW_DEFAULT_PERIOD_ROOT_POOL;
-extern std::string avail_pools;
-extern std::string default_storage_pool_suffix;
-
-}
-
-class JSONObj;
-class RGWSyncModulesManager;
-
-
-struct RGWNameToId {
-  std::string obj_id;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(obj_id, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(obj_id, bl);
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(RGWNameToId)
-
-struct RGWDefaultSystemMetaObjInfo {
-  std::string default_id;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(default_id, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(default_id, bl);
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(RGWDefaultSystemMetaObjInfo)
-
-class RGWSI_SysObj;
-class RGWSI_Zone;
-
-class RGWSystemMetaObj {
-public:
-  std::string id;
-  std::string name;
-
-  CephContext *cct{nullptr};
-  RGWSI_SysObj *sysobj_svc{nullptr};
-  RGWSI_Zone *zone_svc{nullptr};
-
-  int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
-  int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
-  int read_info(const DoutPrefixProvider *dpp, const std::string& obj_id, optional_yield y, bool old_format = false);
-  int read_id(const DoutPrefixProvider *dpp, const std::string& obj_name, std::string& obj_id, optional_yield y);
-  int read_default(const DoutPrefixProvider *dpp, 
-                   RGWDefaultSystemMetaObjInfo& default_info,
-                  const std::string& oid,
-                  optional_yield y);
-  /* read and use default id */
-  int use_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
-
-public:
-  RGWSystemMetaObj() {}
-  RGWSystemMetaObj(const std::string& _name): name(_name) {}
-  RGWSystemMetaObj(const std::string& _id, const std::string& _name) : id(_id), name(_name) {}
-  RGWSystemMetaObj(CephContext *_cct, RGWSI_SysObj *_sysobj_svc) {
-    reinit_instance(_cct, _sysobj_svc);
-  }
-  RGWSystemMetaObj(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): name(_name) {
-    reinit_instance(_cct, _sysobj_svc);
-  }
-
-  const std::string& get_name() const { return name; }
-  const std::string& get_id() const { return id; }
-
-  void set_name(const std::string& _name) { name = _name;}
-  void set_id(const std::string& _id) { id = _id;}
-  void clear_id() { id.clear(); }
-
-  virtual ~RGWSystemMetaObj() {}
-
-  virtual void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(id, bl);
-    encode(name, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  virtual void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(id, bl);
-    decode(name, bl);
-    DECODE_FINISH(bl);
-  }
-
-  void reinit_instance(CephContext *_cct, RGWSI_SysObj *_sysobj_svc);
-  int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc,
-          optional_yield y,
-          bool setup_obj = true, bool old_format = false);
-  virtual int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y,
-                             bool old_format = false);
-  virtual int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false);
-  int delete_default();
-  virtual int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true);
-  int delete_obj(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
-  int rename(const DoutPrefixProvider *dpp, const std::string& new_name, optional_yield y);
-  int update(const DoutPrefixProvider *dpp, optional_yield y) { return store_info(dpp, false, y);}
-  int update_name(const DoutPrefixProvider *dpp, optional_yield y) { return store_name(dpp, false, y);}
-  int read(const DoutPrefixProvider *dpp, optional_yield y);
-  int write(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
-
-  virtual rgw_pool get_pool(CephContext *cct) const = 0;
-  virtual const std::string get_default_oid(bool old_format = false) const = 0;
-  virtual const std::string& get_names_oid_prefix() const = 0;
-  virtual const std::string& get_info_oid_prefix(bool old_format = false) const = 0;
-  virtual std::string get_predefined_id(CephContext *cct) const = 0;
-  virtual const std::string& get_predefined_name(CephContext *cct) const = 0;
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(RGWSystemMetaObj)
-
-struct RGWZoneStorageClass {
-  boost::optional<rgw_pool> data_pool;
-  boost::optional<std::string> compression_type;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(data_pool, bl);
-    encode(compression_type, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(data_pool, bl);
-    decode(compression_type, bl);
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(RGWZoneStorageClass)
-
-
-class RGWZoneStorageClasses {
-  std::map<std::string, RGWZoneStorageClass> m;
-
-  /* in memory only */
-  RGWZoneStorageClass *standard_class;
-
-public:
-  RGWZoneStorageClasses() {
-    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
-  }
-  RGWZoneStorageClasses(const RGWZoneStorageClasses& rhs) {
-    m = rhs.m;
-    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
-  }
-  RGWZoneStorageClasses& operator=(const RGWZoneStorageClasses& rhs) {
-    m = rhs.m;
-    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
-    return *this;
-  }
-
-  const RGWZoneStorageClass& get_standard() const {
-    return *standard_class;
-  }
-
-  bool find(const std::string& sc, const RGWZoneStorageClass **pstorage_class) const {
-    auto iter = m.find(sc);
-    if (iter == m.end()) {
-      return false;
-    }
-    *pstorage_class = &iter->second;
-    return true;
-  }
-
-  bool exists(const std::string& sc) const {
-    if (sc.empty()) {
-      return true;
-    }
-    auto iter = m.find(sc);
-    return (iter != m.end());
-  }
-
-  const std::map<std::string, RGWZoneStorageClass>& get_all() const {
-    return m;
-  }
-
-  std::map<std::string, RGWZoneStorageClass>& get_all() {
-    return m;
-  }
-
-  void set_storage_class(const std::string& sc, const rgw_pool *data_pool, const std::string *compression_type) {
-    const std::string *psc = &sc;
-    if (sc.empty()) {
-      psc = &RGW_STORAGE_CLASS_STANDARD;
-    }
-    RGWZoneStorageClass& storage_class = m[*psc];
-    if (data_pool) {
-      storage_class.data_pool = *data_pool;
-    }
-    if (compression_type) {
-      storage_class.compression_type = *compression_type;
-    }
-  }
-
-  void remove_storage_class(const std::string& sc) {
-    if (!sc.empty()) {
-      m.erase(sc);
-    }
-  }
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(m, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(m, bl);
-    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(RGWZoneStorageClasses)
-
-struct RGWZonePlacementInfo {
-  rgw_pool index_pool;
-  rgw_pool data_extra_pool; /* if not set we should use data_pool */
-  RGWZoneStorageClasses storage_classes;
-  rgw::BucketIndexType index_type;
-
-  RGWZonePlacementInfo() : index_type(rgw::BucketIndexType::Normal) {}
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(7, 1, bl);
-    encode(index_pool.to_str(), bl);
-    rgw_pool standard_data_pool = get_data_pool(RGW_STORAGE_CLASS_STANDARD);
-    encode(standard_data_pool.to_str(), bl);
-    encode(data_extra_pool.to_str(), bl);
-    encode((uint32_t)index_type, bl);
-    std::string standard_compression_type = get_compression_type(RGW_STORAGE_CLASS_STANDARD);
-    encode(standard_compression_type, bl);
-    encode(storage_classes, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(7, bl);
-    std::string index_pool_str;
-    std::string data_pool_str;
-    decode(index_pool_str, bl);
-    index_pool = rgw_pool(index_pool_str);
-    decode(data_pool_str, bl);
-    rgw_pool standard_data_pool(data_pool_str);
-    if (struct_v >= 4) {
-      std::string data_extra_pool_str;
-      decode(data_extra_pool_str, bl);
-      data_extra_pool = rgw_pool(data_extra_pool_str);
-    }
-    if (struct_v >= 5) {
-      uint32_t it;
-      decode(it, bl);
-      index_type = (rgw::BucketIndexType)it;
-    }
-    std::string standard_compression_type;
-    if (struct_v >= 6) {
-      decode(standard_compression_type, bl);
-    }
-    if (struct_v >= 7) {
-      decode(storage_classes, bl);
-    } else {
-      storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &standard_data_pool,
-                                        (!standard_compression_type.empty() ? &standard_compression_type : nullptr));
-    }
-    DECODE_FINISH(bl);
-  }
-  const rgw_pool& get_data_extra_pool() const {
-    static rgw_pool no_pool;
-    if (data_extra_pool.empty()) {
-      return storage_classes.get_standard().data_pool.get_value_or(no_pool);
-    }
-    return data_extra_pool;
-  }
-  const rgw_pool& get_data_pool(const std::string& sc) const {
-    const RGWZoneStorageClass *storage_class;
-    static rgw_pool no_pool;
-
-    if (!storage_classes.find(sc, &storage_class)) {
-      return storage_classes.get_standard().data_pool.get_value_or(no_pool);
-    }
-
-    return storage_class->data_pool.get_value_or(no_pool);
-  }
-  const rgw_pool& get_standard_data_pool() const {
-    return get_data_pool(RGW_STORAGE_CLASS_STANDARD);
-  }
-
-  const std::string& get_compression_type(const std::string& sc) const {
-    const RGWZoneStorageClass *storage_class;
-    static std::string no_compression;
-
-    if (!storage_classes.find(sc, &storage_class)) {
-      return no_compression;
-    }
-    return storage_class->compression_type.get_value_or(no_compression);
-  }
-
-  bool storage_class_exists(const std::string& sc) const {
-    return storage_classes.exists(sc);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-
-};
-WRITE_CLASS_ENCODER(RGWZonePlacementInfo)
-
-struct RGWZoneParams : RGWSystemMetaObj {
-  rgw_pool domain_root;
-  rgw_pool control_pool;
-  rgw_pool gc_pool;
-  rgw_pool lc_pool;
-  rgw_pool log_pool;
-  rgw_pool intent_log_pool;
-  rgw_pool usage_log_pool;
-  rgw_pool user_keys_pool;
-  rgw_pool user_email_pool;
-  rgw_pool user_swift_pool;
-  rgw_pool user_uid_pool;
-  rgw_pool roles_pool;
-  rgw_pool reshard_pool;
-  rgw_pool otp_pool;
-  rgw_pool oidc_pool;
-  rgw_pool notif_pool;
-
-  RGWAccessKey system_key;
-
-  std::map<std::string, RGWZonePlacementInfo> placement_pools;
-
-  std::string realm_id;
-
-  JSONFormattable tier_config;
-
-  RGWZoneParams() : RGWSystemMetaObj() {}
-  explicit RGWZoneParams(const std::string& name) : RGWSystemMetaObj(name){}
-  RGWZoneParams(const rgw_zone_id& id, const std::string& name) : RGWSystemMetaObj(id.id, name) {}
-  RGWZoneParams(const rgw_zone_id& id, const std::string& name, const std::string& _realm_id)
-    : RGWSystemMetaObj(id.id, name), realm_id(_realm_id) {}
-  virtual ~RGWZoneParams();
-
-  rgw_pool get_pool(CephContext *cct) const override;
-  const std::string get_default_oid(bool old_format = false) const override;
-  const std::string& get_names_oid_prefix() const override;
-  const std::string& get_info_oid_prefix(bool old_format = false) const override;
-  std::string get_predefined_id(CephContext *cct) const override;
-  const std::string& get_predefined_name(CephContext *cct) const override;
-
-  int init(const DoutPrefixProvider *dpp, 
-           CephContext *_cct, RGWSI_SysObj *_sysobj_svc, optional_yield y,
-          bool setup_obj = true, bool old_format = false);
-  using RGWSystemMetaObj::init;
-  int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y, bool old_format = false) override;
-  int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false) override;
-  int create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
-  int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true) override;
-  int fix_pool_names(const DoutPrefixProvider *dpp, optional_yield y);
-
-  const std::string& get_compression_type(const rgw_placement_rule& placement_rule) const;
-  
-  void encode(bufferlist& bl) const override {
-    ENCODE_START(14, 1, bl);
-    encode(domain_root, bl);
-    encode(control_pool, bl);
-    encode(gc_pool, bl);
-    encode(log_pool, bl);
-    encode(intent_log_pool, bl);
-    encode(usage_log_pool, bl);
-    encode(user_keys_pool, bl);
-    encode(user_email_pool, bl);
-    encode(user_swift_pool, bl);
-    encode(user_uid_pool, bl);
-    RGWSystemMetaObj::encode(bl);
-    encode(system_key, bl);
-    encode(placement_pools, bl);
-    rgw_pool unused_metadata_heap;
-    encode(unused_metadata_heap, bl);
-    encode(realm_id, bl);
-    encode(lc_pool, bl);
-    std::map<std::string, std::string, ltstr_nocase> old_tier_config;
-    encode(old_tier_config, bl);
-    encode(roles_pool, bl);
-    encode(reshard_pool, bl);
-    encode(otp_pool, bl);
-    encode(tier_config, bl);
-    encode(oidc_pool, bl);
-    encode(notif_pool, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) override {
-    DECODE_START(14, bl);
-    decode(domain_root, bl);
-    decode(control_pool, bl);
-    decode(gc_pool, bl);
-    decode(log_pool, bl);
-    decode(intent_log_pool, bl);
-    decode(usage_log_pool, bl);
-    decode(user_keys_pool, bl);
-    decode(user_email_pool, bl);
-    decode(user_swift_pool, bl);
-    decode(user_uid_pool, bl);
-    if (struct_v >= 6) {
-      RGWSystemMetaObj::decode(bl);
-    } else if (struct_v >= 2) {
-      decode(name, bl);
-      id = name;
-    }
-    if (struct_v >= 3)
-      decode(system_key, bl);
-    if (struct_v >= 4)
-      decode(placement_pools, bl);
-    if (struct_v >= 5) {
-      rgw_pool unused_metadata_heap;
-      decode(unused_metadata_heap, bl);
-    }
-    if (struct_v >= 6) {
-      decode(realm_id, bl);
-    }
-    if (struct_v >= 7) {
-      decode(lc_pool, bl);
-    } else {
-      lc_pool = log_pool.name + ":lc";
-    }
-    std::map<std::string, std::string, ltstr_nocase> old_tier_config;
-    if (struct_v >= 8) {
-      decode(old_tier_config, bl);
-    }
-    if (struct_v >= 9) {
-      decode(roles_pool, bl);
-    } else {
-      roles_pool = name + ".rgw.meta:roles";
-    }
-    if (struct_v >= 10) {
-      decode(reshard_pool, bl);
-    } else {
-      reshard_pool = log_pool.name + ":reshard";
-    }
-    if (struct_v >= 11) {
-      ::decode(otp_pool, bl);
-    } else {
-      otp_pool = name + ".rgw.otp";
-    }
-    if (struct_v >= 12) {
-      ::decode(tier_config, bl);
-    } else {
-      for (auto& kv : old_tier_config) {
-        tier_config.set(kv.first, kv.second);
-      }
-    }
-    if (struct_v >= 13) {
-      ::decode(oidc_pool, bl);
-    } else {
-      oidc_pool = name + ".rgw.meta:oidc";
-    }
-    if (struct_v >= 14) {
-      decode(notif_pool, bl);
-    } else {
-      notif_pool = log_pool.name + ":notif";
-    }
-    DECODE_FINISH(bl);
-  }
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-  static void generate_test_instances(std::list<RGWZoneParams*>& o);
-
-  bool get_placement(const std::string& placement_id, RGWZonePlacementInfo *placement) const {
-    auto iter = placement_pools.find(placement_id);
-    if (iter == placement_pools.end()) {
-      return false;
-    }
-    *placement = iter->second;
-    return true;
-  }
-
-  /*
-   * return data pool of the head object
-   */
-  bool get_head_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool) const {
-    const rgw_data_placement_target& explicit_placement = obj.bucket.explicit_placement;
-    if (!explicit_placement.data_pool.empty()) {
-      if (!obj.in_extra_data) {
-        *pool = explicit_placement.data_pool;
-      } else {
-        *pool = explicit_placement.get_data_extra_pool();
-      }
-      return true;
-    }
-    if (placement_rule.empty()) {
-      return false;
-    }
-    auto iter = placement_pools.find(placement_rule.name);
-    if (iter == placement_pools.end()) {
-      return false;
-    }
-    if (!obj.in_extra_data) {
-      *pool = iter->second.get_data_pool(placement_rule.storage_class);
-    } else {
-      *pool = iter->second.get_data_extra_pool();
-    }
-    return true;
-  }
-
-  bool valid_placement(const rgw_placement_rule& rule) const {
-    auto iter = placement_pools.find(rule.name);
-    if (iter == placement_pools.end()) {
-      return false;
-    }
-    return iter->second.storage_class_exists(rule.storage_class);
-  }
-};
-WRITE_CLASS_ENCODER(RGWZoneParams)
-
-
-struct RGWZone {
-  std::string id;
-  std::string name;
-  std::list<std::string> endpoints;
-  bool log_meta;
-  bool log_data;
-  bool read_only;
-  std::string tier_type;
-
-  std::string redirect_zone;
-
-/**
- * Represents the number of shards for the bucket index object, a value of zero
- * indicates there is no sharding. By default (no sharding, the name of the object
- * is '.dir.{marker}', with sharding, the name is '.dir.{marker}.{sharding_id}',
- * sharding_id is zero-based value. It is not recommended to set a too large value
- * (e.g. thousand) as it increases the cost for bucket listing.
- */
-  uint32_t bucket_index_max_shards;
-
-  // pre-shard buckets on creation to enable some write-parallism by default,
-  // delay the need to reshard as the bucket grows, and (in multisite) get some
-  // bucket index sharding where dynamic resharding is not supported
-  static constexpr uint32_t default_bucket_index_max_shards = 11;
-
-  bool sync_from_all;
-  std::set<std::string> sync_from; /* list of zones to sync from */
-
-  rgw::zone_features::set supported_features;
-
-  RGWZone()
-    : log_meta(false), log_data(false), read_only(false),
-      bucket_index_max_shards(default_bucket_index_max_shards),
-      sync_from_all(true) {}
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(8, 1, bl);
-    encode(name, bl);
-    encode(endpoints, bl);
-    encode(log_meta, bl);
-    encode(log_data, bl);
-    encode(bucket_index_max_shards, bl);
-    encode(id, bl);
-    encode(read_only, bl);
-    encode(tier_type, bl);
-    encode(sync_from_all, bl);
-    encode(sync_from, bl);
-    encode(redirect_zone, bl);
-    encode(supported_features, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(8, bl);
-    decode(name, bl);
-    if (struct_v < 4) {
-      id = name;
-    }
-    decode(endpoints, bl);
-    if (struct_v >= 2) {
-      decode(log_meta, bl);
-      decode(log_data, bl);
-    }
-    if (struct_v >= 3) {
-      decode(bucket_index_max_shards, bl);
-    }
-    if (struct_v >= 4) {
-      decode(id, bl);
-      decode(read_only, bl);
-    }
-    if (struct_v >= 5) {
-      decode(tier_type, bl);
-    }
-    if (struct_v >= 6) {
-      decode(sync_from_all, bl);
-      decode(sync_from, bl);
-    }
-    if (struct_v >= 7) {
-      decode(redirect_zone, bl);
-    }
-    if (struct_v >= 8) {
-      decode(supported_features, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-  static void generate_test_instances(std::list<RGWZone*>& o);
-
-  bool is_read_only() const { return read_only; }
-
-  bool syncs_from(const std::string& zone_name) const {
-    return (sync_from_all || sync_from.find(zone_name) != sync_from.end());
-  }
-
-  bool supports(std::string_view feature) const {
-    return supported_features.contains(feature);
-  }
-};
-WRITE_CLASS_ENCODER(RGWZone)
-
-struct RGWDefaultZoneGroupInfo {
-  std::string default_zonegroup;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(default_zonegroup, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(default_zonegroup, bl);
-    DECODE_FINISH(bl);
-  }
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-  //todo: implement ceph-dencoder
-};
-WRITE_CLASS_ENCODER(RGWDefaultZoneGroupInfo)
-
-struct RGWTierACLMapping {
-  ACLGranteeTypeEnum type{ACL_TYPE_CANON_USER};
-  std::string source_id;
-  std::string dest_id;
-
-  RGWTierACLMapping() = default;
-
-  RGWTierACLMapping(ACLGranteeTypeEnum t,
-             const std::string& s,
-             const std::string& d) : type(t),
-  source_id(s),
-  dest_id(d) {}
-
-  void init(const JSONFormattable& config) {
-    const std::string& t = config["type"];
-
-    if (t == "email") {
-      type = ACL_TYPE_EMAIL_USER;
-    } else if (t == "uri") {
-      type = ACL_TYPE_GROUP;
-    } else {
-      type = ACL_TYPE_CANON_USER;
-    }
-
-    source_id = config["source_id"];
-    dest_id = config["dest_id"];
-  }
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode((uint32_t)type, bl);
-    encode(source_id, bl);
-    encode(dest_id, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    uint32_t it;
-    decode(it, bl);
-    type = (ACLGranteeTypeEnum)it;
-    decode(source_id, bl);
-    decode(dest_id, bl);
-    DECODE_FINISH(bl);
-  }
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(RGWTierACLMapping)
-
-struct RGWZoneGroupPlacementTierS3 {
-#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
-  std::string endpoint;
-  RGWAccessKey key;
-  std::string region;
-  HostStyle host_style{PathStyle};
-  std::string target_storage_class;
-
-  /* Should below be bucket/zone specific?? */
-  std::string target_path;
-  std::map<std::string, RGWTierACLMapping> acl_mappings;
-
-  uint64_t multipart_sync_threshold{DEFAULT_MULTIPART_SYNC_PART_SIZE};
-  uint64_t multipart_min_part_size{DEFAULT_MULTIPART_SYNC_PART_SIZE};
-
-  int update_params(const JSONFormattable& config);
-  int clear_params(const JSONFormattable& config);
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(endpoint, bl);
-    encode(key, bl);
-    encode(region, bl);
-    encode((uint32_t)host_style, bl);
-    encode(target_storage_class, bl);
-    encode(target_path, bl);
-    encode(acl_mappings, bl);
-    encode(multipart_sync_threshold, bl);
-    encode(multipart_min_part_size, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(endpoint, bl);
-    decode(key, bl);
-    decode(region, bl);
-
-    uint32_t it;
-    decode(it, bl);
-    host_style = (HostStyle)it;
-
-    decode(target_storage_class, bl);
-    decode(target_path, bl);
-    decode(acl_mappings, bl);
-    decode(multipart_sync_threshold, bl);
-    decode(multipart_min_part_size, bl);
-    DECODE_FINISH(bl);
-  }
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTierS3)
-
-struct RGWZoneGroupPlacementTier {
-  std::string tier_type;
-  std::string storage_class;
-  bool retain_head_object = false;
-
-  struct _tier {
-    RGWZoneGroupPlacementTierS3 s3;
-  } t;
-
-  int update_params(const JSONFormattable& config);
-  int clear_params(const JSONFormattable& config);
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(tier_type, bl);
-    encode(storage_class, bl);
-    encode(retain_head_object, bl);
-    if (tier_type == "cloud-s3") {
-      encode(t.s3, bl);
-    }
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(tier_type, bl);
-    decode(storage_class, bl);
-    decode(retain_head_object, bl);
-    if (tier_type == "cloud-s3") {
-      decode(t.s3, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTier)
-
-struct RGWZoneGroupPlacementTarget {
-  std::string name;
-  std::set<std::string> tags;
-  std::set<std::string> storage_classes;
-  std::map<std::string, RGWZoneGroupPlacementTier> tier_targets;
-
-  bool user_permitted(const std::list<std::string>& user_tags) const {
-    if (tags.empty()) {
-      return true;
-    }
-    for (auto& rule : user_tags) {
-      if (tags.find(rule) != tags.end()) {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(3, 1, bl);
-    encode(name, bl);
-    encode(tags, bl);
-    encode(storage_classes, bl);
-    encode(tier_targets, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(3, bl);
-    decode(name, bl);
-    decode(tags, bl);
-    if (struct_v >= 2) {
-      decode(storage_classes, bl);
-    }
-    if (storage_classes.empty()) {
-      storage_classes.insert(RGW_STORAGE_CLASS_STANDARD);
-    }
-    if (struct_v >= 3) {
-      decode(tier_targets, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-};
-WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTarget)
-
-struct RGWZoneGroup : public RGWSystemMetaObj {
-  std::string api_name;
-  std::list<std::string> endpoints;
-  bool is_master = false;
-
-  rgw_zone_id master_zone;
-  std::map<rgw_zone_id, RGWZone> zones;
-
-  std::map<std::string, RGWZoneGroupPlacementTarget> placement_targets;
-  rgw_placement_rule default_placement;
-
-  std::list<std::string> hostnames;
-  std::list<std::string> hostnames_s3website;
-  // TODO: Maybe convert hostnames to a map<std::string,std::list<std::string>> for
-  // endpoint_type->hostnames
-/*
-20:05 < _robbat21irssi> maybe I do someting like: if (hostname_map.empty()) { populate all map keys from hostnames; };
-20:05 < _robbat21irssi> but that's a later compatability migration planning bit
-20:06 < yehudasa> more like if (!hostnames.empty()) {
-20:06 < yehudasa> for (std::list<std::string>::iterator iter = hostnames.begin(); iter != hostnames.end(); ++iter) {
-20:06 < yehudasa> hostname_map["s3"].append(iter->second);
-20:07 < yehudasa> hostname_map["s3website"].append(iter->second);
-20:07 < yehudasa> s/append/push_back/g
-20:08 < _robbat21irssi> inner loop over APIs
-20:08 < yehudasa> yeah, probably
-20:08 < _robbat21irssi> s3, s3website, swift, swith_auth, swift_website
-*/
-  std::map<std::string, std::list<std::string> > api_hostname_map;
-  std::map<std::string, std::list<std::string> > api_endpoints_map;
-
-  std::string realm_id;
-
-  rgw_sync_policy_info sync_policy;
-  rgw::zone_features::set enabled_features;
-
-  RGWZoneGroup(): is_master(false){}
-  RGWZoneGroup(const std::string &id, const std::string &name):RGWSystemMetaObj(id, name) {}
-  explicit RGWZoneGroup(const std::string &_name):RGWSystemMetaObj(_name) {}
-  RGWZoneGroup(const std::string &_name, bool _is_master, CephContext *cct, RGWSI_SysObj* sysobj_svc,
-              const std::string& _realm_id, const std::list<std::string>& _endpoints)
-    : RGWSystemMetaObj(_name, cct , sysobj_svc), endpoints(_endpoints), is_master(_is_master),
-      realm_id(_realm_id) {}
-  virtual ~RGWZoneGroup();
-
-  bool is_master_zonegroup() const { return is_master;}
-  void update_master(const DoutPrefixProvider *dpp, bool _is_master, optional_yield y) {
-    is_master = _is_master;
-    post_process_params(dpp, y);
-  }
-  void post_process_params(const DoutPrefixProvider *dpp, optional_yield y);
-
-  void encode(bufferlist& bl) const override {
-    ENCODE_START(6, 1, bl);
-    encode(name, bl);
-    encode(api_name, bl);
-    encode(is_master, bl);
-    encode(endpoints, bl);
-    encode(master_zone, bl);
-    encode(zones, bl);
-    encode(placement_targets, bl);
-    encode(default_placement, bl);
-    encode(hostnames, bl);
-    encode(hostnames_s3website, bl);
-    RGWSystemMetaObj::encode(bl);
-    encode(realm_id, bl);
-    encode(sync_policy, bl);
-    encode(enabled_features, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) override {
-    DECODE_START(6, bl);
-    decode(name, bl);
-    decode(api_name, bl);
-    decode(is_master, bl);
-    decode(endpoints, bl);
-    decode(master_zone, bl);
-    decode(zones, bl);
-    decode(placement_targets, bl);
-    decode(default_placement, bl);
-    if (struct_v >= 2) {
-      decode(hostnames, bl);
-    }
-    if (struct_v >= 3) {
-      decode(hostnames_s3website, bl);
-    }
-    if (struct_v >= 4) {
-      RGWSystemMetaObj::decode(bl);
-      decode(realm_id, bl);
-    } else {
-      id = name;
-    }
-    if (struct_v >= 5) {
-      decode(sync_policy, bl);
-    }
-    if (struct_v >= 6) {
-      decode(enabled_features, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-
-  int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y, bool old_format = false) override;
-  int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false) override;
-  int create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
-  int equals(const std::string& other_zonegroup) const;
-  int add_zone(const DoutPrefixProvider *dpp, 
-               const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
-               const std::list<std::string>& endpoints, const std::string *ptier_type,
-               bool *psync_from_all, std::list<std::string>& sync_from,
-               std::list<std::string>& sync_from_rm, std::string *predirect_zone,
-               std::optional<int> bucket_index_max_shards, RGWSyncModulesManager *sync_mgr,
-               const rgw::zone_features::set& enable_features,
-               const rgw::zone_features::set& disable_features,
-              optional_yield y);
-  int remove_zone(const DoutPrefixProvider *dpp, const std::string& zone_id, optional_yield y);
-  int rename_zone(const DoutPrefixProvider *dpp, const RGWZoneParams& zone_params, optional_yield y);
-  rgw_pool get_pool(CephContext *cct) const override;
-  const std::string get_default_oid(bool old_region_format = false) const override;
-  const std::string& get_info_oid_prefix(bool old_region_format = false) const override;
-  const std::string& get_names_oid_prefix() const override;
-  std::string get_predefined_id(CephContext *cct) const override;
-  const std::string& get_predefined_name(CephContext *cct) const override;
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-  static void generate_test_instances(std::list<RGWZoneGroup*>& o);
-
-  bool supports(std::string_view feature) const {
-    return enabled_features.contains(feature);
-  }
-};
-WRITE_CLASS_ENCODER(RGWZoneGroup)
-
-struct RGWPeriodMap
-{
-  std::string id;
-  std::map<std::string, RGWZoneGroup> zonegroups;
-  std::map<std::string, RGWZoneGroup> zonegroups_by_api;
-  std::map<std::string, uint32_t> short_zone_ids;
-
-  std::string master_zonegroup;
-
-  void encode(bufferlist& bl) const;
-  void decode(bufferlist::const_iterator& bl);
-
-  int update(const RGWZoneGroup& zonegroup, CephContext *cct);
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-
-  void reset() {
-    zonegroups.clear();
-    zonegroups_by_api.clear();
-    master_zonegroup.clear();
-  }
-
-  uint32_t get_zone_short_id(const std::string& zone_id) const;
-
-  bool find_zone_by_id(const rgw_zone_id& zone_id,
-                       RGWZoneGroup *zonegroup,
-                       RGWZone *zone) const;
-  bool find_zone_by_name(const std::string& zone_id,
-                       RGWZoneGroup *zonegroup,
-                       RGWZone *zone) const;
-};
-WRITE_CLASS_ENCODER(RGWPeriodMap)
-
-struct RGWPeriodConfig
-{
-  RGWQuota quota;
-  RGWRateLimitInfo user_ratelimit;
-  RGWRateLimitInfo bucket_ratelimit;
-  // rate limit unauthenticated user
-  RGWRateLimitInfo anon_ratelimit;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(2, 1, bl);
-    encode(quota.bucket_quota, bl);
-    encode(quota.user_quota, bl);
-    encode(bucket_ratelimit, bl);
-    encode(user_ratelimit, bl);
-    encode(anon_ratelimit, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(2, bl);
-    decode(quota.bucket_quota, bl);
-    decode(quota.user_quota, bl);
-    if (struct_v >= 2) {
-      decode(bucket_ratelimit, bl);
-      decode(user_ratelimit, bl);
-      decode(anon_ratelimit, bl);
-    }
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-
-  // the period config must be stored in a local object outside of the period,
-  // so that it can be used in a default configuration where no realm/period
-  // exists
-  int read(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, optional_yield y);
-  int write(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, optional_yield y);
-
-  static std::string get_oid(const std::string& realm_id);
-  static rgw_pool get_pool(CephContext *cct);
-};
-WRITE_CLASS_ENCODER(RGWPeriodConfig)
-
-class RGWRealm;
-class RGWPeriod;
-
-class RGWRealm : public RGWSystemMetaObj
-{
-public:
-  std::string current_period;
-  epoch_t epoch{0}; //< realm epoch, incremented for each new period
-
-  int create_control(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
-  int delete_control(const DoutPrefixProvider *dpp, optional_yield y);
-public:
-  RGWRealm() {}
-  RGWRealm(const std::string& _id, const std::string& _name = "") : RGWSystemMetaObj(_id, _name) {}
-  RGWRealm(CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_cct, _sysobj_svc) {}
-  RGWRealm(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_name, _cct, _sysobj_svc){}
-  virtual ~RGWRealm() override;
-
-  void encode(bufferlist& bl) const override {
-    ENCODE_START(1, 1, bl);
-    RGWSystemMetaObj::encode(bl);
-    encode(current_period, bl);
-    encode(epoch, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) override {
-    DECODE_START(1, bl);
-    RGWSystemMetaObj::decode(bl);
-    decode(current_period, bl);
-    decode(epoch, bl);
-    DECODE_FINISH(bl);
-  }
-
-  int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true) override;
-  int delete_obj(const DoutPrefixProvider *dpp, optional_yield y);
-  rgw_pool get_pool(CephContext *cct) const override;
-  const std::string get_default_oid(bool old_format = false) const override;
-  const std::string& get_names_oid_prefix() const override;
-  const std::string& get_info_oid_prefix(bool old_format = false) const override;
-  std::string get_predefined_id(CephContext *cct) const override;
-  const std::string& get_predefined_name(CephContext *cct) const override;
-
-  using RGWSystemMetaObj::read_id; // expose as public for radosgw-admin
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-  static void generate_test_instances(std::list<RGWRealm*>& o);
-
-  const std::string& get_current_period() const {
-    return current_period;
-  }
-  int set_current_period(const DoutPrefixProvider *dpp, RGWPeriod& period, optional_yield y);
-  void clear_current_period_and_epoch() {
-    current_period.clear();
-    epoch = 0;
-  }
-  epoch_t get_epoch() const { return epoch; }
-
-  std::string get_control_oid() const;
-  /// send a notify on the realm control object
-  int notify_zone(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y);
-  /// notify the zone of a new period
-  int notify_new_period(const DoutPrefixProvider *dpp, const RGWPeriod& period, optional_yield y);
-
-  int find_zone(const DoutPrefixProvider *dpp,
-                const rgw_zone_id& zid,
-                RGWPeriod *pperiod,
-                RGWZoneGroup *pzonegroup,
-                bool *pfound,
-                optional_yield y) const;
-};
-WRITE_CLASS_ENCODER(RGWRealm)
-
-struct RGWPeriodLatestEpochInfo {
-  epoch_t epoch = 0;
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(epoch, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(epoch, bl);
-    DECODE_FINISH(bl);
-  }
-
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-  static void generate_test_instances(std::list<RGWPeriodLatestEpochInfo*>& o);
-};
-WRITE_CLASS_ENCODER(RGWPeriodLatestEpochInfo)
-
-
-/*
- * The RGWPeriod object contains the entire configuration of a
- * RGWRealm, including its RGWZoneGroups and RGWZones. Consistency of
- * this configuration is maintained across all zones by passing around
- * the RGWPeriod object in its JSON representation.
- *
- * If a new configuration changes which zone is the metadata master
- * zone (i.e., master zone of the master zonegroup), then a new
- * RGWPeriod::id (a uuid) is generated, its RGWPeriod::realm_epoch is
- * incremented, and the RGWRealm object is updated to reflect that new
- * current_period id and epoch. If the configuration changes BUT which
- * zone is the metadata master does NOT change, then only the
- * RGWPeriod::epoch is incremented (and the RGWPeriod::id remains the
- * same).
- *
- * When a new RGWPeriod is created with a new RGWPeriod::id (uuid), it
- * is linked back to its predecessor RGWPeriod through the
- * RGWPeriod::predecessor_uuid field, thus creating a "linked
- * list"-like structure of RGWPeriods back to the cluster's creation.
- */
-class RGWPeriod
-{
-public:
-  std::string id; //< a uuid
-  epoch_t epoch{0};
-  std::string predecessor_uuid;
-  std::vector<std::string> sync_status;
-  RGWPeriodMap period_map;
-  RGWPeriodConfig period_config;
-  std::string master_zonegroup;
-  rgw_zone_id master_zone;
-
-  std::string realm_id;
-  std::string realm_name;
-  epoch_t realm_epoch{1}; //< realm epoch when period was made current
-
-  CephContext *cct{nullptr};
-  RGWSI_SysObj *sysobj_svc{nullptr};
-
-  int read_info(const DoutPrefixProvider *dpp, optional_yield y);
-  int read_latest_epoch(const DoutPrefixProvider *dpp,
-                        RGWPeriodLatestEpochInfo& epoch_info,
-                       optional_yield y,
-                        RGWObjVersionTracker *objv = nullptr);
-  int use_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y);
-  int use_current_period();
-
-  const std::string get_period_oid() const;
-  const std::string get_period_oid_prefix() const;
-
-  // gather the metadata sync status for each shard; only for use on master zone
-  int update_sync_status(const DoutPrefixProvider *dpp, 
-                         rgw::sal::Driver* driver,
-                         const RGWPeriod &current_period,
-                         std::ostream& error_stream, bool force_if_stale);
-
-public:
-  RGWPeriod() {}
-
-  explicit RGWPeriod(const std::string& period_id, epoch_t _epoch = 0)
-    : id(period_id), epoch(_epoch) {}
-
-  const std::string& get_id() const { return id; }
-  epoch_t get_epoch() const { return epoch; }
-  epoch_t get_realm_epoch() const { return realm_epoch; }
-  const std::string& get_predecessor() const { return predecessor_uuid; }
-  const rgw_zone_id& get_master_zone() const { return master_zone; }
-  const std::string& get_master_zonegroup() const { return master_zonegroup; }
-  const std::string& get_realm() const { return realm_id; }
-  const std::string& get_realm_name() const { return realm_name; }
-  const RGWPeriodMap& get_map() const { return period_map; }
-  RGWPeriodConfig& get_config() { return period_config; }
-  const RGWPeriodConfig& get_config() const { return period_config; }
-  const std::vector<std::string>& get_sync_status() const { return sync_status; }
-  rgw_pool get_pool(CephContext *cct) const;
-  const std::string& get_latest_epoch_oid() const;
-  const std::string& get_info_oid_prefix() const;
-
-  void set_user_quota(RGWQuotaInfo& user_quota) {
-    period_config.quota.user_quota = user_quota;
-  }
-
-  void set_bucket_quota(RGWQuotaInfo& bucket_quota) {
-    period_config.quota.bucket_quota = bucket_quota;
-  }
-
-  void set_id(const std::string& _id) {
-    this->id = _id;
-    period_map.id = _id;
-  }
-  void set_epoch(epoch_t epoch) { this->epoch = epoch; }
-  void set_realm_epoch(epoch_t epoch) { realm_epoch = epoch; }
-
-  void set_predecessor(const std::string& predecessor)
-  {
-    predecessor_uuid = predecessor;
-  }
-
-  void set_realm_id(const std::string& _realm_id) {
-    realm_id = _realm_id;
-  }
-
-  int reflect(const DoutPrefixProvider *dpp, optional_yield y);
-
-  int get_zonegroup(RGWZoneGroup& zonegroup,
-                   const std::string& zonegroup_id) const;
-
-  bool is_single_zonegroup() const
-  {
-      return (period_map.zonegroups.size() <= 1);
-  }
-
-  /*
-    returns true if there are several zone groups with a least one zone
-   */
-  bool is_multi_zonegroups_with_zones() const
-  {
-    int count = 0;
-    for (const auto& zg:  period_map.zonegroups) {
-      if (zg.second.zones.size() > 0) {
-       if (count++ > 0) {
-         return true;
-       }
-      }
-    }
-    return false;
-  }
-
-  bool find_zone(const DoutPrefixProvider *dpp,
-                const rgw_zone_id& zid,
-                RGWZoneGroup *pzonegroup,
-                optional_yield y) const;
-
-  int get_latest_epoch(const DoutPrefixProvider *dpp, epoch_t& epoch, optional_yield y);
-  int set_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y,
-                      epoch_t epoch, bool exclusive = false,
-                       RGWObjVersionTracker *objv = nullptr);
-  // update latest_epoch if the given epoch is higher, else return -EEXIST
-  int update_latest_epoch(const DoutPrefixProvider *dpp, epoch_t epoch, optional_yield y);
-
-  int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, const std::string &period_realm_id, optional_yield y,
-          const std::string &period_realm_name = "", bool setup_obj = true);
-  int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, optional_yield y, bool setup_obj = true);  
-
-  int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true);
-  int delete_obj(const DoutPrefixProvider *dpp, optional_yield y);
-  int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
-  int add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y);
-
-  void fork();
-  int update(const DoutPrefixProvider *dpp, optional_yield y);
-
-  // commit a staging period; only for use on master zone
-  int commit(const DoutPrefixProvider *dpp,
-            rgw::sal::Driver* driver,
-             RGWRealm& realm, const RGWPeriod &current_period,
-             std::ostream& error_stream, optional_yield y,
-            bool force_if_stale = false);
-
-  void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
-    encode(id, bl);
-    encode(epoch, bl);
-    encode(realm_epoch, bl);
-    encode(predecessor_uuid, bl);
-    encode(sync_status, bl);
-    encode(period_map, bl);
-    encode(master_zone, bl);
-    encode(master_zonegroup, bl);
-    encode(period_config, bl);
-    encode(realm_id, bl);
-    encode(realm_name, bl);
-    ENCODE_FINISH(bl);
-  }
-
-  void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(1, bl);
-    decode(id, bl);
-    decode(epoch, bl);
-    decode(realm_epoch, bl);
-    decode(predecessor_uuid, bl);
-    decode(sync_status, bl);
-    decode(period_map, bl);
-    decode(master_zone, bl);
-    decode(master_zonegroup, bl);
-    decode(period_config, bl);
-    decode(realm_id, bl);
-    decode(realm_name, bl);
-    DECODE_FINISH(bl);
-  }
-  void dump(Formatter *f) const;
-  void decode_json(JSONObj *obj);
-  static void generate_test_instances(std::list<RGWPeriod*>& o);
-
-  static std::string get_staging_id(const std::string& realm_id) {
-    return realm_id + ":staging";
-  }
-};
-WRITE_CLASS_ENCODER(RGWPeriod)
-
-namespace rgw {
-
-/// Look up a realm by its id. If no id is given, look it up by name.
-/// If no name is given, fall back to the cluster's default realm.
-int read_realm(const DoutPrefixProvider* dpp, optional_yield y,
-               sal::ConfigStore* cfgstore,
-               std::string_view realm_id,
-               std::string_view realm_name,
-               RGWRealm& info,
-               std::unique_ptr<sal::RealmWriter>* writer = nullptr);
-
-/// Create a realm and its initial period. If the info.id is empty, a
-/// random uuid will be generated.
-int create_realm(const DoutPrefixProvider* dpp, optional_yield y,
-                 sal::ConfigStore* cfgstore, bool exclusive,
-                 RGWRealm& info,
-                 std::unique_ptr<sal::RealmWriter>* writer = nullptr);
-
-/// Set the given realm as the cluster's default realm.
-int set_default_realm(const DoutPrefixProvider* dpp, optional_yield y,
-                      sal::ConfigStore* cfgstore, const RGWRealm& info,
-                      bool exclusive = false);
-
-/// Update the current_period of an existing realm.
-int realm_set_current_period(const DoutPrefixProvider* dpp, optional_yield y,
-                             sal::ConfigStore* cfgstore,
-                             sal::RealmWriter& writer, RGWRealm& realm,
-                             const RGWPeriod& period);
-
-/// Overwrite the local zonegroup and period config objects with the new
-/// configuration contained in the given period.
-int reflect_period(const DoutPrefixProvider* dpp, optional_yield y,
-                   sal::ConfigStore* cfgstore, const RGWPeriod& info);
-
-/// Return the staging period id for the given realm.
-std::string get_staging_period_id(std::string_view realm_id);
-
-/// Convert the given period into a separate staging period, where
-/// radosgw-admin can make changes to it without effecting the running
-/// configuration.
-void fork_period(const DoutPrefixProvider* dpp, RGWPeriod& info);
-
-/// Read all zonegroups in the period's realm and add them to the period.
-int update_period(const DoutPrefixProvider* dpp, optional_yield y,
-                  sal::ConfigStore* cfgstore, RGWPeriod& info);
-
-/// Validates the given 'staging' period and tries to commit it as the
-/// realm's new current period.
-int commit_period(const DoutPrefixProvider* dpp, optional_yield y,
-                  sal::ConfigStore* cfgstore, sal::Driver* driver,
-                  RGWRealm& realm, sal::RealmWriter& realm_writer,
-                  const RGWPeriod& current_period,
-                  RGWPeriod& info, std::ostream& error_stream,
-                  bool force_if_stale);
-
-
-/// Look up a zonegroup by its id. If no id is given, look it up by name.
-/// If no name is given, fall back to the cluster's default zonegroup.
-int read_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
-                   sal::ConfigStore* cfgstore,
-                   std::string_view zonegroup_id,
-                   std::string_view zonegroup_name,
-                   RGWZoneGroup& info,
-                   std::unique_ptr<sal::ZoneGroupWriter>* writer = nullptr);
-
-/// Initialize and create the given zonegroup. If the given info.id is empty,
-/// a random uuid will be generated. May fail with -EEXIST.
-int create_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
-                     sal::ConfigStore* cfgstore, bool exclusive,
-                     RGWZoneGroup& info);
-
-/// Set the given zonegroup as its realm's default zonegroup.
-int set_default_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
-                          sal::ConfigStore* cfgstore, const RGWZoneGroup& info,
-                          bool exclusive = false);
-
-/// Add a zone to the zonegroup, or update an existing zone entry.
-int add_zone_to_group(const DoutPrefixProvider* dpp,
-                      RGWZoneGroup& zonegroup,
-                      const RGWZoneParams& zone_params,
-                      const bool *pis_master, const bool *pread_only,
-                      const std::list<std::string>& endpoints,
-                      const std::string *ptier_type,
-                      const bool *psync_from_all,
-                      const std::list<std::string>& sync_from,
-                      const std::list<std::string>& sync_from_rm,
-                      const std::string *predirect_zone,
-                      std::optional<int> bucket_index_max_shards,
-                      const rgw::zone_features::set& enable_features,
-                      const rgw::zone_features::set& disable_features);
-
-/// Remove a zone by id from its zonegroup, promoting a new master zone if
-/// necessary.
-int remove_zone_from_group(const DoutPrefixProvider* dpp,
-                           RGWZoneGroup& info,
-                           const rgw_zone_id& zone_id);
-
-
-/// Look up a zone by its id. If no id is given, look it up by name. If no name
-/// is given, fall back to the realm's default zone.
-int read_zone(const DoutPrefixProvider* dpp, optional_yield y,
-              sal::ConfigStore* cfgstore,
-              std::string_view zone_id,
-              std::string_view zone_name,
-              RGWZoneParams& info,
-              std::unique_ptr<sal::ZoneWriter>* writer = nullptr);
-
-/// Initialize and create a new zone. If the given info.id is empty, a random
-/// uuid will be generated. Pool names are initialized with the zone name as a
-/// prefix. If any pool names conflict with existing zones, a random suffix is
-/// added.
-int create_zone(const DoutPrefixProvider* dpp, optional_yield y,
-                sal::ConfigStore* cfgstore, bool exclusive,
-                RGWZoneParams& info,
-                std::unique_ptr<sal::ZoneWriter>* writer = nullptr);
-
-/// Initialize the zone's pool names using the zone name as a prefix. If a pool
-/// name conflicts with an existing zone's pool, add a unique suffix.
-int init_zone_pool_names(const DoutPrefixProvider *dpp, optional_yield y,
-                         const std::set<rgw_pool>& pools, RGWZoneParams& info);
-
-/// Set the given zone as its realm's default zone.
-int set_default_zone(const DoutPrefixProvider* dpp, optional_yield y,
-                      sal::ConfigStore* cfgstore, const RGWZoneParams& info,
-                      bool exclusive = false);
-
-/// Delete an existing zone and remove it from any zonegroups that contain it.
-int delete_zone(const DoutPrefixProvider* dpp, optional_yield y,
-                sal::ConfigStore* cfgstore, const RGWZoneParams& info,
-                sal::ZoneWriter& writer);
-
-} // namespace rgw
-
-#endif
index 0a38a4698600740531dfefba096603d8b17d8256..8909788a4f85b679c48786e2bf9491e8f8cbc719 100644 (file)
@@ -320,7 +320,7 @@ add_executable(ceph_test_librgw_file_nfsns
 target_include_directories(ceph_test_librgw_file_nfsns
   PUBLIC "${LUA_INCLUDE_DIR}"
   SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw"
-  SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw/store/rados")
+  SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados")
 target_link_libraries(ceph_test_librgw_file_nfsns
   rgw
   librados
@@ -352,7 +352,7 @@ add_executable(ceph_test_librgw_file_marker
 target_include_directories(ceph_test_librgw_file_marker
   PUBLIC "${LUA_INCLUDE_DIR}"
   SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw"
-  SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw/store/rados")
+  SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados")
 target_link_libraries(ceph_test_librgw_file_marker
   rgw
   librados
@@ -370,7 +370,7 @@ add_executable(ceph_test_librgw_file_xattr
 target_include_directories(ceph_test_librgw_file_xattr
   PUBLIC "${LUA_INCLUDE_DIR}"
   SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw"
-  SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw/store/rados")
+  SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados")
 target_link_libraries(ceph_test_librgw_file_xattr
   rgw
   librados
@@ -399,7 +399,7 @@ add_executable(test_rgw_ldap
   )
 target_include_directories(test_rgw_ldap
   SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw"
-  SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw/store/rados")
+  SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados")
 target_link_libraries(test_rgw_ldap
   librados
   ceph-common