rgw_bucket_encryption.cc
rgw_tracer.cc
rgw_lua_background.cc
- rgw_dedup.cc
- rgw_dedup_table.cc
- rgw_dedup_store.cc
- rgw_dedup_utils.cc
- rgw_dedup_cluster.cc
rgw_data_access.cc
rgw_realm_watcher.cc
rgw_bucket_logging.cc
driver/rados/config/realm_watcher.cc
driver/rados/config/store.cc
driver/rados/config/zone.cc
- driver/rados/config/zonegroup.cc)
+ driver/rados/config/zonegroup.cc
+ driver/rados/rgw_dedup.cc
+ driver/rados/rgw_dedup_table.cc
+ driver/rados/rgw_dedup_store.cc
+ driver/rados/rgw_dedup_utils.cc
+ driver/rados/rgw_dedup_cluster.cc)
endif()
if(WITH_RADOSGW_AMQP_ENDPOINT)
list(APPEND librgw_common_srcs rgw_amqp.cc)
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/rados_types.hpp"
+#include "include/rados/buffer.h"
+#include "include/rados/librados.hpp"
+#include "rgw_tools.h"
+#include "svc_zone.h"
+#include "common/config.h"
+#include "common/Cond.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "rgw_common.h"
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_cache.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
+#include "rgw_aio_throttle.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_sal_config.h"
+#include "rgw_lib.h"
+#include "rgw_placement_types.h"
+#include "driver/rados/rgw_bucket.h"
+#include "driver/rados/rgw_sal_rados.h"
+#include "cls/rgw/cls_rgw_ops.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw/cls_rgw_const.h"
+#include "cls/refcount/cls_refcount_client.h"
+#include "cls/version/cls_version_client.h"
+#include "fmt/ranges.h"
+#include "osd/osd_types.h"
+#include "common/ceph_crypto.h"
+
+#include <filesystem>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include <time.h>
+#include <sstream>
+#include <errno.h>
+#include <dirent.h>
+#include <stdexcept>
+#include <limits>
+#include <climits>
+#include <cinttypes>
+#include <cstring>
+#include <span>
+#include <mutex>
+#include <thread>
+
+//using namespace std::chrono_literals;
+using namespace librados;
+using namespace std;
+using namespace rgw::dedup;
+
+#include "rgw_dedup_remap.h"
+#include "rgw_sal_rados.h"
+#include "rgw_dedup_table.h"
+#include "rgw_dedup_utils.h"
+#include "rgw_dedup.h"
+#include "rgw_dedup_store.h"
+#include "rgw_dedup_cluster.h"
+#include "rgw_dedup_epoch.h"
+#include "rgw_perf_counters.h"
+#include "include/ceph_assert.h"
+
+static constexpr auto dout_subsys = ceph_subsys_rgw_dedup;
+
+namespace rgw::dedup {
+ static inline constexpr unsigned MAX_STORAGE_CLASS_IDX = 128;
+ using storage_class_idx_t = uint8_t;
+
+ //---------------------------------------------------------------------------
+ void Background::DedupWatcher::handle_notify(uint64_t notify_id, uint64_t cookie,
+ uint64_t notifier_id, bufferlist &bl)
+ {
+ ldpp_dout(parent->dpp, 10) << __func__ << "::notify_id=" << notify_id
+ << "::cookie=" << cookie
+ << "::notifier_id=" << notifier_id << dendl;
+ if (parent->d_watch_handle != cookie) {
+ ldpp_dout(parent->dpp, 1) << __func__ << "::ERR: wrong cookie=" << cookie
+ << "::d_watch_handle=" << parent->d_watch_handle
+ << dendl;
+ return;
+ }
+ parent->handle_notify(notify_id, cookie, bl);
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::DedupWatcher::handle_error(uint64_t cookie, int err)
+ {
+ if (parent->d_watch_handle != cookie) {
+ ldpp_dout(parent->dpp, 1) << __func__ << "::ERR: wrong cookie=" << cookie
+ << "::d_watch_handle=" << parent->d_watch_handle
+ << dendl;
+ return;
+ }
+ ldpp_dout(parent->dpp, 1) << __func__ << "::error=" << err << dendl;
+
+ parent->unwatch_reload(parent->dpp);
+ parent->watch_reload(parent->dpp);
+ }
+
+ //---------------------------------------------------------------------------
+ void control_t::reset()
+ {
+ this->dedup_type = dedup_req_type_t::DEDUP_TYPE_NONE;
+ this->started = false;
+ this->dedup_exec = false;
+ this->shutdown_req = false;
+ this->shutdown_done = false;
+ this->local_pause_req = false;
+ this->local_paused = false;
+ this->remote_abort_req = false;
+ this->remote_aborted = false;
+ this->remote_pause_req = false;
+ this->remote_paused = false;
+ this->remote_restart_req = false;
+ }
+
+ //---------------------------------------------------------------------------
+ void encode(const control_t& ctl, ceph::bufferlist& bl)
+ {
+ ENCODE_START(1, 1, bl);
+ encode(static_cast<int32_t>(ctl.dedup_type), bl);
+ encode(ctl.started, bl);
+ encode(ctl.dedup_exec, bl);
+ encode(ctl.shutdown_req, bl);
+ encode(ctl.shutdown_done, bl);
+ encode(ctl.local_pause_req, bl);
+ encode(ctl.local_paused, bl);
+ encode(ctl.remote_abort_req, bl);
+ encode(ctl.remote_aborted, bl);
+ encode(ctl.remote_pause_req, bl);
+ encode(ctl.remote_paused, bl);
+ encode(ctl.remote_restart_req, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ //---------------------------------------------------------------------------
+ void decode(control_t& ctl, ceph::bufferlist::const_iterator& bl)
+ {
+ DECODE_START(1, bl);
+ int32_t dedup_type;
+ decode(dedup_type, bl);
+ ctl.dedup_type = static_cast<dedup_req_type_t> (dedup_type);
+ decode(ctl.started, bl);
+ decode(ctl.dedup_exec, bl);
+ decode(ctl.shutdown_req, bl);
+ decode(ctl.shutdown_done, bl);
+ decode(ctl.local_pause_req, bl);
+ decode(ctl.local_paused, bl);
+ decode(ctl.remote_abort_req, bl);
+ decode(ctl.remote_aborted, bl);
+ decode(ctl.remote_pause_req, bl);
+ decode(ctl.remote_paused, bl);
+ decode(ctl.remote_restart_req, bl);
+ DECODE_FINISH(bl);
+ }
+
+ //---------------------------------------------------------------------------
+ std::ostream& operator<<(std::ostream &out, const control_t &ctl)
+ {
+ out << ctl.dedup_type;
+ if (ctl.started) {
+ out << "::started";
+ }
+ if (ctl.dedup_exec) {
+ out << "::dedup_exec";
+ }
+ if (ctl.shutdown_req) {
+ out << "::shutdown_req";
+ }
+ if (ctl.shutdown_done) {
+ out << "::shutdown_done";
+ }
+ if (ctl.local_pause_req) {
+ out << "::local_pause_req";
+ }
+ if (ctl.local_paused) {
+ out << "::local_paused";
+ }
+ if (ctl.remote_abort_req) {
+ out << "::remote_abort_req";
+ }
+ if (ctl.remote_aborted) {
+ out << "::remote_aborted";
+ }
+ if (ctl.remote_pause_req) {
+ out << "::remote_pause_req";
+ }
+ if (ctl.remote_paused) {
+ out << "::remote_paused";
+ }
+ if (ctl.remote_restart_req) {
+ out << "::remote_restart_req";
+ }
+
+ return out;
+ }
+
+ //===========================================================================
+ // rgw::dedup::Background
+ //===========================================================================
+ //---------------------------------------------------------------------------
+ static void display_ioctx_state(const DoutPrefixProvider *dpp,
+ const librados::IoCtx &ioctx,
+ const char *caller)
+ {
+ if (ioctx.is_valid()) {
+ ldpp_dout(dpp, 5) << caller << "::valid ioctx, instance_id="
+ << ioctx.get_instance_id() << dendl;
+ }
+ else {
+ ldpp_dout(dpp, 5) << caller << "::invalid ioctx" << dendl;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ static int safe_pool_delete(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ int64_t expected_pool_id)
+ {
+ const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+ auto rados_handle = store->getRados()->get_rados_handle();
+ int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
+ if (pool_id < 0) {
+ int err = pool_id;
+ if (err == ENOENT) {
+ ldpp_dout(dpp, 10) <<__func__ << "::pool doesn't exist (probably was removed by other RGW)::"
+ << dedup_pool.name << "::expected_pool_id="
+ << expected_pool_id << dendl;
+ }
+ else {
+ ldpp_dout(dpp, 5) <<__func__ << "::failed pool_lookup(" << dedup_pool.name
+ << ") err=" << cpp_strerror(-err) << dendl;
+ }
+ return err;
+ }
+
+ if (pool_id != expected_pool_id) {
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: pool_id was changed from: "
+ << expected_pool_id << " to: " << pool_id
+ << " abort pool_delete() request!" << dendl;
+ // report Stale file handle
+ return -ESTALE;
+ }
+
+ ldpp_dout(dpp, 10) <<__func__ << "::calling delete pool(" << dedup_pool.name
+ << ") pool_id=" << pool_id << dendl;
+ return rados_handle->pool_delete(dedup_pool.name.c_str());
+ }
+
+ //---------------------------------------------------------------------------
+ static int64_t create_pool(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ const std::string &pool_name)
+ {
+#if 0
+ // using Replica-1 for the intermediate data
+ // since it can be regenerated in case of a failure
+ std::string replica_count(std::to_string(1));
+#else
+ // temporary solution until we find a way to disable the health warn on replica1
+ std::string replica_count(std::to_string(2));
+#endif
+ librados::bufferlist inbl;
+ std::string output;
+ std::string command = R"(
+ {
+ "prefix": "osd pool create",
+ "pool": ")" + pool_name +
+ R"(",
+ "pool_type": "replicated",
+ "size": )" + replica_count +
+ R"(
+ })";
+
+ auto rados_handle = store->getRados()->get_rados_handle();
+ int ret = rados_handle->mon_command(command, inbl, nullptr, &output);
+ if (output.length()) {
+ if (output != "pool 'rgw_dedup_pool' already exists") {
+ ldpp_dout(dpp, 10) << __func__ << "::" << output << dendl;
+ }
+ }
+ if (ret != 0 && ret != -EEXIST) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to create pool "
+ << pool_name << " with: "
+ << cpp_strerror(-ret) << ", ret=" << ret << dendl;
+ return ret;
+ }
+ const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+ return rados_handle->pool_lookup(dedup_pool.name.c_str());
+ }
+
+ //---------------------------------------------------------------------------
+ static int init_dedup_pool_ioctx(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ bool create,
+ librados::IoCtx &ioctx)
+ {
+ const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+ std::string pool_name(dedup_pool.name.c_str());
+ auto rados_handle = store->getRados()->get_rados_handle();
+ int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
+ if (pool_id >= 0) {
+ // TBD: what to do when create option is passed
+ ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
+ << " already exists, pool_id=" << pool_id << dendl;
+ }
+ else if (create) {
+ pool_id = create_pool(store, dpp, pool_name);
+ if (pool_id >= 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
+ << " was created, pool_id=" << pool_id << dendl;
+ }
+ else {
+ return pool_id;
+ }
+ }
+ else {
+ ldpp_dout(dpp, 1) << __func__
+ << "::ERR: pool doesn't exist and no create option" << dendl;
+ return -ENOENT;
+ }
+
+ int ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx);
+ if (unlikely(ret < 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() ret=" << ret
+ << "::" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ ret = ioctx.application_enable("rgw_dedup", false);
+ if (ret == 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
+ << " was associated with dedup app" << dendl;
+ }
+ else {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to enable pool "
+ << dedup_pool.name << " with: "
+ << cpp_strerror(-ret) << ", ret=" << ret << dendl;
+ }
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::init_rados_access_handles(bool init_pool)
+ {
+ store = dynamic_cast<rgw::sal::RadosStore*>(driver);
+ if (!store) {
+ ldpp_dout(dpp, 0) << "ERR: failed dynamic_cast to RadosStore" << dendl;
+ // this is the return code used in rgw_bucket.cc
+ return -ENOTSUP;
+ }
+
+ rados = store->getRados();
+ rados_handle = rados->get_rados_handle();
+ if (init_pool) {
+ int ret = init_dedup_pool_ioctx(store, dpp, true, d_dedup_cluster_ioctx);
+ display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
+ return ret;
+ }
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ Background::Background(rgw::sal::Driver* _driver, CephContext* _cct) :
+ driver(_driver),
+ dp(_cct, dout_subsys, "dedup background: "),
+ dpp(&dp),
+ cct(_cct),
+ d_cluster(dpp, cct, driver),
+ d_watcher_ctx(this)
+ {
+ d_min_obj_size_for_dedup = cct->_conf->rgw_max_chunk_size;
+ d_head_object_size = cct->_conf->rgw_max_chunk_size;
+ //ceph_assert(4*1024*1024 == d_head_object_size);
+
+ int ret = init_rados_access_handles(false);
+ if (ret != 0) {
+ derr << __func__ << "::ERR: failed init_rados_access_handles() ret="
+ << ret << "::" << cpp_strerror(-ret) << dendl;
+ throw std::runtime_error("Failed init_rados_access_handles()");
+ }
+
+ d_heart_beat_last_update = ceph_clock_now();
+ d_heart_beat_max_elapsed_sec = 3;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::add_disk_rec_from_bucket_idx(disk_block_array_t &disk_arr,
+ const rgw::sal::Bucket *p_bucket,
+ const parsed_etag_t *p_parsed_etag,
+ const std::string &obj_name,
+ uint64_t obj_size,
+ const std::string &storage_class)
+ {
+ disk_record_t rec(p_bucket, obj_name, p_parsed_etag, obj_size, storage_class);
+ // First pass using only ETAG and size taken from bucket-index
+ rec.s.flags.set_fastlane();
+
+ auto p_disk = disk_arr.get_shard_block_seq(p_parsed_etag->md5_low);
+ disk_block_seq_t::record_info_t rec_info;
+ int ret = p_disk->add_record(d_dedup_cluster_ioctx, &rec, &rec_info);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << __func__ << "::" << p_bucket->get_name() << "/"
+ << obj_name << " was written to block_idx="
+ << rec_info.block_id << " rec_id=" << rec_info.rec_id << dendl;
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::add_record_to_dedup_table(dedup_table_t *p_table,
+ const disk_record_t *p_rec,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ md5_stats_t *p_stats,
+ remapper_t *remapper)
+ {
+ uint32_t size_4k_units = byte_size_to_disk_blocks(p_rec->s.obj_bytes_size);
+ storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
+ &p_stats->failed_map_overflow);
+ if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
+ // TBD: need stat counters
+ return -EOVERFLOW;
+ }
+ key_t key(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units,
+ p_rec->s.num_parts, sc_idx);
+ bool has_shared_manifest = p_rec->has_shared_manifest();
+ ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_rec->bucket_name
+ << ", obj=" << p_rec->obj_name << ", block_id="
+ << (uint32_t)block_id << ", rec_id=" << (uint32_t)rec_id
+ << ", shared_manifest=" << has_shared_manifest
+ << "::num_parts=" << p_rec->s.num_parts
+ << "::size_4k_units=" << key.size_4k_units
+ << "::ETAG=" << std::hex << p_rec->s.md5_high
+ << p_rec->s.md5_low << std::dec << dendl;
+
+ int ret = p_table->add_entry(&key, block_id, rec_id, has_shared_manifest);
+ if (ret == 0) {
+ p_stats->loaded_objects ++;
+ ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/"
+ << p_rec->obj_name << " was added successfully to table"
+ << "::loaded_objects=" << p_stats->loaded_objects << dendl;
+ }
+ else {
+ // We allocate memory for the dedup on startup based on the existing obj count
+ // If the system grew significantly since that point we won't be able to
+ // accommodate all the objects in the hash-table.
+ // Please keep in mind that it is very unlikely since duplicates objects will
+ // consume a single entry and since we skip small objects so in reality
+ // I expect the allocation to be more than sufficient.
+ //
+ // However, if we filled up the system there is still value is continuing
+ // with this process since we might find duplicates to existing object (which
+ // don't take extra space)
+
+ int level = 15;
+ if (p_stats->failed_table_load % 0x10000 == 0) {
+ level = 5;
+ }
+ else if (p_stats->failed_table_load % 0x100 == 0) {
+ level = 10;
+ }
+ ldpp_dout(dpp, level) << __func__ << "::Failed p_table->add_entry (overflow)"
+ << "::loaded_objects=" << p_stats->loaded_objects
+ << "::failed_table_load=" << p_stats->failed_table_load
+ << dendl;
+
+ p_stats->failed_table_load++;
+ }
+ return ret;
+ }
+
+#ifdef FULL_DEDUP_SUPPORT
+
+ static constexpr uint64_t cost = 1; // 1 throttle unit per request
+ static constexpr uint64_t id = 0; // ids unused
+ //---------------------------------------------------------------------------
+ [[maybe_unused]]static void show_ref_tags(const DoutPrefixProvider* dpp, std::string &oid, rgw_rados_ref &obj)
+ {
+ unsigned idx = 0;
+ std::list<std::string> refs;
+ std::string wildcard_tag;
+ int ret = cls_refcount_read(obj.ioctx, oid, &refs, true);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ << "::ERR: manifest::failed cls_refcount_read()"
+ << " idx=" << idx << dendl;
+ return;
+ }
+
+ for (list<string>::iterator iter = refs.begin(); iter != refs.end(); ++iter) {
+ ldpp_dout(dpp, 20) << __func__ << "::manifest::" << oid << "::" << idx
+ << "::TAG=" << *iter << dendl;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::free_tail_objs_by_manifest(const string &ref_tag,
+ const string &oid,
+ RGWObjManifest &tgt_manifest)
+ {
+ unsigned idx = 0;
+ for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) {
+ rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+ if (oid == raw_obj.oid) {
+ ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl;
+ continue;
+ }
+
+ rgw_rados_ref obj;
+ int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "ERR: manifest::failed to open context "
+ << obj << dendl;
+ continue;
+ }
+ librados::IoCtx ioctx = obj.ioctx;
+ ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid
+ << dendl;
+ ret = ioctx.remove(raw_obj.oid);
+ }
+
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::rollback_ref_by_manifest(const string &ref_tag,
+ const string &oid,
+ RGWObjManifest &manifest)
+ {
+ unsigned idx = 0;
+ int ret_code = 0;
+ std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
+ for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
+ rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+ if (oid == raw_obj.oid) {
+ ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
+ << raw_obj.oid << dendl;
+ continue;
+ }
+
+ rgw_rados_ref obj;
+ int local_ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
+ if (local_ret < 0) {
+ ret_code = local_ret;
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context "
+ << obj << dendl;
+ // skip bad objects, nothing we can do
+ continue;
+ }
+
+ ObjectWriteOperation op;
+ cls_refcount_put(op, ref_tag, true);
+ rgw::AioResultList completed = aio->get(obj.obj,
+ rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
+ cost, id);
+ }
+ rgw::AioResultList completed = aio->drain();
+ return ret_code;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::inc_ref_count_by_manifest(const string &ref_tag,
+ const string &oid,
+ RGWObjManifest &manifest)
+ {
+ std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
+ rgw::AioResultList all_results;
+ int ret = 0;
+ unsigned idx = 0;
+ for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
+ rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+ if (oid == raw_obj.oid) {
+ ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl;
+ continue;
+ }
+
+ rgw_rados_ref obj;
+ ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context "
+ << obj << dendl;
+ break;
+ }
+
+ ObjectWriteOperation op;
+ cls_refcount_get(op, ref_tag, true);
+ ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: " << raw_obj.oid << dendl;
+ rgw::AioResultList completed = aio->get(obj.obj,
+ rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
+ cost, id);
+ ret = rgw::check_for_errors(completed);
+ all_results.splice(all_results.end(), completed);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj
+ << ", the error code = " << ret << dendl;
+ break;
+ }
+ }
+
+ if (ret == 0) {
+ rgw::AioResultList completed = aio->drain();
+ int ret = rgw::check_for_errors(completed);
+ all_results.splice(all_results.end(), completed);
+ if (ret == 0) {
+ return 0;
+ }
+ else {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest: failed to drain ios ret="
+ << ret <<dendl;
+ }
+ }
+
+ // if arrived here we failed somewhere -> rollback all ref-inc operations
+ /* wait all pending op done */
+ rgw::AioResultList completed = aio->drain();
+ all_results.splice(all_results.end(), completed);
+ int ret2 = 0;
+ for (auto& aio_res : all_results) {
+ if (aio_res.result < 0) {
+ continue; // skip errors
+ }
+ rgw_rados_ref obj;
+ ret2 = rgw_get_rados_ref(dpp, rados_handle, aio_res.obj, &obj);
+ if (ret2 < 0) {
+ continue;
+ }
+
+ ObjectWriteOperation op;
+ cls_refcount_put(op, ref_tag, true);
+ rgw::AioResultList completed = aio->get(obj.obj,
+ rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
+ cost, id);
+ ret2 = rgw::check_for_errors(completed);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj=" << aio_res.obj << dendl;
+ }
+ }
+ completed = aio->drain();
+ ret2 = rgw::check_for_errors(completed);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to drain rollback ios, ret="
+ << ret2 <<dendl;
+ }
+
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ static int get_ioctx(const DoutPrefixProvider* const dpp,
+ rgw::sal::Driver* driver,
+ RGWRados* rados,
+ const disk_record_t *p_rec,
+ librados::IoCtx *p_ioctx,
+ std::string *oid)
+ {
+ unique_ptr<rgw::sal::Bucket> bucket;
+ {
+ rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
+ int ret = driver->load_bucket(dpp, b, &bucket, null_yield);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ }
+
+ build_oid(p_rec->bucket_id, p_rec->obj_name, oid);
+ //ldpp_dout(dpp, 0) << __func__ << "::OID=" << oid << " || bucket_id=" << bucket_id << dendl;
+ rgw_pool data_pool;
+ rgw_obj obj{bucket->get_key(), *oid};
+ if (!rados->get_obj_data_pool(bucket->get_placement_rule(), obj, &data_pool)) {
+ ldpp_dout(dpp, 1) << __func__ << "::failed to get data pool for bucket "
+ << bucket->get_name() << dendl;
+ return -EIO;
+ }
+ int ret = rgw_init_ioctx(dpp, rados->get_rados_handle(), data_pool, *p_ioctx);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to get ioctx from data pool:"
+ << data_pool.to_str() << dendl;
+ return -EIO;
+ }
+
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ static void init_cmp_pairs(const disk_record_t *p_rec,
+ const bufferlist &etag_bl,
+ bufferlist &hash_bl, // OUT PARAM
+ librados::ObjectWriteOperation *p_op)
+ {
+ p_op->cmpxattr(RGW_ATTR_ETAG, CEPH_OSD_CMPXATTR_OP_EQ, etag_bl);
+ // TBD: do we really need the secondary compare using the full manifest?
+ // Can replace it with something cheaper like size/version?
+ p_op->cmpxattr(RGW_ATTR_MANIFEST, CEPH_OSD_CMPXATTR_OP_EQ, p_rec->manifest_bl);
+
+ // BLAKE3 hash has 256 bit splitted into multiple 64bit units
+ const unsigned units = (256 / (sizeof(uint64_t)*8));
+ static_assert(units == 4);
+ for (unsigned i = 0; i < units; i++) {
+ ceph::encode(p_rec->s.hash[i], hash_bl);
+ }
+
+ if (!p_rec->s.flags.hash_calculated()) {
+ p_op->cmpxattr(RGW_ATTR_BLAKE3, CEPH_OSD_CMPXATTR_OP_EQ, hash_bl);
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::dedup_object(const disk_record_t *p_src_rec,
+ const disk_record_t *p_tgt_rec,
+ md5_stats_t *p_stats,
+ bool has_shared_manifest_src)
+ {
+ RGWObjManifest src_manifest;
+ try {
+ auto bl_iter = p_src_rec->manifest_bl.cbegin();
+ decode(src_manifest, bl_iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest" << dendl;
+ return -EINVAL;
+ }
+ RGWObjManifest tgt_manifest;
+ try {
+ auto bl_iter = p_tgt_rec->manifest_bl.cbegin();
+ decode(tgt_manifest, bl_iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad tgt manifest" << dendl;
+ return -EINVAL;
+ }
+ ldpp_dout(dpp, 20) << __func__ << "::DEDUP From: "
+ << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << " -> "
+ << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name << dendl;
+
+ bufferlist etag_bl;
+ etag_to_bufferlist(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, p_tgt_rec->s.num_parts, &etag_bl);
+ ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts
+ << "::ETAG=" << etag_bl.to_str() << dendl;
+
+ bufferlist hash_bl, manifest_hash_bl, tgt_hash_bl;
+ crypto::digest<crypto::SHA1>(p_src_rec->manifest_bl).encode(hash_bl);
+ // Use a shorter hash (64bit instead of 160bit)
+ hash_bl.splice(0, 8, &manifest_hash_bl);
+ librados::ObjectWriteOperation tgt_op;
+ init_cmp_pairs(p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op);
+ tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
+ tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
+ if (p_tgt_rec->s.flags.hash_calculated()) {
+ tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl);
+ p_stats->set_hash_attrs++;
+ }
+
+ std::string src_oid, tgt_oid;
+ librados::IoCtx src_ioctx, tgt_ioctx;
+ int ret1 = get_ioctx(dpp, driver, rados, p_src_rec, &src_ioctx, &src_oid);
+ int ret2 = get_ioctx(dpp, driver, rados, p_tgt_rec, &tgt_ioctx, &tgt_oid);
+ if (unlikely(ret1 != 0 || ret2 != 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl;
+ return (ret1 ? ret1 : ret2);
+ }
+
+ // TBD: Do we need to remove target RGW_ATTR_TAIL_TAG??
+ string ref_tag = p_tgt_rec->ref_tag;
+ ldpp_dout(dpp, 20) << __func__ << "::ref_tag=" << ref_tag << dendl;
+ int ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest);
+ if (ret == 0) {
+ ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS (Shared_Manifest)" << dendl;
+ ret = tgt_ioctx.operate(tgt_oid, &tgt_op);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate("
+ << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl;
+ rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
+ return ret;
+ }
+
+ // free tail objects based on TGT manifest
+ free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest);
+
+ if (!has_shared_manifest_src) {
+ // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST
+ // after deduping B and update it in dedup_table, but don't update the
+ // disk-record (as require an expensive random-disk-write).
+ // When deduping C we can trust the shared_manifest state in the table and
+ // skip a redundant update to SRC object attribute
+ bufferlist src_hash_bl;
+ librados::ObjectWriteOperation src_op;
+ init_cmp_pairs(p_src_rec, etag_bl, src_hash_bl, &src_op);
+ src_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
+ if (p_src_rec->s.flags.hash_calculated()) {
+ src_op.setxattr(RGW_ATTR_BLAKE3, src_hash_bl);
+ p_stats->set_hash_attrs++;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS (Shared_Manifest)"<< dendl;
+ ret = src_ioctx.operate(src_oid, &src_op);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate("
+ << src_oid << "), err is " << cpp_strerror(-ret)<<dendl;
+ return ret;
+ }
+ }
+ }
+
+ // do we need to set compression on the head object or is it set on tail?
+ // RGW_ATTR_COMPRESSION
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash)
+ {
+ ldpp_dout(dpp, 20) << __func__ << "::obj_name=" << p_rec->obj_name << dendl;
+ RGWObjManifest manifest;
+ try {
+ auto bl_iter = p_rec->manifest_bl.cbegin();
+ decode(manifest, bl_iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest for: "
+ << p_rec->obj_name << dendl;
+ return -EINVAL;
+ }
+
+ blake3_hasher hmac;
+ blake3_hasher_init(&hmac);
+ for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p) {
+ rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+ rgw_rados_ref obj;
+ int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid: "
+ << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ bufferlist bl;
+ librados::IoCtx ioctx = obj.ioctx;
+ // read full object
+ ret = ioctx.read(raw_obj.oid, bl, 0, 0);
+ if (ret > 0) {
+ for (const auto& bptr : bl.buffers()) {
+ blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(), bptr.length());
+ }
+ }
+ else {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << raw_obj.oid
+ << ", error is " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ }
+
+ blake3_hasher_finalize(&hmac, p_hash, BLAKE3_OUT_LEN);
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ [[maybe_unused]]static void __attribute__ ((noinline))
+ print_record(const DoutPrefixProvider* dpp,
+ const disk_record_t *p_tgt_rec,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ md5_shard_t md5_shard)
+ {
+ ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_tgt_rec->bucket_name
+ << ", obj=" << p_tgt_rec->obj_name
+ << ", block_id=" << block_id
+ << ", rec_id=" << (int)rec_id
+ << ", md5_shard=" << (int)md5_shard << dendl;
+
+ ldpp_dout(dpp, 20) << __func__ << "::md5_shard=" << (int)md5_shard
+ << "::" << p_tgt_rec->bucket_name
+ << "/" << p_tgt_rec->obj_name
+ << "::num_parts=" << p_tgt_rec->s.num_parts
+ << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high
+ << p_tgt_rec->s.md5_low << std::dec << dendl;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::add_obj_attrs_to_record(rgw_bucket *p_rb,
+ disk_record_t *p_rec,
+ const rgw::sal::Attrs &attrs,
+ dedup_table_t *p_table,
+ md5_stats_t *p_stats) /*IN-OUT*/
+ {
+ // if TAIL_TAG exists -> use it as ref-tag, eitherwise take ID_TAG
+ auto itr = attrs.find(RGW_ATTR_TAIL_TAG);
+ if (itr != attrs.end()) {
+ p_rec->ref_tag = itr->second.to_str();
+ }
+ else {
+ itr = attrs.find(RGW_ATTR_ID_TAG);
+ if (itr != attrs.end()) {
+ p_rec->ref_tag = itr->second.to_str();
+ }
+ else {
+ ldpp_dout(dpp, 5) << __func__ << "::No TAIL_TAG and no ID_TAG" << dendl;
+ return -EINVAL;
+ }
+ }
+ p_rec->s.ref_tag_len = p_rec->ref_tag.length();
+
+ // clear bufferlist first
+ p_rec->manifest_bl.clear();
+
+ itr = attrs.find(RGW_ATTR_MANIFEST);
+ if (itr != attrs.end()) {
+ const bufferlist &bl = itr->second;
+ RGWObjManifest manifest;
+ try {
+ auto bl_iter = bl.cbegin();
+ decode(manifest, bl_iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 1) << __func__
+ << "::ERROR: unable to decode manifest" << dendl;
+ return -EINVAL;
+ }
+
+ // force explicit tail_placement as the dedup could be on another bucket
+ const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
+ if (tail_placement.bucket.name.empty()) {
+ ldpp_dout(dpp, 20) << __func__ << "dedup::updating tail placement" << dendl;
+ manifest.set_tail_placement(tail_placement.placement_rule, *p_rb);
+ encode(manifest, p_rec->manifest_bl);
+ }
+ else {
+ p_rec->manifest_bl = bl;
+ }
+ p_rec->s.manifest_len = p_rec->manifest_bl.length();
+ }
+ else {
+ ldpp_dout(dpp, 5) << __func__ << "::ERROR: no manifest" << dendl;
+ return -EINVAL;
+ }
+
+ itr = attrs.find(RGW_ATTR_SHARE_MANIFEST);
+ if (itr != attrs.end()) {
+ uint64_t hash = 0;
+ try {
+ auto bl_iter = itr->second.cbegin();
+ ceph::decode(hash, bl_iter);
+ p_rec->s.shared_manifest = hash;
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad shared_manifest" << dendl;
+ return -EINVAL;
+ }
+ ldpp_dout(dpp, 20) << __func__ << "::Set Shared_Manifest::OBJ_NAME="
+ << p_rec->obj_name << "::shared_manifest=0x" << std::hex
+ << p_rec->s.shared_manifest << std::dec << dendl;
+ p_rec->s.flags.set_shared_manifest();
+ }
+ else {
+ memset(&p_rec->s.shared_manifest, 0, sizeof(p_rec->s.shared_manifest));
+ }
+
+ itr = attrs.find(RGW_ATTR_BLAKE3);
+ if (itr != attrs.end()) {
+ try {
+ auto bl_iter = itr->second.cbegin();
+ // BLAKE3 hash 256 bit splitted into multiple 64bit units
+ const unsigned units = (256 / (sizeof(uint64_t)*8));
+ static_assert(units == 4);
+ for (unsigned i = 0; i < units; i++) {
+ uint64_t val;
+ ceph::decode(val, bl_iter);
+ p_rec->s.hash[i] = val;
+ }
+ p_stats->valid_hash_attrs++;
+ return 0;
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed HASH decode" << dendl;
+ return -EINVAL;
+ }
+ }
+
+ p_stats->invalid_hash_attrs++;
+ // TBD: redundant memset...
+ memset(p_rec->s.hash, 0, sizeof(p_rec->s.hash));
+ // BLAKE3_OUT_LEN is 32 Bytes
+ int ret = calc_object_blake3(p_rec, (uint8_t*)p_rec->s.hash);
+ if (ret == 0) {
+ p_rec->s.flags.set_hash_calculated();
+ }
+
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table
+ // so all entries left are sources of dedup with multiple copies.
+ // Need to read attributes from the Head-Object and output them to a new SLAB
+ int Background::read_object_attribute(dedup_table_t *p_table,
+ disk_record_t *p_rec,
+ disk_block_id_t old_block_id,
+ record_id_t old_rec_id,
+ md5_shard_t md5_shard,
+ md5_stats_t *p_stats /* IN-OUT */,
+ disk_block_seq_t *p_disk,
+ remapper_t *remapper)
+ {
+ bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw_dedup, 20>();
+ if (unlikely(should_print_debug)) {
+ print_record(dpp, p_rec, old_block_id, old_rec_id, md5_shard);
+ }
+ p_stats->processed_objects ++;
+
+ uint32_t size_4k_units = byte_size_to_disk_blocks(p_rec->s.obj_bytes_size);
+ uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
+ storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
+ &p_stats->failed_map_overflow);
+ if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
+ // TBD: need stat counters
+ return -EOVERFLOW;
+ }
+ key_t key_from_bucket_index(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units,
+ p_rec->s.num_parts, sc_idx);
+ dedup_table_t::value_t src_val;
+ int ret = p_table->get_val(&key_from_bucket_index, &src_val);
+ if (ret != 0) {
+ if (ondisk_byte_size <= d_min_obj_size_for_dedup && p_rec->s.num_parts == 0) {
+ // record has no valid entry in table because it is a too small
+ // It was loaded to table for calculation and then purged
+ p_stats->skipped_purged_small++;
+ ldpp_dout(dpp, 20) << __func__ << "::skipped purged small obj::"
+ << p_rec->obj_name << "::" << ondisk_byte_size << dendl;
+ // help small object tests pass - avoid complication differentiating between
+ // small objects ( < 64KB, >= 64KB <= 4MB, > 4MB
+ p_stats->processed_objects--;
+ }
+ else {
+ // record has no valid entry in table because it is a singleton
+ p_stats->skipped_singleton++;
+ p_stats->skipped_singleton_bytes += ondisk_byte_size;
+ ldpp_dout(dpp, 20) << __func__ << "::skipped singleton::"
+ << p_rec->obj_name << std::dec << dendl;
+ }
+ return 0;
+ }
+
+ // Every object after this point was counted as a dedup potential
+ // If we conclude that it can't be dedup it should be accounted for
+ rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
+ unique_ptr<rgw::sal::Bucket> bucket;
+ ret = driver->load_bucket(dpp, b, &bucket, null_yield);
+ if (unlikely(ret != 0)) {
+ // could happen when the bucket is removed between passes
+ p_stats->ingress_failed_load_bucket++;
+ ldpp_dout(dpp, 15) << __func__ << "::Failed driver->load_bucket(): "
+ << cpp_strerror(-ret) << dendl;
+ return 0;
+ }
+
+ unique_ptr<rgw::sal::Object> p_obj = bucket->get_object(p_rec->obj_name);
+ if (unlikely(!p_obj)) {
+ // could happen when the object is removed between passes
+ p_stats->ingress_failed_get_object++;
+ ldpp_dout(dpp, 15) << __func__ << "::Failed bucket->get_object("
+ << p_rec->obj_name << ")" << dendl;
+ return 0;
+ }
+
+ ret = p_obj->get_obj_attrs(null_yield, dpp);
+ if (unlikely(ret < 0)) {
+ p_stats->ingress_failed_get_obj_attrs++;
+ ldpp_dout(dpp, 10) << __func__ << "::ERR: failed to stat object(" << p_rec->obj_name
+ << "), returned error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ const rgw::sal::Attrs& attrs = p_obj->get_attrs();
+ if (attrs.find(RGW_ATTR_CRYPT_MODE) != attrs.end()) {
+ p_stats->ingress_skip_encrypted++;
+ p_stats->ingress_skip_encrypted_bytes += ondisk_byte_size;
+ ldpp_dout(dpp, 20) <<__func__ << "::Skipping encrypted object "
+ << p_rec->obj_name << dendl;
+ return 0;
+ }
+
+ // TBD: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed
+ if (attrs.find(RGW_ATTR_COMPRESSION) != attrs.end()) {
+ p_stats->ingress_skip_compressed++;
+ p_stats->ingress_skip_compressed_bytes += ondisk_byte_size;
+ ldpp_dout(dpp, 20) <<__func__ << "::Skipping compressed object "
+ << p_rec->obj_name << dendl;
+ return 0;
+ }
+
+ // extract ETAG and Size and compare with values taken from the bucket-index
+ parsed_etag_t parsed_etag;
+ auto itr = attrs.find(RGW_ATTR_ETAG);
+ if (itr != attrs.end()) {
+ if (unlikely(!parse_etag_string(itr->second.to_str(), &parsed_etag))) {
+ p_stats->ingress_corrupted_etag++;
+ ldpp_dout(dpp, 10) << __func__ << "::ERROR: corrupted etag::" << p_rec->obj_name << dendl;
+ return -EINVAL;
+ }
+ }
+ else {
+ p_stats->ingress_corrupted_etag++;
+ ldpp_dout(dpp, 10) << __func__ << "::ERROR: no etag" << p_rec->obj_name << dendl;
+ return -EINVAL;
+ }
+
+ std::string storage_class;
+ itr = attrs.find(RGW_ATTR_STORAGE_CLASS);
+ if (itr != attrs.end()) {
+ storage_class = itr->second.to_str();
+ }
+ else {
+ storage_class = RGW_STORAGE_CLASS_STANDARD;
+ }
+ // no need to check for remap success as we compare keys bellow
+ sc_idx = remapper->remap(storage_class, dpp, &p_stats->failed_map_overflow);
+ key_t key_from_obj(parsed_etag.md5_high, parsed_etag.md5_low,
+ byte_size_to_disk_blocks(p_obj->get_size()),
+ parsed_etag.num_parts, sc_idx);
+ if (unlikely(key_from_obj != key_from_bucket_index ||
+ p_rec->s.obj_bytes_size != p_obj->get_size())) {
+ ldpp_dout(dpp, 15) <<__func__ << "::Skipping changed object "
+ << p_rec->obj_name << dendl;
+ p_stats->ingress_skip_changed_objs++;
+ return 0;
+ }
+
+ // reset flags
+ p_rec->s.flags.clear();
+ ret = add_obj_attrs_to_record(&b, p_rec, attrs, p_table, p_stats);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret="
+ << ret << "::" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ disk_block_seq_t::record_info_t rec_info;
+ ret = p_disk->add_record(d_dedup_cluster_ioctx, p_rec, &rec_info);
+ if (ret == 0) {
+ // set the disk_block_id_t to this unless the existing disk_block_id is marked as shared-manifest
+ ceph_assert(rec_info.rec_id < MAX_REC_IN_BLOCK);
+ ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/"
+ << p_rec->obj_name << " was written to block_idx="
+ << rec_info.block_id << "::rec_id=" << (int)rec_info.rec_id
+ << "::shared_manifest=" << p_rec->has_shared_manifest() << dendl;
+ p_table->update_entry(&key_from_bucket_index, rec_info.block_id,
+ rec_info.rec_id, p_rec->has_shared_manifest());
+ }
+ else {
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed p_disk->add_record()"<< dendl;
+ if (ret == -EINVAL) {
+ p_stats->ingress_corrupted_obj_attrs++;
+ }
+ }
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ static int write_blake3_object_attribute(const DoutPrefixProvider* const dpp,
+ rgw::sal::Driver* driver,
+ RGWRados* rados,
+ const disk_record_t *p_rec)
+ {
+ bufferlist etag_bl;
+ bufferlist hash_bl;
+ librados::ObjectWriteOperation op;
+ etag_to_bufferlist(p_rec->s.md5_high, p_rec->s.md5_low, p_rec->s.num_parts,
+ &etag_bl);
+ init_cmp_pairs(p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op);
+ op.setxattr(RGW_ATTR_BLAKE3, hash_bl);
+
+ std::string oid;
+ librados::IoCtx ioctx;
+ int ret = get_ioctx(dpp, driver, rados, p_rec, &ioctx, &oid);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: failed get_ioctx()" << dendl;
+ return ret;
+ }
+
+ ret = ioctx.operate(oid, &op);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate("
+ << oid << "), err is " << cpp_strerror(-ret) << dendl;
+ }
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table
+ // so all entries left are sources of dedup with multiple copies.
+ // If the record is marked as Shared-Manifest-Object -> skip it
+ // if the record's key doesn’t exist in table -> skip it (it is a singleton and it was purged)
+ // If the record block-index matches the hashtable entry -> skip it (it is the SRC object)
+ // All other entries are Dedicated-Manifest-Objects with a valid SRC object
+
+ // we can withstand most errors moving to the next object
+ // only report an error if we recived a stop scan request!
+ //
+ int Background::try_deduping_record(dedup_table_t *p_table,
+ const disk_record_t *p_tgt_rec,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ md5_shard_t md5_shard,
+ md5_stats_t *p_stats, /* IN-OUT */
+ remapper_t *remapper)
+ {
+ bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>();
+ if (unlikely(should_print_debug)) {
+ print_record(dpp, p_tgt_rec, block_id, rec_id, md5_shard);
+ }
+
+ uint32_t size_4k_units = byte_size_to_disk_blocks(p_tgt_rec->s.obj_bytes_size);
+ storage_class_idx_t sc_idx = remapper->remap(p_tgt_rec->stor_class, dpp,
+ &p_stats->failed_map_overflow);
+ ceph_assert(sc_idx != remapper_t::NULL_IDX);
+ key_t key(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, size_4k_units,
+ p_tgt_rec->s.num_parts, sc_idx);
+ dedup_table_t::value_t src_val;
+ int ret = p_table->get_val(&key, &src_val);
+ if (ret != 0) {
+ // record has no valid entry in table because it is a singleton
+ // should never happened since we purged all singletons before
+ ldpp_dout(dpp, 5) << __func__ << "::skipped singleton::" << p_tgt_rec->bucket_name
+ << "/" << p_tgt_rec->obj_name << "::num_parts=" << p_tgt_rec->s.num_parts
+ << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high
+ << p_tgt_rec->s.md5_low << std::dec << dendl;
+ ceph_abort("Unexpcted singleton");
+ return 0;
+ }
+
+ disk_block_id_t src_block_id = src_val.block_idx;
+ record_id_t src_rec_id = src_val.rec_id;
+ if (block_id == src_block_id && rec_id == src_rec_id) {
+ // the table entry point to this record which means it is a dedup source so nothing to do
+ p_stats->skipped_source_record++;
+ ldpp_dout(dpp, 20) << __func__ << "::skipped source-record" << dendl;
+ return 0;
+ }
+
+ // ceph store full blocks so need to round up and multiply by block_size
+ uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
+ uint64_t dedupable_objects_bytes = calc_deduped_bytes(d_head_object_size,
+ p_tgt_rec->s.num_parts,
+ ondisk_byte_size);
+ if (p_tgt_rec->s.flags.has_shared_manifest()) {
+ // record holds a shared_manifest object so can't be a dedup target
+ p_stats->skipped_shared_manifest++;
+ p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes;
+ ldpp_dout(dpp, 20) << __func__ << "::skipped shared_manifest" << dendl;
+ return 0;
+ }
+
+ // This records is a dedup target with source record on source_block_id
+ disk_record_t src_rec;
+ ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, &src_rec, src_block_id,
+ src_rec_id, md5_shard, dpp);
+ if (unlikely(ret != 0)) {
+ p_stats->failed_src_load++;
+ // we can withstand most errors moving to the next object
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed load_record("
+ << src_block_id << ", " << src_rec_id << ")" << dendl;
+ return 0;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << "::SRC=" << src_rec.bucket_name
+ << "/" << src_rec.obj_name << dendl;
+ // verify that SRC and TGT records don't refer to the same physical object
+ // This could happen in theory if we read the same objects twice
+ if (src_rec.obj_name == p_tgt_rec->obj_name && src_rec.bucket_name == p_tgt_rec->bucket_name) {
+ p_stats->duplicate_records++;
+ ldpp_dout(dpp, 10) << __func__ << "::WARN: Duplicate records for object="
+ << src_rec.obj_name << dendl;
+ return 0;
+ }
+
+ // the hash table size is rounded to the nearest 4KB and will wrap after 16G
+ if (unlikely(src_rec.s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) {
+ p_stats->size_mismatch++;
+ ldpp_dout(dpp, 10) << __func__ << "::WARN: different byte size for objects::"
+ << src_rec.obj_name << "::" << src_rec.s.obj_bytes_size
+ << "::" << p_tgt_rec->obj_name << "::"
+ << p_tgt_rec->s.obj_bytes_size << dendl;
+ return 0;
+ }
+
+ if (memcmp(src_rec.s.hash, p_tgt_rec->s.hash, sizeof(src_rec.s.hash)) != 0) {
+ p_stats->hash_mismatch++;
+ ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl;
+ // TBD: set hash attributes on head objects to save calc next time
+ if (src_rec.s.flags.hash_calculated()) {
+ write_blake3_object_attribute(dpp, driver, rados, &src_rec);
+ p_stats->set_hash_attrs++;
+ }
+ if (p_tgt_rec->s.flags.hash_calculated()) {
+ write_blake3_object_attribute(dpp, driver, rados, p_tgt_rec);
+ p_stats->set_hash_attrs++;
+ }
+ return 0;
+ }
+
+ ret = dedup_object(&src_rec, p_tgt_rec, p_stats, src_val.has_shared_manifest());
+ if (ret == 0) {
+ p_stats->deduped_objects++;
+ p_stats->deduped_objects_bytes += dedupable_objects_bytes;
+ if (p_tgt_rec->s.num_parts == 0) {
+ // single part objects duplicate the head object when dedup is used
+ p_stats->dup_head_bytes += d_head_object_size;
+ }
+
+ // mark the SRC object as a providor of a shared manifest
+ if (!src_val.has_shared_manifest()) {
+ p_stats->set_shared_manifest_src++;
+ // set the shared manifest flag in the dedup table
+ p_table->set_shared_manifest_src_mode(&key, src_block_id, src_rec_id);
+ }
+ else {
+ ldpp_dout(dpp, 20) << __func__ << "::SRC object already marked as shared_manifest" << dendl;
+ }
+ }
+ else {
+ ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed dedup for "
+ << src_rec.bucket_name << "/" << src_rec.obj_name << dendl;
+ p_stats->failed_dedup++;
+ }
+
+ return 0;
+ }
+
+#endif // #ifdef FULL_DEDUP_SUPPORT
+ //---------------------------------------------------------------------------
+ const char* Background::dedup_step_name(dedup_step_t step)
+ {
+ static const char* names[] = {"STEP_NONE",
+ "STEP_BUCKET_INDEX_INGRESS",
+ "STEP_BUILD_TABLE",
+ "STEP_READ_ATTRIBUTES",
+ "STEP_REMOVE_DUPLICATES"};
+ static const char* undefined_step = "UNDEFINED_STEP";
+ if (step >= STEP_NONE && step <= STEP_REMOVE_DUPLICATES) {
+ return names[step];
+ }
+ else {
+ return undefined_step;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::process_all_slabs(dedup_table_t *p_table,
+ dedup_step_t step,
+ md5_shard_t md5_shard,
+ work_shard_t worker_id,
+ uint32_t *p_slab_count,
+ md5_stats_t *p_stats, /* IN-OUT */
+ disk_block_seq_t *p_disk_block_seq,
+ remapper_t *remapper)
+ {
+ char block_buff[sizeof(disk_block_t)];
+ const int MAX_OBJ_LOAD_FAILURE = 3;
+ const int MAX_BAD_BLOCKS = 2;
+ bool has_more = true;
+ uint32_t seq_number = 0;
+ int failure_count = 0;
+ ldpp_dout(dpp, 20) << __func__ << "::" << dedup_step_name(step) << "::worker_id="
+ << worker_id << ", md5_shard=" << md5_shard << dendl;
+ *p_slab_count = 0;
+ while (has_more) {
+ bufferlist bl;
+ int ret = load_slab(d_dedup_cluster_ioctx, bl, md5_shard, worker_id, seq_number, dpp);
+ if (unlikely(ret < 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR::Failed loading object!! md5_shard=" << md5_shard
+ << ", worker_id=" << worker_id << ", seq_number=" << seq_number
+ << ", failure_count=" << failure_count << dendl;
+ // skip to the next SLAB stopping after 3 bad objects
+ if (failure_count++ < MAX_OBJ_LOAD_FAILURE) {
+ seq_number += DISK_BLOCK_COUNT;
+ continue;
+ }
+ else {
+ return ret;
+ }
+ }
+
+ (*p_slab_count)++;
+ failure_count = 0;
+ unsigned slab_rec_count = 0;
+ auto bl_itr = bl.cbegin();
+ for (uint32_t block_num = 0; block_num < DISK_BLOCK_COUNT; block_num++, seq_number++) {
+ disk_block_id_t disk_block_id(worker_id, seq_number);
+ const char *p = get_next_data_ptr(bl_itr, block_buff, sizeof(block_buff),
+ dpp);
+ disk_block_t *p_disk_block = (disk_block_t*)p;
+ disk_block_header_t *p_header = p_disk_block->get_header();
+ p_header->deserialize();
+ if (unlikely(p_header->verify(disk_block_id, dpp) != 0)) {
+ p_stats->failed_block_load++;
+ // move to next block until reaching a valid block
+ if (failure_count++ < MAX_BAD_BLOCKS) {
+ continue;
+ }
+ else {
+ ldpp_dout(dpp, 1) << __func__ << "::Skipping slab with too many bad blocks::"
+ << (int)md5_shard << ", worker_id=" << (int)worker_id
+ << ", seq_number=" << seq_number << dendl;
+ failure_count = 0;
+ break;
+ }
+ }
+
+ if (p_header->rec_count == 0) {
+ ldpp_dout(dpp, 20) << __func__ << "::Block #" << block_num
+ << " has an empty header, no more blocks" << dendl;
+ has_more = false;
+ break;
+ }
+
+ for (unsigned rec_id = 0; rec_id < p_header->rec_count; rec_id++) {
+ unsigned offset = p_header->rec_offsets[rec_id];
+ // We deserialize the record inside the CTOR
+ disk_record_t rec(p + offset);
+ ret = rec.validate(__func__, dpp, disk_block_id, rec_id);
+ if (unlikely(ret != 0)) {
+ p_stats->failed_rec_load++;
+ return ret;
+ }
+
+ if (step == STEP_BUILD_TABLE) {
+ add_record_to_dedup_table(p_table, &rec, disk_block_id, rec_id, p_stats, remapper);
+ slab_rec_count++;
+ }
+#ifdef FULL_DEDUP_SUPPORT
+ else if (step == STEP_READ_ATTRIBUTES) {
+ read_object_attribute(p_table, &rec, disk_block_id, rec_id, md5_shard,
+ p_stats, p_disk_block_seq, remapper);
+ slab_rec_count++;
+ }
+ else if (step == STEP_REMOVE_DUPLICATES) {
+ try_deduping_record(p_table, &rec, disk_block_id, rec_id, md5_shard,
+ p_stats, remapper);
+ slab_rec_count++;
+ }
+#endif // #ifdef FULL_DEDUP_SUPPORT
+ else {
+ ceph_abort("unexpected step");
+ }
+ }
+
+ check_and_update_md5_heartbeat(md5_shard, p_stats->loaded_objects,
+ p_stats->processed_objects);
+ if (unlikely(d_ctl.should_pause())) {
+ handle_pause_req(__func__);
+ }
+ if (unlikely(d_ctl.should_stop())) {
+ return -ECANCELED;
+ }
+
+ has_more = (p_header->offset == BLOCK_MAGIC);
+ ceph_assert(p_header->offset == BLOCK_MAGIC || p_header->offset == LAST_BLOCK_MAGIC);
+ if (!has_more) {
+ ldpp_dout(dpp, 20) << __func__ << "::No more blocks! block_id=" << disk_block_id
+ << ", rec_count=" << p_header->rec_count << dendl;
+ break;
+ }
+ }
+ ldpp_dout(dpp, 20) <<__func__ << "::slab seq_number=" << seq_number
+ << ", rec_count=" << slab_rec_count << dendl;
+ }
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ static void __attribute__ ((noinline))
+ show_ingress_bucket_idx_obj(const DoutPrefixProvider *dpp,
+ const parsed_etag_t &parsed_etag,
+ const string &bucket_name,
+ const string &obj_name)
+ {
+ ldpp_dout(dpp, 20) << __func__ << "::(1)::" << bucket_name << "/" << obj_name
+ << "::num_parts=" << parsed_etag.num_parts
+ << "::ETAG=" << std::hex << parsed_etag.md5_high
+ << parsed_etag.md5_low << std::dec << dendl;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::ingress_bucket_idx_single_object(disk_block_array_t &disk_arr,
+ const rgw::sal::Bucket *p_bucket,
+ const rgw_bucket_dir_entry &entry,
+ worker_stats_t *p_worker_stats /*IN-OUT*/)
+ {
+ // ceph store full blocks so need to round up and multiply by block_size
+ uint64_t ondisk_byte_size = calc_on_disk_byte_size(entry.meta.size);
+ // count all objects including too small and non default storage_class objs
+ p_worker_stats->ingress_obj++;
+ p_worker_stats->ingress_obj_bytes += ondisk_byte_size;
+
+ parsed_etag_t parsed_etag;
+ if (unlikely(!parse_etag_string(entry.meta.etag, &parsed_etag))) {
+ p_worker_stats->ingress_corrupted_etag++;
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: corrupted etag" << dendl;
+ return -EINVAL;
+ }
+
+ if (unlikely((cct->_conf->subsys.should_gather<ceph_subsys_rgw_dedup, 20>()))) {
+ show_ingress_bucket_idx_obj(dpp, parsed_etag, p_bucket->get_name(), entry.key.name);
+ }
+
+ // We limit dedup to objects from the same storage_class
+ // TBD:
+ // Should we use a skip-list of storage_classes we should skip (like glacier) ?
+ const std::string& storage_class =
+ rgw_placement_rule::get_canonical_storage_class(entry.meta.storage_class);
+ if (storage_class == RGW_STORAGE_CLASS_STANDARD) {
+ p_worker_stats->default_storage_class_objs++;
+ p_worker_stats->default_storage_class_objs_bytes += ondisk_byte_size;
+ }
+ else {
+ ldpp_dout(dpp, 20) << __func__ << "::" << entry.key.name
+ << "::storage_class:" << entry.meta.storage_class << dendl;
+ p_worker_stats->non_default_storage_class_objs++;
+ p_worker_stats->non_default_storage_class_objs_bytes += ondisk_byte_size;
+ }
+
+ if (ondisk_byte_size <= d_min_obj_size_for_dedup) {
+ if (parsed_etag.num_parts == 0) {
+ // dedup only useful for objects bigger than 4MB
+ p_worker_stats->ingress_skip_too_small++;
+ p_worker_stats->ingress_skip_too_small_bytes += ondisk_byte_size;
+
+ if (ondisk_byte_size >= 64*1024) {
+ p_worker_stats->ingress_skip_too_small_64KB++;
+ p_worker_stats->ingress_skip_too_small_64KB_bytes += ondisk_byte_size;
+ }
+ else {
+ return 0;
+ }
+ }
+ else {
+ // multipart objects are always good candidates for dedup
+ // the head object is empty and data is stored only in tail objs
+ p_worker_stats->small_multipart_obj++;
+ }
+ }
+ // multipart/single_part counters are for objects being fully processed
+ if (parsed_etag.num_parts > 0) {
+ p_worker_stats->multipart_objs++;
+ }
+ else {
+ p_worker_stats->single_part_objs++;
+ }
+
+ return add_disk_rec_from_bucket_idx(disk_arr, p_bucket, &parsed_etag,
+ entry.key.name, entry.meta.size,
+ storage_class);
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::check_and_update_heartbeat(unsigned shard_id, uint64_t count_a,
+ uint64_t count_b, const char *prefix)
+ {
+ utime_t now = ceph_clock_now();
+ utime_t time_elapsed = now - d_heart_beat_last_update;
+ if (unlikely(time_elapsed.tv.tv_sec >= d_heart_beat_max_elapsed_sec)) {
+ ldpp_dout(dpp, 20) << __func__ << "::max_elapsed_sec="
+ << d_heart_beat_max_elapsed_sec << dendl;
+ d_heart_beat_last_update = now;
+ d_cluster.update_shard_token_heartbeat(store, shard_id, count_a, count_b,
+ prefix);
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::check_and_update_worker_heartbeat(work_shard_t worker_id,
+ int64_t ingress_obj_count)
+ {
+ check_and_update_heartbeat(worker_id, ingress_obj_count, 0, WORKER_SHARD_PREFIX);
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::check_and_update_md5_heartbeat(md5_shard_t md5_id,
+ uint64_t load_count,
+ uint64_t dedup_count)
+ {
+ check_and_update_heartbeat(md5_id, load_count, dedup_count, MD5_SHARD_PREFIX);
+ }
+
+ //---------------------------------------------------------------------------
+ static uint32_t move_to_next_bucket_index_shard(const DoutPrefixProvider* dpp,
+ unsigned current_shard,
+ unsigned num_work_shards,
+ const std::string &bucket_name,
+ rgw_obj_index_key *p_marker /* OUT-PARAM */)
+ {
+ uint32_t next_shard = current_shard + num_work_shards;
+ ldpp_dout(dpp, 20) << __func__ << "::" << bucket_name << "::curr_shard="
+ << current_shard << ", next shard=" << next_shard << dendl;
+ *p_marker = rgw_obj_index_key(); // reset marker to an empty index
+ return next_shard;
+ }
+
+ // This function process bucket-index shards of a given @bucket
+ // The bucket-index-shards are stored in a group of @oids
+ // The @oids are using a simple map from the shard-id to the oid holding bucket-indices
+ // We start by processing all bucket-indices owned by this @worker-id
+ // Once we are done with a given bucket-index shard we skip to the next
+ // bucket-index-shard owned by this worker-id
+ // if (bucket_index_shard % work_id) == 0) -> read and process bucket_index_shard
+ // else -> skip bucket_index_shard and don't read it
+ //---------------------------------------------------------------------------
+ int Background::process_bucket_shards(disk_block_array_t &disk_arr,
+ const rgw::sal::Bucket *bucket,
+ std::map<int, string> &oids,
+ librados::IoCtx &ioctx,
+ work_shard_t worker_id,
+ work_shard_t num_work_shards,
+ worker_stats_t *p_worker_stats /*IN-OUT*/)
+ {
+ const uint32_t num_shards = oids.size();
+ uint32_t current_shard = worker_id;
+ rgw_obj_index_key marker; // start with an empty marker
+ const string null_prefix, null_delimiter;
+ const bool list_versions = true;
+ const int max_entries = 1000;
+ uint32_t obj_count = 0;
+
+ while (current_shard < num_shards ) {
+ check_and_update_worker_heartbeat(worker_id, p_worker_stats->ingress_obj);
+ if (unlikely(d_ctl.should_pause())) {
+ handle_pause_req(__func__);
+ }
+ if (unlikely(d_ctl.should_stop())) {
+ return -ECANCELED;
+ }
+
+ const string& oid = oids[current_shard];
+ rgw_cls_list_ret result;
+ librados::ObjectReadOperation op;
+ // get bucket-indices of @current_shard
+ cls_rgw_bucket_list_op(op, marker, null_prefix, null_delimiter, max_entries,
+ list_versions, &result);
+ int ret = rgw_rados_operate(dpp, ioctx, oid, std::move(op), nullptr, null_yield);
+ if (unlikely(ret < 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_rados_operate() ret="
+ << ret << "::" << cpp_strerror(-ret) << dendl;
+ current_shard = move_to_next_bucket_index_shard(dpp, current_shard, num_work_shards,
+ bucket->get_name(), &marker);
+ continue;
+ }
+ obj_count += result.dir.m.size();
+ for (auto& entry : result.dir.m) {
+ const rgw_bucket_dir_entry& dirent = entry.second;
+ if (unlikely((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty())) {
+ // TBD: should we bailout ???
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: calling check_disk_state bucket="
+ << bucket->get_name() << " entry=" << dirent.key << dendl;
+ // make sure we're advancing marker
+ marker = dirent.key;
+ continue;
+ }
+ marker = dirent.key;
+ ret = ingress_bucket_idx_single_object(disk_arr, bucket, dirent, p_worker_stats);
+ }
+ // TBD: advance marker only once here!
+ if (result.is_truncated) {
+ ldpp_dout(dpp, 15) << __func__ << "::[" << current_shard
+ << "]result.is_truncated::count=" << obj_count << dendl;
+ }
+ else {
+ // we reached the end of this shard -> move to the next shard
+ current_shard = move_to_next_bucket_index_shard(dpp, current_shard, num_work_shards,
+ bucket->get_name(), &marker);
+ ldpp_dout(dpp, 15) << __func__ << "::move_to_next_bucket_index_shard::count="
+ << obj_count << "::new_shard=" << current_shard << dendl;
+ }
+ }
+ ldpp_dout(dpp, 15) << __func__ << "::Finished processing Bucket "
+ << bucket->get_name() << ", num_shards=" << num_shards
+ << ", obj_count=" << obj_count << dendl;
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::ingress_bucket_objects_single_shard(disk_block_array_t &disk_arr,
+ const rgw_bucket &bucket_rec,
+ work_shard_t worker_id,
+ work_shard_t num_work_shards,
+ worker_stats_t *p_worker_stats /*IN-OUT*/)
+ {
+ unique_ptr<rgw::sal::Bucket> bucket;
+ int ret = driver->load_bucket(dpp, bucket_rec, &bucket, null_yield);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: driver->load_bucket(): "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ const std::string bucket_id = bucket->get_key().get_key();
+ RGWBucketInfo bucket_info;
+ ret = rados->get_bucket_instance_info(bucket_id, bucket_info,
+ nullptr, nullptr, null_yield, dpp);
+ if (unlikely(ret < 0)) {
+ if (ret == -ENOENT) {
+ // probably a race condition with bucket removal
+ ldpp_dout(dpp, 10) << __func__ << "::ret == -ENOENT" << dendl;
+ return 0;
+ }
+ ldpp_dout(dpp, 5) << __func__ << "::ERROR: get_bucket_instance_info(), ret="
+ << ret << "::" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ const rgw::bucket_index_layout_generation idx_layout = bucket_info.layout.current_index;
+ librados::IoCtx ioctx;
+ // objects holding the bucket-listings
+ std::map<int, std::string> oids;
+ ret = store->svc()->bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt,
+ idx_layout, &ioctx, &oids, nullptr);
+ if (ret >= 0) {
+ // process all the shards in this bucket owned by the worker_id
+ return process_bucket_shards(disk_arr, bucket.get(), oids, ioctx, worker_id,
+ num_work_shards, p_worker_stats);
+ }
+ else {
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: open_bucket_index() ret="
+ << ret << "::" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ static void display_table_stat_counters(const DoutPrefixProvider* dpp,
+ const md5_stats_t *p_stats)
+ {
+ uint64_t obj_count_in_shard = (p_stats->big_objs_stat.singleton_count +
+ p_stats->big_objs_stat.unique_count +
+ p_stats->big_objs_stat.duplicate_count);
+
+ ldpp_dout(dpp, 10) << "\n>>>>>" << __func__ << "::FINISHED STEP_BUILD_TABLE\n"
+ << "::total_count=" << obj_count_in_shard
+ << "::loaded_objects=" << p_stats->loaded_objects
+ << p_stats->big_objs_stat << dendl;
+ ldpp_dout(dpp, 10) << __func__ << "::small objs::"
+ << p_stats->small_objs_stat << dendl;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::objects_dedup_single_md5_shard(dedup_table_t *p_table,
+ md5_shard_t md5_shard,
+ md5_stats_t *p_stats,
+ work_shard_t num_work_shards)
+ {
+ remapper_t remapper(MAX_STORAGE_CLASS_IDX);
+ // make sure that the standard storage_class is always in the mapper!
+ storage_class_idx_t sc_idx = remapper.remap(RGW_STORAGE_CLASS_STANDARD, dpp,
+ &p_stats->failed_map_overflow);
+ ceph_assert(sc_idx == 0);
+ uint32_t slab_count_arr[num_work_shards];
+ // first load all etags to hashtable to find dedups
+ // the entries come from bucket-index and got minimal info (etag, size)
+ for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) {
+ process_all_slabs(p_table, STEP_BUILD_TABLE, md5_shard, worker_id,
+ slab_count_arr+worker_id, p_stats, nullptr, &remapper);
+ if (unlikely(d_ctl.should_stop())) {
+ ldpp_dout(dpp, 5) << __func__ << "::STEP_BUILD_TABLE::STOPPED\n" << dendl;
+ return -ECANCELED;
+ }
+ }
+ p_table->count_duplicates(&p_stats->small_objs_stat, &p_stats->big_objs_stat,
+ &p_stats->dup_head_bytes_estimate);
+ display_table_stat_counters(dpp, p_stats);
+
+ ldpp_dout(dpp, 10) << __func__ << "::MD5 Loop::" << d_ctl.dedup_type << dendl;
+ if (d_ctl.dedup_type != dedup_req_type_t::DEDUP_TYPE_FULL) {
+ for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) {
+ remove_slabs(worker_id, md5_shard, slab_count_arr[worker_id]);
+ }
+ return 0;
+ }
+
+#ifndef FULL_DEDUP_SUPPORT
+ // we don't support full dedup with this release
+ return 0;
+#endif
+
+ p_table->remove_singletons_and_redistribute_keys();
+ // The SLABs holds minimal data set brought from the bucket-index
+ // Objects participating in DEDUP need to read attributes from the Head-Object
+ // TBD - find a better name than num_work_shards for the combined output
+ {
+ disk_block_t arr[DISK_BLOCK_COUNT];
+ worker_stats_t wstat;
+ disk_block_seq_t disk_block_seq(dpp, arr, num_work_shards, md5_shard, &wstat);
+ for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) {
+ process_all_slabs(p_table, STEP_READ_ATTRIBUTES, md5_shard, worker_id,
+ slab_count_arr+worker_id, p_stats, &disk_block_seq, &remapper);
+ if (unlikely(d_ctl.should_stop())) {
+ ldpp_dout(dpp, 5) << __func__ << "::STEP_READ_ATTRIBUTES::STOPPED\n" << dendl;
+ return -ECANCELED;
+ }
+ // we finished processing output SLAB from @worker_id -> remove them
+ remove_slabs(worker_id, md5_shard, slab_count_arr[worker_id]);
+ }
+ disk_block_seq.flush_disk_records(d_dedup_cluster_ioctx);
+ }
+
+ ldpp_dout(dpp, 10) << __func__ << "::STEP_REMOVE_DUPLICATES::started..." << dendl;
+ uint32_t slab_count = 0;
+ process_all_slabs(p_table, STEP_REMOVE_DUPLICATES, md5_shard, num_work_shards,
+ &slab_count, p_stats, nullptr, &remapper);
+ if (unlikely(d_ctl.should_stop())) {
+ ldpp_dout(dpp, 5) << __func__ << "::STEP_REMOVE_DUPLICATES::STOPPED\n" << dendl;
+ return -ECANCELED;
+ }
+ ldpp_dout(dpp, 10) << __func__ << "::STEP_REMOVE_DUPLICATES::finished..." << dendl;
+ // remove the special SLAB holding aggragted data
+ remove_slabs(num_work_shards, md5_shard, slab_count);
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::read_bucket_stats(const rgw_bucket &bucket_rec,
+ uint64_t *p_num_obj,
+ uint64_t *p_size)
+ {
+ unique_ptr<rgw::sal::Bucket> bucket;
+ int ret = driver->load_bucket(dpp, bucket_rec, &bucket, null_yield);
+ if (unlikely(ret != 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: driver->load_bucket(): "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ const auto& index = bucket->get_info().get_current_index();
+ if (is_layout_indexless(index)) {
+ ldpp_dout(dpp, 1) << __func__
+ << "::ERR, indexless buckets do not maintain stats; bucket="
+ << bucket->get_name() << dendl;
+ return -EINVAL;
+ }
+
+ std::map<RGWObjCategory, RGWStorageStats> stats;
+ std::string bucket_ver, master_ver;
+ std::string max_marker;
+ ret = bucket->read_stats(dpp, null_yield, index, RGW_NO_SHARD, &bucket_ver,
+ &master_ver, stats, &max_marker);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR getting bucket stats bucket="
+ << bucket->get_name() << " ret=" << ret << dendl;
+ return ret;
+ }
+
+ for (auto itr = stats.begin(); itr != stats.end(); ++itr) {
+ RGWStorageStats& s = itr->second;
+ ldpp_dout(dpp, 20) << __func__ << "::" << bucket->get_name() << "::"
+ << to_string(itr->first) << "::num_obj=" << s.num_objects
+ << "::size=" << s.size << dendl;
+ *p_num_obj += s.num_objects;
+ *p_size += s.size;
+ }
+
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::collect_all_buckets_stats()
+ {
+ int ret = 0;
+ std::string section("bucket.instance");
+ std::string marker;
+ void *handle = nullptr;
+ ret = driver->meta_list_keys_init(dpp, section, marker, &handle);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed meta_list_keys_init: "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ d_all_buckets_obj_count = 0;
+ d_all_buckets_obj_size = 0;
+
+ bool has_more = true;
+ while (has_more) {
+ std::list<std::string> entries;
+ constexpr int max_keys = 1000;
+ ret = driver->meta_list_keys_next(dpp, handle, max_keys, entries, &has_more);
+ if (ret == 0) {
+ for (auto& entry : entries) {
+ ldpp_dout(dpp, 20) <<__func__ << "::bucket_name=" << entry << dendl;
+ rgw_bucket bucket;
+ ret = rgw_bucket_parse_bucket_key(cct, entry, &bucket, nullptr);
+ if (unlikely(ret < 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed rgw_bucket_parse_bucket_key: "
+ << cpp_strerror(-ret) << dendl;
+ goto err;
+ }
+ ldpp_dout(dpp, 20) <<__func__ << "::bucket=" << bucket << dendl;
+ ret = read_bucket_stats(bucket, &d_all_buckets_obj_count,
+ &d_all_buckets_obj_size);
+ if (unlikely(ret != 0)) {
+ goto err;
+ }
+ }
+ driver->meta_list_keys_complete(handle);
+ }
+ else {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed driver->meta_list_keys_next()" << dendl;
+ goto err;
+ }
+ }
+ ldpp_dout(dpp, 10) <<__func__
+ << "::all_buckets_obj_count=" << d_all_buckets_obj_count
+ << "::all_buckets_obj_size=" << d_all_buckets_obj_size
+ << dendl;
+ return 0;
+
+ err:
+ ldpp_dout(dpp, 1) << __func__ << "::error handler" << dendl;
+ // reset counters to mark that we don't have the info
+ d_all_buckets_obj_count = 0;
+ d_all_buckets_obj_size = 0;
+ if (handle) {
+ driver->meta_list_keys_complete(handle);
+ }
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::objects_ingress_single_work_shard(work_shard_t worker_id,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards,
+ worker_stats_t *p_worker_stats,
+ uint8_t *raw_mem,
+ uint64_t raw_mem_size)
+ {
+ int ret = 0;
+ std::string section("bucket.instance");
+ std::string marker;
+ void *handle = nullptr;
+ ret = driver->meta_list_keys_init(dpp, section, marker, &handle);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed meta_list_keys_init: "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ disk_block_array_t disk_arr(dpp, raw_mem, raw_mem_size, worker_id,
+ p_worker_stats, num_md5_shards);
+ bool has_more = true;
+ // iterate over all buckets
+ while (ret == 0 && has_more) {
+ std::list<std::string> entries;
+ constexpr int max_keys = 1000;
+ ret = driver->meta_list_keys_next(dpp, handle, max_keys, entries, &has_more);
+ if (ret == 0) {
+ ldpp_dout(dpp, 20) <<__func__ << "::entries.size()=" << entries.size() << dendl;
+ for (auto& entry : entries) {
+ ldpp_dout(dpp, 20) <<__func__ << "::bucket_name=" << entry << dendl;
+ rgw_bucket bucket;
+ ret = rgw_bucket_parse_bucket_key(cct, entry, &bucket, nullptr);
+ if (unlikely(ret < 0)) {
+ // bad bucket entry, skip to the next one
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed rgw_bucket_parse_bucket_key: "
+ << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+ ldpp_dout(dpp, 20) <<__func__ << "::bucket=" << bucket << dendl;
+ ret = ingress_bucket_objects_single_shard(disk_arr, bucket, worker_id,
+ num_work_shards, p_worker_stats);
+ if (unlikely(ret != 0)) {
+ if (d_ctl.should_stop()) {
+ driver->meta_list_keys_complete(handle);
+ return -ECANCELED;
+ }
+ ldpp_dout(dpp, 1) << __func__ << "::Failed ingress_bucket_objects_single_shard()" << dendl;
+ // skip bad bucket and move on to the next one
+ continue;
+ }
+ }
+ driver->meta_list_keys_complete(handle);
+ }
+ else {
+ ldpp_dout(dpp, 1) << __func__ << "::failed driver->meta_list_keys_next()" << dendl;
+ driver->meta_list_keys_complete(handle);
+ // TBD: what can we do here?
+ break;
+ }
+ }
+ ldpp_dout(dpp, 20) <<__func__ << "::flush_output_buffers() worker_id="
+ << worker_id << dendl;
+ disk_arr.flush_output_buffers(dpp, d_dedup_cluster_ioctx);
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count)
+ {
+ unsigned failure_count = 0;
+
+ for (uint32_t slab_id = 0; slab_id < slab_count; slab_id++) {
+ uint32_t seq_number = disk_block_id_t::slab_id_to_seq_num(slab_id);
+ disk_block_id_t block_id(worker_id, seq_number);
+ std::string oid(block_id.get_slab_name(md5_shard));
+ ldpp_dout(dpp, 20) << __func__ << "::calling ioctx->remove(" << oid << ")" << dendl;
+ int ret = d_dedup_cluster_ioctx.remove(oid);
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << __func__ << "::ERR Failed ioctx->remove(" << oid << ")" << dendl;
+ failure_count++;
+ }
+ }
+
+ return failure_count;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::f_ingress_work_shard(unsigned worker_id,
+ uint8_t *raw_mem,
+ uint64_t raw_mem_size,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards)
+ {
+ ldpp_dout(dpp, 20) << __func__ << "::worker_id=" << worker_id << dendl;
+ utime_t start_time = ceph_clock_now();
+ worker_stats_t worker_stats;
+ int ret = objects_ingress_single_work_shard(worker_id, num_work_shards, num_md5_shards,
+ &worker_stats,raw_mem, raw_mem_size);
+ if (ret == 0) {
+ worker_stats.duration = ceph_clock_now() - start_time;
+ d_cluster.mark_work_shard_token_completed(store, worker_id, &worker_stats);
+ ldpp_dout(dpp, 10) << "stat counters [worker]:\n" << worker_stats << dendl;
+ ldpp_dout(dpp, 10) << "Shard Process Duration = "
+ << worker_stats.duration << dendl;
+ }
+ //ldpp_dout(dpp, 0) << __func__ << "::sleep for 2 seconds\n" << dendl;
+ //std::this_thread::sleep_for(std::chrono::seconds(2));
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::f_dedup_md5_shard(unsigned md5_shard,
+ uint8_t *raw_mem,
+ uint64_t raw_mem_size,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards)
+ {
+ utime_t start_time = ceph_clock_now();
+ md5_stats_t md5_stats;
+ //DEDUP_DYN_ALLOC
+ dedup_table_t table(dpp, d_head_object_size, raw_mem, raw_mem_size);
+ int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards);
+ if (ret == 0) {
+ md5_stats.duration = ceph_clock_now() - start_time;
+ d_cluster.mark_md5_shard_token_completed(store, md5_shard, &md5_stats);
+ ldpp_dout(dpp, 10) << "stat counters [md5]:\n" << md5_stats << dendl;
+ ldpp_dout(dpp, 10) << "Shard Process Duration = "
+ << md5_stats.duration << dendl;
+ }
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::process_all_shards(bool ingress_work_shards,
+ int (Background::*func)(unsigned, uint8_t*, uint64_t, work_shard_t, md5_shard_t),
+ uint8_t *raw_mem,
+ uint64_t raw_mem_size,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards)
+ {
+ while (true) {
+ d_heart_beat_last_update = ceph_clock_now();
+ uint16_t shard_id;
+ if (ingress_work_shards) {
+ shard_id = d_cluster.get_next_work_shard_token(store, num_work_shards);
+ }
+ else {
+ shard_id = d_cluster.get_next_md5_shard_token(store, num_md5_shards);
+ }
+
+ // start with a common error handler
+ if (shard_id != NULL_SHARD) {
+ ldpp_dout(dpp, 10) << __func__ << "::Got shard_id=" << shard_id << dendl;
+ int ret = (this->*func)(shard_id, raw_mem, raw_mem_size, num_work_shards,
+ num_md5_shards);
+ if (unlikely(ret != 0)) {
+ if (d_ctl.should_stop()) {
+ ldpp_dout(dpp, 5) << __func__ << "::stop execution" << dendl;
+ return -ECANCELED;
+ }
+ else {
+ ldpp_dout(dpp, 5) << __func__ << "::Skip shard #" << shard_id << dendl;
+ }
+ }
+ }
+ else {
+ ldpp_dout(dpp, 10) << __func__ << "::finished processing all shards" <<dendl;
+ break;
+ }
+ } // while loop
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ [[maybe_unused]]static int collect_pool_stats(const DoutPrefixProvider* const dpp,
+ RGWRados* rados,
+ uint64_t *p_num_objects,
+ uint64_t *p_num_objects_bytes)
+ {
+ *p_num_objects = 0;
+ *p_num_objects_bytes = 0;
+ list<string> vec;
+ vec.push_back("default.rgw.buckets.data");
+ map<string,librados::pool_stat_t> stats;
+ auto rados_handle = rados->get_rados_handle();
+ int ret = rados_handle->get_pool_stats(vec, stats);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ << ":ERROR: fetching pool stats: "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ for (auto i = stats.begin(); i != stats.end(); ++i) {
+ const char *pool_name = i->first.c_str();
+ librados::pool_stat_t& s = i->second;
+ // TBD: add support for EC
+ // We need to find the user byte size without the added protection
+ double replica_level = (double)s.num_object_copies / s.num_objects;
+ *p_num_objects = s.num_objects;
+ *p_num_objects_bytes = s.num_bytes / replica_level;
+ ldpp_dout(dpp, 10) <<__func__ << "::" << pool_name << "::num_objects="
+ << s.num_objects << "::num_copies=" << s.num_object_copies
+ << "::num_bytes=" << s.num_bytes << "/" << *p_num_objects_bytes << dendl;
+ }
+ return 0;
+ }
+
+ //-------------------------------------------------------------------------------
+ // 32B per object-entry in the hashtable
+ // 2MB per shard-buffer
+ //=============||==============||=========||===================================||
+ // Obj Count || shard count || memory || calculation ||
+ // ------------||--------------||---------||---------------------------------- ||
+ // 1M || 4 || 8MB || 8MB/32 = 0.25M * 4 = 1M ||
+ // 4M || 8 || 16MB || 16MB/32 = 0.50M * 8 = 4M ||
+ //-------------------------------------------------------------------------------
+ // 16M || 16 || 32MB || 32MB/32 = 1.00M * 16 = 16M ||
+ //-------------------------------------------------------------------------------
+ // 64M || 32 || 64MB || 64MB/32 = 2.00M * 32 = 64M ||
+ // 256M || 64 || 128MB || 128MB/32 = 4.00M * 64 = 256M ||
+ // 1024M( 1G) || 128 || 256MB || 256MB/32 = 8.00M * 128 = 1024M ||
+ // 4096M( 4G) || 256 || 512MB || 512MB/32 = 16M.00 * 256 = 4096M ||
+ // 16384M(16G) || 512 || 1024MB || 1024MB/32 = 32M.00 * 512 = 16384M ||
+ //-------------||--------------||---------||-----------------------------------||
+ static md5_shard_t calc_num_md5_shards(uint64_t obj_count)
+ {
+ // create headroom by allocating space for a 10% bigger system
+ obj_count = obj_count + (obj_count/10);
+
+ uint64_t M = 1024 * 1024;
+ if (obj_count < 1*M) {
+ // less than 1M objects -> use 4 shards (8MB)
+ return 4;
+ }
+ else if (obj_count < 4*M) {
+ // less than 4M objects -> use 8 shards (16MB)
+ return 8;
+ }
+ else if (obj_count < 16*M) {
+ // less than 16M objects -> use 16 shards (32MB)
+ return 16;
+ }
+ else if (obj_count < 64*M) {
+ // less than 64M objects -> use 32 shards (64MB)
+ return 32;
+ }
+ else if (obj_count < 256*M) {
+ // less than 256M objects -> use 64 shards (128MB)
+ return 64;
+ }
+ else if (obj_count < 1024*M) {
+ // less than 1024M objects -> use 128 shards (256MB)
+ return 128;
+ }
+ else if (obj_count < 4*1024*M) {
+ // less than 4096M objects -> use 256 shards (512MB)
+ return 256;
+ }
+ else {
+ return 512;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::setup(dedup_epoch_t *p_epoch)
+ {
+ int ret = collect_all_buckets_stats();
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ md5_shard_t num_md5_shards = calc_num_md5_shards(d_all_buckets_obj_count);
+ num_md5_shards = std::min(num_md5_shards, MAX_MD5_SHARD);
+ num_md5_shards = std::max(num_md5_shards, MIN_MD5_SHARD);
+ work_shard_t num_work_shards = num_md5_shards;
+ num_work_shards = std::min(num_work_shards, MAX_WORK_SHARD);
+
+ ldpp_dout(dpp, 5) << __func__ << "::obj_count=" <<d_all_buckets_obj_count
+ << "::num_md5_shards=" << num_md5_shards
+ << "::num_work_shards=" << num_work_shards << dendl;
+ // init handles and create the dedup_pool
+ ret = init_rados_access_handles(true);
+ if (ret != 0) {
+ derr << "dedup_bg::resume() failed init_rados_access_handles() ret="
+ << ret << "::" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
+
+ ret = d_cluster.reset(store, p_epoch, num_work_shards, num_md5_shards);
+ if (ret != 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed cluster.init()" << dendl;
+ return ret;
+ }
+
+ if (unlikely(p_epoch->num_work_shards > MAX_WORK_SHARD)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: p_epoch->num_work_shards="
+ << p_epoch->num_work_shards
+ << " is larger than MAX_WORK_SHARD ("
+ << MAX_WORK_SHARD << ")" << dendl;
+ return -EOVERFLOW;
+ }
+ if (unlikely(p_epoch->num_md5_shards > MAX_MD5_SHARD)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: p_epoch->num_md5_shards="
+ << p_epoch->num_md5_shards
+ << " is larger than MAX_MD5_SHARD ("
+ << MAX_MD5_SHARD << ")" << dendl;
+ return -EOVERFLOW;
+ }
+
+ ldpp_dout(dpp, 10) <<__func__ << "::" << *p_epoch << dendl;
+ d_ctl.dedup_type = p_epoch->dedup_type;
+#ifdef FULL_DEDUP_SUPPORT
+ ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL ||
+ d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
+#else
+ ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
+#endif
+ ldpp_dout(dpp, 10) << __func__ << "::" << d_ctl.dedup_type << dendl;
+
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::watch_reload(const DoutPrefixProvider* dpp)
+ {
+ return cluster::watch_reload(store, dpp, &d_watch_handle, &d_watcher_ctx);
+ }
+
+ //---------------------------------------------------------------------------
+ int Background::unwatch_reload(const DoutPrefixProvider* dpp)
+ {
+ if (d_watch_handle == 0) {
+ // nothing to unwatch
+ ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload(): nothing to watch"
+ << dendl;
+ return 0;
+ }
+
+ ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload(): watch_handle="
+ << d_watch_handle << dendl;
+
+ int ret = cluster::unwatch_reload(store, dpp, d_watch_handle);
+ if (ret == 0) {
+ ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload():Stopped watching "
+ << "::d_watch_handle=" << d_watch_handle << dendl;
+ d_watch_handle = 0;
+ }
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::handle_notify(uint64_t notify_id, uint64_t cookie, bufferlist &bl)
+ {
+ int ret = 0;
+ int32_t urgent_msg = URGENT_MSG_NONE;
+ try {
+ auto bl_iter = bl.cbegin();
+ ceph::decode(urgent_msg, bl_iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad urgent_msg" << dendl;
+ ret = -EINVAL;
+ }
+ ldpp_dout(dpp, 5) << __func__ << "::-->" << get_urgent_msg_names(urgent_msg) << dendl;
+
+ // use lock to prevent concurrent pause/resume requests
+ std::unique_lock cond_lock(d_cond_mutex); // [------>open lock block
+ if (unlikely(d_ctl.local_urgent_req())) {
+ // can't operate when the system is paused/shutdown
+ cond_lock.unlock(); // close lock block------>]
+ ldpp_dout(dpp, 5) << __func__
+ << "::system is paused/shutdown -> cancel notification" << dendl;
+ cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, -EBUSY);
+ return;
+ }
+
+ switch(urgent_msg) {
+ case URGENT_MSG_ABORT:
+ if (d_ctl.dedup_exec) {
+ d_ctl.remote_abort_req = true;
+ d_cond.notify_all();
+ d_cond.wait(cond_lock, [this]{return d_ctl.remote_aborted || d_ctl.local_urgent_req();});
+ d_ctl.remote_aborted ? ret = 0 : ret = -EBUSY;
+ }
+ else {
+ ldpp_dout(dpp, 5) << __func__ << "::inactive dedup->nothing to do" << dendl;
+ }
+ break;
+ case URGENT_MSG_RESTART:
+ if (!d_ctl.dedup_exec) {
+ d_ctl.remote_restart_req = true;
+ d_cond.notify_all();
+ }
+ else {
+ ldpp_dout(dpp, 5) << __func__ << "::\ncan't restart active dedup\n"<< dendl;
+ ret = -EEXIST;
+ }
+ break;
+ case URGENT_MSG_PASUE:
+ if (d_ctl.dedup_exec && !d_ctl.remote_paused) {
+ d_ctl.remote_pause_req = true;
+ d_cond.notify_all();
+ d_cond.wait(cond_lock, [this]{return d_ctl.remote_paused || d_ctl.local_urgent_req();});
+ d_ctl.remote_paused ? ret = 0 : ret = -EBUSY;
+ }
+ else {
+ if (d_ctl.remote_paused) {
+ ldpp_dout(dpp, 5) << __func__ << "::dedup is already paused" << dendl;
+ }
+ else {
+ ldpp_dout(dpp, 5) << __func__ << "::inactive dedup->nothing to do" << dendl;
+ }
+ }
+ break;
+ case URGENT_MSG_RESUME:
+ if (d_ctl.remote_pause_req || d_ctl.remote_paused) {
+ d_ctl.remote_pause_req = false;
+ d_ctl.remote_paused = false;
+ d_cond.notify_all();
+ }
+ else {
+ ldpp_dout(dpp, 5) << __func__ << "::dedup is not paused->nothing to do" << dendl;
+ }
+ break;
+ default:
+ ldpp_dout(dpp, 1) << __func__ << "::unexpected urgent_msg: "
+ << get_urgent_msg_names(urgent_msg) << dendl;
+ ret = -EINVAL;
+ }
+
+ cond_lock.unlock(); // close lock block------>]
+ cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, ret);
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::start()
+ {
+ const DoutPrefixProvider* const dpp = &dp;
+ ldpp_dout(dpp, 10) << __FILE__ << "::" <<__func__ << dendl;
+ {
+ std::unique_lock pause_lock(d_cond_mutex);
+ if (d_ctl.started) {
+ // start the thread only once
+ ldpp_dout(dpp, 1) << "dedup_bg already started" << dendl;
+ return;
+ }
+ d_ctl.started = true;
+ }
+ d_runner = std::thread(&Background::run, this);
+ }
+
+ //------------------------- --------------------------------------------------
+ void Background::shutdown()
+ {
+ ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg shutdown()" << dendl;
+ std::unique_lock cond_lock(d_cond_mutex);
+ bool nested_call = false;
+ if (d_ctl.shutdown_req) {
+ // should never happen!
+ ldpp_dout(dpp, 1) <<__func__ << "dedup_bg nested call" << dendl;
+ nested_call = true;
+ }
+ d_ctl.shutdown_req = true;
+ d_cond.notify_all();
+ ldpp_dout(dpp, 1) <<__func__ << "dedup_bg shutdown waiting..." << dendl;
+ d_cond.wait(cond_lock, [this]{return d_ctl.shutdown_done;});
+ //cond_lock.unlock();
+
+ if (nested_call) {
+ ldpp_dout(dpp, 1) <<__func__ << "::nested call:: repeat notify" << dendl;
+ d_cond.notify_all();
+ }
+
+ if (d_runner.joinable()) {
+ ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg wait join()" << dendl;
+ d_runner.join();
+ ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg finished join()" << dendl;
+ }
+ else {
+ ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg not joinable()" << dendl;
+ }
+
+ d_ctl.reset();
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::pause()
+ {
+ display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->pause() request");
+ std::unique_lock cond_lock(d_cond_mutex);
+
+ if (d_ctl.local_paused || d_ctl.shutdown_done) {
+ cond_lock.unlock();
+ ldpp_dout(dpp, 1) << __FILE__ << "::" <<__func__
+ << "::dedup_bg is already paused/stopped" << dendl;
+ return;
+ }
+
+ bool nested_call = false;
+ if (d_ctl.local_pause_req) {
+ // should never happen!
+ ldpp_dout(dpp, 1) <<__func__ << "::nested call" << dendl;
+ nested_call = true;
+ }
+ d_ctl.local_pause_req = true;
+ d_cond.notify_all();
+ d_cond.wait(cond_lock, [this]{return d_ctl.local_paused||d_ctl.shutdown_done;});
+ if (nested_call) {
+ ldpp_dout(dpp, 1) << "dedup_bg::nested call:: repeat notify" << dendl;
+ d_cond.notify_all();
+ }
+
+ // destory open watch request and pool handle before pause() is completed
+ unwatch_reload(dpp);
+ d_dedup_cluster_ioctx.close();
+ ldpp_dout(dpp, 5) << "dedup_bg paused" << dendl;
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::resume(rgw::sal::Driver* _driver)
+ {
+ ldpp_dout(dpp, 5) << "dedup_bg->resume()" << dendl;
+ // use lock to prevent concurrent pause/resume requests
+ std::unique_lock cond_lock(d_cond_mutex);
+
+ if (!d_ctl.local_paused) {
+ cond_lock.unlock();
+ ldpp_dout(dpp, 5) << "dedup_bg::resume thread is not paused!" << dendl;
+ if (_driver != driver) {
+ ldpp_dout(dpp, 1) << "dedup_bg attempt to change driver on an active system was refused" << dendl;
+ }
+ return;
+ }
+
+ driver = _driver;
+ // can pool change its uid between pause/resume ???
+ int ret = init_rados_access_handles(false);
+ if (ret != 0) {
+ derr << "dedup_bg::resume() failed init_rados_access_handles() ret="
+ << ret << "::" << cpp_strerror(-ret) << dendl;
+ throw std::runtime_error("Failed init_rados_access_handles()");
+ }
+ display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->resume() done");
+ // create new watch request using the new pool handle
+ watch_reload(dpp);
+ d_ctl.local_pause_req = false;
+ d_ctl.local_paused = false;
+
+ // wake up threads blocked after seeing pause state
+ d_cond.notify_all();
+ ldpp_dout(dpp, 5) << "dedup_bg was resumed" << dendl;
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::handle_pause_req(const char *caller)
+ {
+ ldpp_dout(dpp, 5) << __func__ << "::caller=" << caller << dendl;
+ ldpp_dout(dpp, 5) << __func__ << "::" << d_ctl << dendl;
+ while (d_ctl.local_pause_req || d_ctl.local_paused || d_ctl.remote_pause_req || d_ctl.remote_paused) {
+ std::unique_lock cond_lock(d_cond_mutex);
+ if (d_ctl.should_stop()) {
+ ldpp_dout(dpp, 5) << __func__ << "::should_stop!" << dendl;
+ return;
+ }
+
+ if (d_ctl.local_pause_req) {
+ d_ctl.local_pause_req = false;
+ d_ctl.local_paused = true;
+ }
+
+ if (d_ctl.remote_pause_req) {
+ d_ctl.remote_pause_req = false;
+ d_ctl.remote_paused = true;
+ }
+
+ d_cond.notify_all();
+
+ if (d_ctl.local_paused) {
+ ldpp_dout(dpp, 10) << __func__ << "::wait on d_ctl.local_paused" << dendl;
+ d_cond.wait(cond_lock, [this]{return !d_ctl.local_paused || d_ctl.should_stop() ;});
+ }
+
+ if (d_ctl.remote_paused) {
+ ldpp_dout(dpp, 10) << __func__ << "::wait on d_ctl.remote_paused" << dendl;
+ d_cond.wait(cond_lock, [this]{return !d_ctl.remote_paused || d_ctl.should_stop() || d_ctl.local_pause_req;});
+ }
+ } // while loop
+
+ ldpp_dout(dpp, 5) << "Dedup background thread resumed!" << dendl;
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::work_shards_barrier(work_shard_t num_work_shards)
+ {
+ // Wait for other worker to finish ingress step
+ // We can move to the next step even if some token are in failed state
+ const unsigned MAX_WAIT_SEC = 120; // wait 2 minutes for failing members
+ unsigned ttl = 3;
+ unsigned time_elapsed = 0;
+
+ while (true) {
+ int ret = d_cluster.all_work_shard_tokens_completed(store, num_work_shards);
+ // we start incrementing time_elapsed only after all valid tokens finish
+ if (ret == 0 || (time_elapsed > MAX_WAIT_SEC) ) {
+ break;
+ }
+
+ ldpp_dout(dpp, 10) << __func__ << "::Wait for object ingress completion, ttl="
+ << ttl << " seconds" << dendl;
+ std::unique_lock cond_lock(d_cond_mutex);
+ d_cond.wait_for(cond_lock, std::chrono::seconds(ttl),
+ [this]{return d_ctl.should_stop() || d_ctl.should_pause();});
+ if (unlikely(d_ctl.should_pause())) {
+ handle_pause_req(__func__);
+ }
+ if (unlikely(d_ctl.should_stop())) {
+ return;
+ }
+
+ if (ret != -EAGAIN) {
+ // All incomplete tokens are corrupted or in time out state
+ // Give them an extra 120 seconds just in case ...
+ time_elapsed += ttl;
+ }
+ // else there are still good tokens in process, wait for them
+ }
+
+ ldpp_dout(dpp, 10) << "\n\n==Object Ingress step was completed on all shards==\n"
+ << dendl;
+ if (unlikely(d_ctl.should_pause())) {
+ handle_pause_req(__func__);
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ static bool all_md5_shards_completed(cluster *p_cluster,
+ rgw::sal::RadosStore *store,
+ md5_shard_t num_md5_shards)
+ {
+ return (p_cluster->all_md5_shard_tokens_completed(store, num_md5_shards) == 0);
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::md5_shards_barrier(md5_shard_t num_md5_shards)
+ {
+ // Wait for others to finish step
+ unsigned ttl = 3;
+ // require that everything completed successfully before deleting the pool
+ while (!all_md5_shards_completed(&d_cluster, store, num_md5_shards)) {
+ ldpp_dout(dpp, 10) << __func__ << "::Wait for md5 completion, ttl="
+ << ttl << " seconds" << dendl;
+ std::unique_lock cond_lock(d_cond_mutex);
+ d_cond.wait_for(cond_lock, std::chrono::seconds(ttl),
+ [this]{return d_ctl.should_stop() || d_ctl.should_pause();});
+ if (unlikely(d_ctl.should_pause())) {
+ handle_pause_req(__func__);
+ }
+ if (unlikely(d_ctl.should_stop())) {
+ return;
+ }
+ }
+
+ ldpp_dout(dpp, 10) << "\n\n==MD5 processing was completed on all shards!==\n"
+ << dendl;
+ if (unlikely(d_ctl.should_pause())) {
+ handle_pause_req(__func__);
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ void Background::run()
+ {
+ const auto rc = ceph_pthread_setname("dedup_bg");
+ ldpp_dout(dpp, 5) << __func__ << "ceph_pthread_setname() ret=" << rc << dendl;
+
+ // 256x8KB=2MB
+ const uint64_t PER_SHARD_BUFFER_SIZE = DISK_BLOCK_COUNT *sizeof(disk_block_t);
+ ldpp_dout(dpp, 20) <<__func__ << "::dedup::main loop" << dendl;
+
+ while (!d_ctl.shutdown_req) {
+ if (unlikely(d_ctl.should_pause())) {
+ handle_pause_req(__func__);
+ if (unlikely(d_ctl.should_stop())) {
+ ldpp_dout(dpp, 5) <<__func__ << "::stop req after a pause" << dendl;
+ d_ctl.dedup_exec = false;
+ }
+ }
+
+ if (d_ctl.dedup_exec) {
+ dedup_epoch_t epoch;
+ if (setup(&epoch) != 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::failed setup()" << dendl;
+ return;
+ }
+ const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+ int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
+ if (pool_id < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::bad pool_id" << dendl;
+ return;
+ }
+ work_shard_t num_work_shards = epoch.num_work_shards;
+ md5_shard_t num_md5_shards = epoch.num_md5_shards;
+ const uint64_t RAW_MEM_SIZE = PER_SHARD_BUFFER_SIZE * num_md5_shards;
+ ldpp_dout(dpp, 5) <<__func__ << "::RAW_MEM_SIZE=" << RAW_MEM_SIZE
+ << "::num_work_shards=" << num_work_shards
+ << "::num_md5_shards=" << num_md5_shards << dendl;
+ // DEDUP_DYN_ALLOC
+ auto raw_mem = std::make_unique<uint8_t[]>(RAW_MEM_SIZE);
+ if (raw_mem == nullptr) {
+ ldpp_dout(dpp, 1) << "failed slab memory allocation - size=" << RAW_MEM_SIZE << dendl;
+ return;
+ }
+
+ process_all_shards(true, &Background::f_ingress_work_shard, raw_mem.get(),
+ RAW_MEM_SIZE, num_work_shards, num_md5_shards);
+ if (!d_ctl.should_stop()) {
+ // Wait for all other workers to finish ingress step
+ work_shards_barrier(num_work_shards);
+ if (!d_ctl.should_stop()) {
+ process_all_shards(false, &Background::f_dedup_md5_shard, raw_mem.get(),
+ RAW_MEM_SIZE, num_work_shards, num_md5_shards);
+ // Wait for all other md5 shards to finish
+ md5_shards_barrier(num_md5_shards);
+ safe_pool_delete(store, dpp, pool_id);
+ }
+ else {
+ ldpp_dout(dpp, 5) <<__func__ << "::stop req from barrier" << dendl;
+ }
+ }
+ else {
+ ldpp_dout(dpp, 5) <<__func__ << "::stop req from ingress_work_shard" << dendl;
+ }
+ } // dedup_exec
+
+ std::unique_lock cond_lock(d_cond_mutex);
+ d_ctl.dedup_exec = false;
+ if (d_ctl.remote_abort_req) {
+ d_ctl.remote_aborted = true;
+
+ d_ctl.remote_abort_req = false;
+ d_ctl.remote_paused = false;
+ d_cond.notify_all();
+ ldpp_dout(dpp, 5) << __func__ << "::Dedup was aborted on a remote req" << dendl;
+ }
+ d_cond.wait(cond_lock, [this]{return d_ctl.remote_restart_req || d_ctl.should_stop() || d_ctl.should_pause();});
+ if (!d_ctl.should_stop() && !d_ctl.should_pause()) {
+ // TBD: should we release lock here ???
+ if (d_cluster.can_start_new_scan(store)) {
+ d_ctl.dedup_exec = true;
+ d_ctl.remote_aborted = false;
+ d_ctl.remote_paused = false;
+ d_ctl.remote_restart_req = false;
+ d_cond.notify_all();
+ }
+ }else if (d_ctl.should_stop()) {
+ ldpp_dout(dpp, 5) << "main loop::should_stop::" << d_ctl << dendl;
+ }
+ else {
+ ldpp_dout(dpp, 5) << "main loop::should_pause::" << d_ctl << dendl;
+ }
+ }
+ d_ctl.shutdown_done = true;
+ d_cond.notify_all();
+ // shutdown
+ ldpp_dout(dpp, 5) << __func__ << "::Dedup background thread stopped" << dendl;
+ }
+
+}; //namespace rgw::dedup
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+#include "common/dout.h"
+#include "rgw_common.h"
+#include "rgw_dedup_utils.h"
+#include "rgw_dedup_table.h"
+#include "rgw_dedup_cluster.h"
+#include "rgw_realm_reloader.h"
+#include <string>
+#include <unordered_map>
+#include <variant>
+#include <iostream>
+#include <ostream>
+
+namespace rgw::dedup {
+ struct dedup_epoch_t;
+ struct control_t {
+ control_t() {
+ reset();
+ }
+ void reset();
+ inline bool local_urgent_req() const {
+ return (shutdown_req || local_pause_req);
+ }
+ inline bool should_stop() const {
+ return (shutdown_req || remote_abort_req);
+ }
+ inline bool should_pause() const {
+ return (local_pause_req || remote_pause_req);
+ }
+
+ // allow to start/pasue/resume/stop execution
+ dedup_req_type_t dedup_type = dedup_req_type_t::DEDUP_TYPE_NONE;
+ bool started = false;
+ bool dedup_exec = false;
+ bool shutdown_req = false;
+ bool shutdown_done = false;
+ bool local_pause_req = false;
+ bool local_paused = false;
+ bool remote_abort_req = false;
+ bool remote_aborted = false;
+ bool remote_pause_req = false;
+ bool remote_paused = false;
+ bool remote_restart_req = false;
+ };
+ std::ostream& operator<<(std::ostream &out, const control_t &ctl);
+ void encode(const control_t& ctl, ceph::bufferlist& bl);
+ void decode(control_t& ctl, ceph::bufferlist::const_iterator& bl);
+ class remapper_t;
+ class disk_block_seq_t;
+ struct disk_record_t;
+ struct key_t;
+ //Interval between each execution of the script is set to 5 seconds
+ static inline constexpr int INIT_EXECUTE_INTERVAL = 5;
+ class Background : public RGWRealmReloader::Pauser {
+ class DedupWatcher : public librados::WatchCtx2 {
+ Background* const parent;
+ public:
+ DedupWatcher(Background* _parent) : parent(_parent) {}
+ ~DedupWatcher() override = default;
+ void handle_notify(uint64_t notify_id, uint64_t cookie,
+ uint64_t notifier_id, bufferlist& bl) override;
+ void handle_error(uint64_t cookie, int err) override;
+ };
+
+ public:
+ Background(rgw::sal::Driver* _driver, CephContext* _cct);
+ int watch_reload(const DoutPrefixProvider* dpp);
+ int unwatch_reload(const DoutPrefixProvider* dpp);
+ void handle_notify(uint64_t notify_id, uint64_t cookie, bufferlist &bl);
+ void start();
+ void shutdown();
+ void pause() override;
+ void resume(rgw::sal::Driver* _driver) override;
+
+ private:
+ enum dedup_step_t {
+ STEP_NONE,
+ STEP_BUCKET_INDEX_INGRESS,
+ STEP_BUILD_TABLE,
+ STEP_READ_ATTRIBUTES,
+ STEP_REMOVE_DUPLICATES
+ };
+
+ void run();
+ int setup(struct dedup_epoch_t*);
+ void work_shards_barrier(work_shard_t num_work_shards);
+ void md5_shards_barrier(md5_shard_t num_md5_shards);
+ void handle_pause_req(const char* caller);
+ const char* dedup_step_name(dedup_step_t step);
+ int read_buckets();
+ void check_and_update_heartbeat(unsigned shard_id, uint64_t count_a, uint64_t count_b,
+ const char *prefix);
+
+ inline void check_and_update_worker_heartbeat(work_shard_t worker_id, int64_t obj_count);
+ inline void check_and_update_md5_heartbeat(md5_shard_t md5_id,
+ uint64_t load_count,
+ uint64_t dedup_count);
+ int ingress_bucket_idx_single_object(disk_block_array_t &disk_arr,
+ const rgw::sal::Bucket *bucket,
+ const rgw_bucket_dir_entry &entry,
+ worker_stats_t *p_worker_stats /*IN-OUT*/);
+ int process_bucket_shards(disk_block_array_t &disk_arr,
+ const rgw::sal::Bucket *bucket,
+ std::map<int,std::string> &oids,
+ librados::IoCtx &ioctx,
+ work_shard_t shard_id,
+ work_shard_t num_work_shards,
+ worker_stats_t *p_worker_stats /*IN-OUT*/);
+ int ingress_bucket_objects_single_shard(disk_block_array_t &disk_arr,
+ const rgw_bucket &bucket_rec,
+ work_shard_t worker_id,
+ work_shard_t num_work_shards,
+ worker_stats_t *p_worker_stats /*IN-OUT*/);
+ int objects_ingress_single_work_shard(work_shard_t worker_id,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards,
+ worker_stats_t *p_worker_stats,
+ uint8_t *raw_mem,
+ uint64_t raw_mem_size);
+ int f_ingress_work_shard(unsigned shard_id,
+ uint8_t *raw_mem,
+ uint64_t raw_mem_size,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards);
+ int f_dedup_md5_shard(unsigned shard_id,
+ uint8_t *raw_mem,
+ uint64_t raw_mem_size,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards);
+ int process_all_shards(bool ingress_work_shards,
+ int (Background::* func)(unsigned, uint8_t*, uint64_t, work_shard_t, md5_shard_t),
+ uint8_t *raw_mem,
+ uint64_t raw_mem_size,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards);
+ int read_bucket_stats(const rgw_bucket &bucket_rec,
+ uint64_t *p_num_obj,
+ uint64_t *p_size);
+ int collect_all_buckets_stats();
+ int objects_dedup_single_md5_shard(dedup_table_t *p_table,
+ md5_shard_t md5_shard,
+ md5_stats_t *p_stats,
+ work_shard_t num_work_shards);
+ int add_disk_rec_from_bucket_idx(disk_block_array_t &disk_arr,
+ const rgw::sal::Bucket *p_bucket,
+ const parsed_etag_t *p_parsed_etag,
+ const std::string &obj_name,
+ uint64_t obj_size,
+ const std::string &storage_class);
+
+ int add_record_to_dedup_table(dedup_table_t *p_table,
+ const struct disk_record_t *p_rec,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ md5_stats_t *p_stats,
+ remapper_t *remapper);
+
+ int process_all_slabs(dedup_table_t *p_table,
+ dedup_step_t step,
+ md5_shard_t md5_shard,
+ work_shard_t work_shard,
+ uint32_t *p_seq_count,
+ md5_stats_t *p_stats /* IN-OUT */,
+ disk_block_seq_t *p_disk_block_arr,
+ remapper_t *remapper);
+
+#ifdef FULL_DEDUP_SUPPORT
+ int calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash);
+ int add_obj_attrs_to_record(rgw_bucket *p_rb,
+ disk_record_t *p_rec,
+ const rgw::sal::Attrs &attrs,
+ dedup_table_t *p_table,
+ md5_stats_t *p_stats); /* IN-OUT */
+
+ int read_object_attribute(dedup_table_t *p_table,
+ disk_record_t *p_rec,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ md5_shard_t md5_shard,
+ md5_stats_t *p_stats /* IN-OUT */,
+ disk_block_seq_t *p_disk,
+ remapper_t *remapper);
+ int try_deduping_record(dedup_table_t *p_table,
+ const disk_record_t *p_rec,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ md5_shard_t md5_shard,
+ md5_stats_t *p_stats, /* IN-OUT */
+ remapper_t *remapper);
+ int inc_ref_count_by_manifest(const std::string &ref_tag,
+ const std::string &oid,
+ RGWObjManifest &manifest);
+ int rollback_ref_by_manifest(const std::string &ref_tag,
+ const std::string &oid,
+ RGWObjManifest &tgt_manifest);
+ int free_tail_objs_by_manifest(const std::string &ref_tag,
+ const std::string &oid,
+ RGWObjManifest &tgt_manifest);
+ int dedup_object(const disk_record_t *p_src_rec,
+ const disk_record_t *p_tgt_rec,
+ md5_stats_t *p_stats,
+ bool is_shared_manifest_src);
+#endif
+ int remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count);
+ int init_rados_access_handles(bool init_pool);
+
+ // private data members
+ rgw::sal::Driver* driver = nullptr;
+ rgw::sal::RadosStore* store = nullptr;
+ RGWRados* rados = nullptr;
+ librados::Rados* rados_handle = nullptr;
+ const DoutPrefix dp;
+ const DoutPrefixProvider* const dpp;
+ CephContext* const cct;
+ cluster d_cluster;
+ librados::IoCtx d_dedup_cluster_ioctx;
+ utime_t d_heart_beat_last_update;
+ unsigned d_heart_beat_max_elapsed_sec;
+
+ // A pool with 6 billion objects has a 1/(2^64) chance for collison with a 128bit MD5
+ uint64_t d_max_protected_objects = (6ULL * 1024 * 1024 * 1024);
+ uint64_t d_all_buckets_obj_count = 0;
+ uint64_t d_all_buckets_obj_size = 0;
+ // we don't benefit from deduping RGW objects smaller than head-object size
+ uint32_t d_min_obj_size_for_dedup = (4ULL * 1024 * 1024);
+ uint32_t d_head_object_size = (4ULL * 1024 * 1024);
+ control_t d_ctl;
+ uint64_t d_watch_handle = 0;
+ DedupWatcher d_watcher_ctx;
+
+ std::thread d_runner;
+ std::mutex d_cond_mutex;
+ std::condition_variable d_cond;
+ };
+
+} //namespace rgw::dedup
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_dedup_cluster.h"
+#include "rgw_dedup.h"
+#include "rgw_dedup_epoch.h"
+#include "rgw_common.h"
+#include "rgw_dedup_store.h"
+#include "include/rados/rados_types.hpp"
+#include "include/rados/buffer.h"
+#include "include/rados/librados.hpp"
+#include "svc_zone.h"
+#include "common/Clock.h" // for ceph_clock_now()
+#include "common/config.h"
+#include "common/Cond.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "rgw_common.h"
+#include "include/denc.h"
+#include "rgw_sal.h"
+#include "driver/rados/rgw_sal_rados.h"
+#include <cstdlib>
+#include <ctime>
+#include <string>
+
+namespace rgw::dedup {
+ const char* DEDUP_EPOCH_TOKEN = "EPOCH_TOKEN";
+ const char* DEDUP_WATCH_OBJ = "DEDUP_WATCH_OBJ";
+
+ static constexpr unsigned EPOCH_MAX_LOCK_DURATION_SEC = 30;
+ struct shard_progress_t;
+ static int collect_shard_stats(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ utime_t epoch_time,
+ unsigned shards_count,
+ const char *prefix,
+ bufferlist bl_arr[],
+ struct shard_progress_t *sp_arr);
+
+ const uint64_t SP_ALL_OBJECTS = ULLONG_MAX;
+ const uint64_t SP_NO_OBJECTS = 0ULL;
+ const char* SHARD_PROGRESS_ATTR = "shard_progress";
+
+ //---------------------------------------------------------------------------
+ static int get_control_ioctx(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ librados::IoCtx &ctl_ioctx /* OUT-PARAM */)
+ {
+ const auto& control_pool = store->svc()->zone->get_zone_params().control_pool;
+ auto rados_handle = store->getRados()->get_rados_handle();
+ int ret = rgw_init_ioctx(dpp, rados_handle, control_pool, ctl_ioctx);
+ if (unlikely(ret < 0)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() for control_pool ret="
+ << ret << "::" << cpp_strerror(-ret) << dendl;
+ }
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ static int get_epoch(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ dedup_epoch_t *p_epoch, /* OUT */
+ const char *caller)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ std::string oid(DEDUP_EPOCH_TOKEN);
+ bufferlist bl;
+ ret = ctl_ioctx.getxattr(oid, RGW_DEDUP_ATTR_EPOCH, bl);
+ if (ret > 0) {
+ try {
+ auto p = bl.cbegin();
+ decode(*p_epoch, p);
+ }catch (const buffer::error&) {
+ ldpp_dout(dpp, 0) << __func__ << "::failed epoch decode!" << dendl;
+ return -EINVAL;
+ }
+ if (caller) {
+ ldpp_dout(dpp, 10) << __func__ << "::"<< caller<< "::" << *p_epoch << dendl;
+ }
+ return 0;
+ }
+ else {
+ // zero length read means no data
+ if (ret == 0) {
+ ret = -ENODATA;
+ }
+ ldpp_dout(dpp, 10) << __func__ << "::" << (caller ? caller : "")
+ << "::failed ctl_ioctx.getxattr() with: "
+ << cpp_strerror(-ret) << ", ret=" << ret << dendl;
+ return ret;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ static int set_epoch(rgw::sal::RadosStore *store,
+ const std::string &cluster_id,
+ const DoutPrefixProvider *dpp,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ std::string oid(DEDUP_EPOCH_TOKEN);
+ ldpp_dout(dpp, 10) << __func__ << "::oid=" << oid << dendl;
+ bool exclusive = true; // block overwrite of old objects
+ ret = ctl_ioctx.create(oid, exclusive);
+ if (ret >= 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::successfully created Epoch object!" << dendl;
+ // now try and take ownership
+ }
+ else if (ret == -EEXIST) {
+ ldpp_dout(dpp, 10) << __func__ << "::Epoch object exists -> trying to take over" << dendl;
+ // try and take ownership
+ }
+ else{
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to create " << oid
+ <<" with: "<< cpp_strerror(-ret) << ", ret=" << ret <<dendl;
+ return ret;
+ }
+
+ uint32_t serial = 0;
+ dedup_req_type_t dedup_type = dedup_req_type_t::DEDUP_TYPE_ESTIMATE;
+ dedup_epoch_t new_epoch = { serial, dedup_type, ceph_clock_now(),
+ num_work_shards, num_md5_shards };
+ bufferlist new_epoch_bl, empty_bl;
+ encode(new_epoch, new_epoch_bl);
+ librados::ObjectWriteOperation op;
+ op.cmpxattr(RGW_DEDUP_ATTR_EPOCH, CEPH_OSD_CMPXATTR_OP_EQ, empty_bl);
+ op.setxattr(RGW_DEDUP_ATTR_EPOCH, new_epoch_bl);
+
+ ldpp_dout(dpp, 10) << __func__ << "::send EPOCH CLS" << dendl;
+ ret = ctl_ioctx.operate(oid, &op);
+ if (ret == 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::Epoch object was written" << dendl;
+ }
+ // TBD: must check for failure caused by an existing EPOCH xattr!
+ // probably best to read attribute from epoch!
+ else if (ret == -ECANCELED) {
+ dedup_epoch_t epoch;
+ ret = get_epoch(store, dpp, &epoch, __func__);
+ if (ret == 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::Accept existing Epoch object" << dendl;
+ }
+ return ret;
+ }
+ else {
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ctl_ioctx.operate("
+ << oid << "), err is " << cpp_strerror(-ret) << dendl;
+ }
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ static int swap_epoch(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ const dedup_epoch_t *p_old_epoch,
+ dedup_req_type_t dedup_type,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ dedup_epoch_t new_epoch = { p_old_epoch->serial + 1, dedup_type,
+ ceph_clock_now(), num_work_shards, num_md5_shards};
+ bufferlist old_epoch_bl, new_epoch_bl, err_bl;
+ encode(*p_old_epoch, old_epoch_bl);
+ encode(new_epoch, new_epoch_bl);
+ librados::ObjectWriteOperation op;
+ op.cmpxattr(RGW_DEDUP_ATTR_EPOCH, CEPH_OSD_CMPXATTR_OP_EQ, old_epoch_bl);
+ op.setxattr(RGW_DEDUP_ATTR_EPOCH, new_epoch_bl);
+
+ ldpp_dout(dpp, 10) << __func__ << "::send EPOCH CLS" << dendl;
+ std::string oid(DEDUP_EPOCH_TOKEN);
+ ret = ctl_ioctx.operate(oid, &op);
+ if (ret != 0) {
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ctl_ioctx.operate("
+ << oid << "), err is " << cpp_strerror(-ret) << dendl;
+ }
+
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ struct shard_progress_t {
+ shard_progress_t() {
+ // init an empty object
+ this->progress_a = SP_NO_OBJECTS;
+ this->progress_b = SP_NO_OBJECTS;
+ this->completed = false;
+
+ // set all timers to now
+ this->creation_time = utime_t();
+ this->completion_time = utime_t();
+ this->update_time = utime_t();
+
+ // owner and stats_bl are empty until set
+ }
+
+ shard_progress_t(uint64_t _progress_a,
+ uint64_t _progress_b,
+ bool _completed,
+ const std::string &_owner,
+ const bufferlist &_stats_bl) : owner(_owner), stats_bl(_stats_bl) {
+ this->progress_a = _progress_a;
+ this->progress_b = _progress_b;
+ this->completed = _completed;
+
+ utime_t now = ceph_clock_now();
+ this->update_time = now;
+
+ if (_progress_a == SP_NO_OBJECTS && _progress_b == SP_NO_OBJECTS) {
+ this->creation_time = now;
+ }
+ if (_completed) {
+ this->completion_time = now;
+ }
+ }
+
+ bool is_completed() const {
+ if (this->progress_b == SP_ALL_OBJECTS) {
+ ceph_assert(this->completed);
+ return true;
+ }
+ else {
+ ceph_assert(!this->completed);
+ return false;
+ }
+ }
+
+ bool was_not_started() const {
+ return (this->creation_time == this->update_time);
+ }
+
+ uint64_t progress_a;
+ uint64_t progress_b;
+ bool completed;
+ utime_t update_time;
+ utime_t creation_time;
+ utime_t completion_time;
+ std::string owner;
+ bufferlist stats_bl;
+ };
+
+ //---------------------------------------------------------------------------
+ std::ostream& operator<<(std::ostream &out, shard_progress_t& sp)
+ {
+ out << (sp.completed ? " + ::" : " - ::");
+ out << sp.owner << "::[" << sp.progress_a << ", " << sp.progress_b << "]";
+ out << "::creation: " << sp.creation_time;
+ out << "::update: " << sp.update_time;
+ out << "::completion: " << sp.completion_time;
+ return out;
+ }
+
+ //---------------------------------------------------------------------------
+ void encode(const shard_progress_t& sp, ceph::bufferlist& bl)
+ {
+ ENCODE_START(1, 1, bl);
+ encode(sp.progress_a, bl);
+ encode(sp.progress_b, bl);
+ encode(sp.completed, bl);
+ encode(sp.creation_time, bl);
+ encode(sp.completion_time, bl);
+ encode(sp.update_time, bl);
+ encode(sp.owner, bl);
+ encode(sp.stats_bl, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ //---------------------------------------------------------------------------
+ void decode(shard_progress_t & sp, ceph::bufferlist::const_iterator& bl)
+ {
+ DECODE_START(1, bl);
+ decode(sp.progress_a, bl);
+ decode(sp.progress_b, bl);
+ decode(sp.completed, bl);
+ decode(sp.creation_time, bl);
+ decode(sp.completion_time, bl);
+ decode(sp.update_time, bl);
+ decode(sp.owner, bl);
+ decode(sp.stats_bl, bl);
+ DECODE_FINISH(bl);
+ }
+
+ //==========================================================================
+
+ //---------------------------------------------------------------------------
+ void cluster::clear()
+ {
+ d_curr_md5_shard = 0;
+ d_curr_worker_shard = 0;
+
+ d_num_completed_workers = 0;
+ d_num_completed_md5 = 0;
+
+ memset(d_completed_workers, TOKEN_STATE_PENDING, sizeof(d_completed_workers));
+ memset(d_completed_md5, TOKEN_STATE_PENDING, sizeof(d_completed_md5));
+ }
+
+
+ static constexpr auto COOKIE_LEN = 15;
+ static constexpr auto CLUSTER_ID_LEN = 15;
+ //---------------------------------------------------------------------------
+ cluster::cluster(const DoutPrefixProvider *_dpp,
+ CephContext *cct,
+ rgw::sal::Driver* driver):
+ dpp(_dpp),
+ d_lock_cookie(gen_rand_alphanumeric(cct, COOKIE_LEN)),
+ d_cluster_id (gen_rand_alphanumeric(cct, CLUSTER_ID_LEN))
+ {
+ clear();
+ }
+
+ //---------------------------------------------------------------------------
+ int cluster::reset(rgw::sal::RadosStore *store,
+ dedup_epoch_t *p_epoch,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards)
+ {
+ ldpp_dout(dpp, 10) << __func__ << "::REQ num_work_shards=" << num_work_shards
+ << "::num_md5_shards=" << num_md5_shards << dendl;
+ clear();
+
+ while (true) {
+ int ret = get_epoch(store, dpp, p_epoch, __func__);
+ if (ret != 0) {
+ return ret;
+ }
+ if (p_epoch->num_work_shards && p_epoch->num_md5_shards) {
+ ldpp_dout(dpp, 10) << __func__ << "::ACC num_work_shards=" << p_epoch->num_work_shards
+ << "::num_md5_shards=" << p_epoch->num_md5_shards << dendl;
+ break;
+ }
+ else if (!num_work_shards && !num_md5_shards) {
+ ldpp_dout(dpp, 10) << __func__ << "::Init flow, no need to wait" << dendl;
+ break;
+ }
+ else {
+ ret = swap_epoch(store, dpp, p_epoch,
+ static_cast<dedup_req_type_t> (p_epoch->dedup_type),
+ num_work_shards, num_md5_shards);
+ }
+ }
+
+ d_epoch_time = p_epoch->time;
+ // retry cleanup 3 times before declaring failure
+ const unsigned RETRY_LIMIT = 3;
+ int ret = 1;
+ for (unsigned i = 0; i < RETRY_LIMIT && ret != 0; i++) {
+ ret = cleanup_prev_run(store);
+ }
+ if (ret != 0) {
+ return ret;
+ }
+
+ create_shard_tokens(store, p_epoch->num_work_shards, WORKER_SHARD_PREFIX);
+ create_shard_tokens(store, p_epoch->num_md5_shards, MD5_SHARD_PREFIX);
+
+ ret = verify_all_shard_tokens(store, p_epoch->num_work_shards,
+ WORKER_SHARD_PREFIX);
+ if (ret != 0) {
+ return ret;
+ }
+ return verify_all_shard_tokens(store, p_epoch->num_md5_shards,
+ MD5_SHARD_PREFIX);
+ }
+
+ //---------------------------------------------------------------------------
+ int cluster::cleanup_prev_run(rgw::sal::RadosStore *store)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ int error_code = 0;
+ constexpr uint32_t max = 100;
+ std::string marker;
+ bool truncated = false;
+ rgw::AccessListFilter filter{};
+ unsigned deleted_count = 0, skipped_count = 0;
+ unsigned failed_count = 0, no_entry_count = 0;
+ do {
+ std::vector<std::string> oids;
+ int ret = rgw_list_pool(dpp, ctl_ioctx, max, filter, marker, &oids, &truncated);
+ if (ret == -ENOENT) {
+ ldpp_dout(dpp, 10) << __func__ << "::rgw_list_pool() ret == -ENOENT"<< dendl;
+ break;
+ }
+ else if (ret < 0) {
+ ldpp_dout(dpp, 1) << "failed rgw_list_pool()! ret=" << ret
+ << "::" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ for (const std::string& oid : oids) {
+ if (shard_token_oid::legal_oid_name(oid) == false) {
+ ldpp_dout(dpp, 10) << __func__ << "::skipping " << oid << dendl;
+ skipped_count++;
+ continue;
+ }
+
+ uint64_t size;
+ struct timespec tspec;
+ ret = ctl_ioctx.stat2(oid, &size, &tspec);
+ if (ret == -ENOENT) {
+ ldpp_dout(dpp, 20) << __func__ << "::" << oid
+ << " was removed by others" << dendl;
+ no_entry_count++;
+ continue;
+ }
+ else if (ret != 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( "
+ << oid << " )" << dendl;
+ error_code = ret;
+ failed_count++;
+ continue;
+ }
+ utime_t mtime(tspec);
+ if (d_epoch_time < mtime) {
+ ldpp_dout(dpp, 10) << __func__ << "::skipping new obj! "
+ << "::EPOCH={" << d_epoch_time.tv.tv_sec << ":" << d_epoch_time.tv.tv_nsec << "} "
+ << "::mtime={" << mtime.tv.tv_sec << ":" << mtime.tv.tv_nsec << "}" << dendl;
+ skipped_count++;
+ continue;
+ }
+ ldpp_dout(dpp, 10) << __func__ << "::removing object: " << oid << dendl;
+ ret = ctl_ioctx.remove(oid);
+ if (ret == 0) {
+ deleted_count++;
+ }
+ else if (ret == -ENOENT) {
+ ldpp_dout(dpp, 20) << __func__ << "::" << oid
+ << " was removed by others" << dendl;
+ no_entry_count++;
+ continue;
+ }
+ else {
+ error_code = ret;
+ failed_count++;
+ ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.remove( " << oid
+ << " ), ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
+ }
+ }
+ ldpp_dout(dpp, 10) << __func__ << "::oids.size()=" << oids.size()
+ << "::deleted=" << deleted_count
+ << "::failed=" << failed_count
+ << "::no entry=" << no_entry_count
+ << "::skipped=" << skipped_count << dendl;
+ } while (truncated);
+
+ return error_code;
+ }
+
+ //---------------------------------------------------------------------------
+ int cluster::create_shard_tokens(rgw::sal::RadosStore *store,
+ unsigned shards_count,
+ const char *prefix)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ shard_token_oid sto(prefix);
+ for (unsigned shard = 0; shard < shards_count; shard++) {
+ sto.set_shard(shard);
+ std::string oid(sto.get_buff(), sto.get_buff_size());
+ ldpp_dout(dpp, 15) << __func__ << "::creating object: " << oid << dendl;
+ bool exclusive = true;
+ ret = ctl_ioctx.create(oid, exclusive);
+ if (ret >= 0) {
+ ldpp_dout(dpp, 15) << __func__ << "::oid=" << oid << " was created!" << dendl;
+ }
+ else if (ret == -EEXIST) {
+ ldpp_dout(dpp, 15) << __func__ << "::failed ctl_ioctx.create("
+ << oid << ") -EEXIST!" << dendl;
+ }
+ else {
+ // TBD: can it happen legally ?
+ ldpp_dout(dpp, 1) << __func__ << "::failed ctl_ioctx.create(" << oid
+ << ") with: " << ret << "::" << cpp_strerror(-ret) << dendl;
+ }
+ }
+
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int cluster::verify_all_shard_tokens(rgw::sal::RadosStore *store,
+ unsigned shards_count,
+ const char *prefix)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ shard_token_oid sto(prefix);
+ for (unsigned shard = 0; shard < shards_count; shard++) {
+ sto.set_shard(shard);
+ std::string oid(sto.get_buff(), sto.get_buff_size());
+ ldpp_dout(dpp, 10) << __func__ << "::checking object: " << oid << dendl;
+
+ uint64_t size;
+ struct timespec tspec;
+ ret = ctl_ioctx.stat2(oid, &size, &tspec);
+ if (ret != 0) {
+ ldpp_dout(dpp, 5) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )"
+ << "::shards_count=" << shards_count << dendl;
+ return ret;
+ }
+ }
+
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int cluster::update_shard_token_heartbeat(rgw::sal::RadosStore *store,
+ unsigned shard,
+ uint64_t count_a,
+ uint64_t count_b,
+ const char *prefix)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ shard_token_oid sto(prefix, shard);
+ std::string oid(sto.get_buff(), sto.get_buff_size());
+ bufferlist empty_bl;
+ shard_progress_t sp(count_a, count_b, false, d_cluster_id, empty_bl);
+ sp.creation_time = d_token_creation_time;
+ bufferlist sp_bl;
+ encode(sp, sp_bl);
+ return ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
+ }
+
+ //---------------------------------------------------------------------------
+ int cluster::mark_shard_token_completed(rgw::sal::RadosStore *store,
+ unsigned shard,
+ uint64_t obj_count,
+ const char *prefix,
+ const bufferlist &bl)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ shard_token_oid sto(prefix, shard);
+ std::string oid(sto.get_buff(), sto.get_buff_size());
+ ldpp_dout(dpp, 10) << __func__ << "::" << prefix << "::" << oid << dendl;
+
+ shard_progress_t sp(obj_count, SP_ALL_OBJECTS, true, d_cluster_id, bl);
+ sp.creation_time = d_token_creation_time;
+ bufferlist sp_bl;
+ encode(sp, sp_bl);
+ ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
+ if (ret == 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::Done ctl_ioctx.setxattr(" << oid << ")"
+ << dendl;
+ }
+ else {
+ ldpp_dout(dpp, 0) << __func__ << "::Failed ctl_ioctx.setxattr(" << oid
+ << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
+ }
+
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int32_t cluster::get_next_shard_token(rgw::sal::RadosStore *store,
+ uint16_t start_shard,
+ uint16_t max_shard,
+ const char *prefix)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ // lock paramters:
+ const utime_t lock_duration; // zero duration means lock doesn't expire
+ const uint8_t lock_flags = 0; // no flags
+ const std::string lock_tag; // no tag
+
+ shard_token_oid sto(prefix);
+ for (auto shard = start_shard; shard < max_shard; shard++) {
+ sto.set_shard(shard);
+ std::string oid(sto.get_buff(), sto.get_buff_size());
+ ldpp_dout(dpp, 10) << __func__ << "::try garbbing " << oid << dendl;
+ librados::ObjectWriteOperation op;
+ op.assert_exists();
+ rados::cls::lock::lock(&op, oid, ClsLockType::EXCLUSIVE, d_lock_cookie,
+ lock_tag, "dedup_shard_token", lock_duration, lock_flags);
+ ret = rgw_rados_operate(dpp, ctl_ioctx, oid, std::move(op), null_yield);
+ if (ret == -EBUSY) {
+ // someone else took this token -> move to the next one
+ ldpp_dout(dpp, 10) << __func__ << "::Failed lock. " << oid <<
+ " is owned by other rgw" << dendl;
+ continue;
+ }
+ else if (ret == -ENOENT) {
+ // token is deleted - processing will stop the next time we try to read from the queue
+ ldpp_dout(dpp, 5) << __func__ << "::" << oid
+ << " token doesn't exist, fail lock!" << dendl;
+ continue;
+ }
+ else if (ret < 0) {
+ // failed to lock for another reason, continue to process other queues
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to lock token: " << oid
+ << ":: ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
+ //has_error = true;
+ continue;
+ }
+ ldpp_dout(dpp, 10) << __func__ << "::successfully locked " << oid << dendl;
+ bufferlist empty_bl;
+ shard_progress_t sp(SP_NO_OBJECTS, SP_NO_OBJECTS, false, d_cluster_id, empty_bl);
+ d_token_creation_time = sp.creation_time;
+ bufferlist sp_bl;
+ encode(sp, sp_bl);
+ ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
+ if (ret == 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::SUCCESS!::" << oid << dendl;
+ return shard;
+ }
+ }
+
+ return NULL_SHARD;
+ }
+
+ //---------------------------------------------------------------------------
+ work_shard_t cluster::get_next_work_shard_token(rgw::sal::RadosStore *store,
+ work_shard_t num_work_shards)
+ {
+ int32_t shard = get_next_shard_token(store, d_curr_worker_shard,
+ num_work_shards, WORKER_SHARD_PREFIX);
+ if (shard >= 0 && shard < num_work_shards) {
+ d_curr_worker_shard = shard + 1;
+ return shard;
+ }
+ else {
+ return NULL_WORK_SHARD;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ md5_shard_t cluster::get_next_md5_shard_token(rgw::sal::RadosStore *store,
+ md5_shard_t num_md5_shards)
+ {
+ int32_t shard = get_next_shard_token(store, d_curr_md5_shard, num_md5_shards,
+ MD5_SHARD_PREFIX);
+ if (shard >= 0 && shard < num_md5_shards) {
+ d_curr_md5_shard = shard + 1;
+ return shard;
+ }
+ else {
+ return NULL_MD5_SHARD;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ int cluster::all_shard_tokens_completed(rgw::sal::RadosStore *store,
+ unsigned shards_count,
+ const char *prefix,
+ uint16_t *p_num_completed,
+ uint8_t completed_arr[])
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ int err_code = 0;
+ unsigned count = 0;
+ shard_token_oid sto(prefix);
+ for (unsigned shard = 0; shard < shards_count; shard++) {
+ if (completed_arr[shard] == TOKEN_STATE_COMPLETED) {
+ count++;
+ continue;
+ }
+
+ sto.set_shard(shard);
+ std::string oid(sto.get_buff(), sto.get_buff_size());
+ ldpp_dout(dpp, 10) << __func__ << "::checking object: " << oid << dendl;
+ bufferlist bl;
+ ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
+ if (unlikely(ret <= 0)) {
+ if (ret != -ENODATA) {
+ ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.getxattr() ret="
+ << ret << "::" << cpp_strerror(-ret) << dendl;
+ }
+ completed_arr[shard] = TOKEN_STATE_CORRUPTED;
+ // all failures to get valid token state return ENODATA
+ err_code = -ENODATA;
+ continue;
+ }
+
+ shard_progress_t sp;
+ try {
+ auto p = bl.cbegin();
+ decode(sp, p);
+ }
+ catch (const buffer::error&) {
+ ldpp_dout(dpp, 1) << __func__ << "::failed shard_progress_t decode!" << dendl;
+ completed_arr[shard] = TOKEN_STATE_CORRUPTED;
+ // all failures to get valid token state return ENODATA
+ err_code = -ENODATA;
+ continue;
+ }
+
+ if (sp.is_completed()) {
+ utime_t duration = sp.completion_time - sp.creation_time;
+ // mark token completed;
+ (*p_num_completed)++;
+ completed_arr[shard] = TOKEN_STATE_COMPLETED;
+ ldpp_dout(dpp, 20) << __func__ << "::" << oid
+ << "::completed! duration=" << duration << dendl;
+ count++;
+ }
+ else if (sp.was_not_started()) {
+ // token was not started yet
+ // TBD:
+ // If it is not locked we can process it (by why we skipped it)??
+ // If locked, check when it was done and if timed-out
+ ldpp_dout(dpp, 10) << __func__ << "::" << oid
+ << "::was not started, skipping" << dendl;
+ return -EAGAIN;
+ }
+ else {
+ static const utime_t heartbeat_timeout(EPOCH_MAX_LOCK_DURATION_SEC, 0);
+ utime_t time_elapsed = ceph_clock_now() - sp.update_time;
+ if (time_elapsed > heartbeat_timeout) {
+ // lock expired -> try and break lock
+ ldpp_dout(dpp, 5) << __func__ << "::" << oid
+ << "::expired lock, skipping:" << time_elapsed
+ << "::" << sp << dendl;
+ completed_arr[shard] = TOKEN_STATE_TIMED_OUT;
+ err_code = -ETIME;
+ continue;
+ }
+ else {
+ return -EAGAIN;
+ }
+ }
+ } // loop
+
+ if (count < shards_count) {
+ unsigned n = shards_count - count;
+ ldpp_dout(dpp, 10) << __func__ << "::waiting for " << n << " tokens" << dendl;
+ }
+ return err_code;
+ }
+
+ //---------------------------------------------------------------------------
+ static int collect_shard_stats(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ utime_t epoch_time,
+ unsigned shards_count,
+ const char *prefix,
+ bufferlist bl_arr[],
+ shard_progress_t *sp_arr)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ unsigned count = 0;
+ cluster::shard_token_oid sto(prefix);
+ for (unsigned shard = 0; shard < shards_count; shard++) {
+ sto.set_shard(shard);
+ std::string oid(sto.get_buff(), sto.get_buff_size());
+ ldpp_dout(dpp, 20) << __func__ << "::checking object: " << oid << dendl;
+
+ uint64_t size;
+ struct timespec tspec;
+ if (ctl_ioctx.stat2(oid, &size, &tspec) != 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )"
+ << "::shards_count=" << shards_count << dendl;
+ continue;
+ }
+ utime_t mtime(tspec);
+ if (epoch_time > mtime) {
+ ldpp_dout(dpp, 10) << __func__ << "::skipping old obj! "
+ << "::EPOCH={" << epoch_time.tv.tv_sec << ":" << epoch_time.tv.tv_nsec << "} "
+ << "::mtime={" << mtime.tv.tv_sec << ":" << mtime.tv.tv_nsec << "}" << dendl;
+ continue;
+ }
+
+ shard_progress_t sp;
+ bufferlist bl;
+ ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
+ if (ret > 0) {
+ try {
+ auto p = bl.cbegin();
+ decode(sp, p);
+ sp_arr[shard] = sp;
+ count++;
+ }
+ catch (const buffer::error&) {
+ ldpp_dout(dpp, 10) << __func__ << "::(1)failed shard_progress_t decode!" << dendl;
+ return -EINVAL;
+ }
+ }
+ else if (ret != -ENODATA) {
+ ldpp_dout(dpp, 10) << __func__ << "::" << oid << "::failed getxattr() ret="
+ << ret << "::" << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+ bl_arr[shard] = sp.stats_bl;
+ }
+
+ if (count != shards_count) {
+ ldpp_dout(dpp, 10) << __func__ << "::missing shards stats! we got "
+ << count << " / " << shards_count << dendl;
+ }
+
+ return count;
+ }
+
+ struct member_time_t {
+ utime_t start_time;
+ utime_t end_time;
+ utime_t aggregated_time;
+ };
+
+ //---------------------------------------------------------------------------
+ static void collect_single_shard_stats(const DoutPrefixProvider *dpp,
+ std::map<std::string, member_time_t> &owner_map,
+ const shard_progress_t sp_arr[],
+ unsigned shard,
+ bool *p_show_time,
+ const char *name)
+ {
+ const utime_t null_time;
+ const shard_progress_t &sp = sp_arr[shard];
+ if (sp.creation_time == null_time || sp.completion_time == null_time) {
+ *p_show_time = false;
+ return;
+ }
+
+ const std::string &owner = sp.owner;
+ utime_t duration = sp.completion_time - sp.creation_time;
+ if (owner_map.find(owner) != owner_map.end()) {
+ owner_map[owner].aggregated_time += duration;
+ owner_map[owner].end_time = sp.completion_time;
+ }
+ else {
+ owner_map[owner].start_time = sp.creation_time;
+ owner_map[owner].aggregated_time = duration;
+ owner_map[owner].end_time = sp.completion_time;
+ }
+ ldpp_dout(dpp, 10) << __func__ << "::Got " << name
+ << " stats for shard #" << shard << dendl;
+ }
+
+ //---------------------------------------------------------------------------
+ static void show_incomplete_shards_fmt(bool has_incomplete_shards,
+ unsigned num_shards,
+ const shard_progress_t sp_arr[],
+ Formatter *fmt)
+
+ {
+ if (!has_incomplete_shards) {
+ return;
+ }
+ Formatter::ArraySection array_section{*fmt, "incomplete_shards"};
+ for (unsigned shard = 0; shard < num_shards; shard++) {
+ if (sp_arr[shard].is_completed() ) {
+ continue;
+ }
+ Formatter::ObjectSection object_section{*fmt, "shard_progress"};
+ fmt->dump_unsigned("shard_id", shard);
+ fmt->dump_string("owner", sp_arr[shard].owner);
+ fmt->dump_unsigned("progress_a", sp_arr[shard].progress_a);
+ fmt->dump_unsigned("progress_b", sp_arr[shard].progress_b);
+ fmt->dump_stream("last updated") << sp_arr[shard].update_time;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ static utime_t show_time_func_fmt(const utime_t &start_time,
+ bool show_time,
+ const std::map<std::string, member_time_t> &owner_map,
+ Formatter *fmt)
+ {
+ member_time_t all_members_time;
+ all_members_time.start_time = start_time;
+ all_members_time.end_time = start_time;
+ all_members_time.aggregated_time = utime_t();
+
+ Formatter::ObjectSection section{*fmt, "time"};
+ {
+ Formatter::ArraySection array_section{*fmt, "per-shard time"};
+ for (const auto& [owner, value] : owner_map) {
+ uint32_t sec = value.end_time.tv.tv_sec - value.start_time.tv.tv_sec;
+ fmt->dump_stream("member time")
+ << owner << "::start time = [" << value.start_time.tv.tv_sec % 1000
+ << ":" << value.start_time.tv.tv_nsec / (1000*1000) << "] "
+ << "::aggregated time = " << value.aggregated_time.tv.tv_sec
+ << "(" << sec << ") seconds";
+ all_members_time.aggregated_time += value.aggregated_time;
+ if (all_members_time.end_time < value.end_time) {
+ all_members_time.end_time = value.end_time;
+ }
+ }
+ }
+
+ if (show_time) {
+ uint32_t sec = all_members_time.end_time.tv.tv_sec - all_members_time.start_time.tv.tv_sec;
+
+ Formatter::ObjectSection section{*fmt, "All shards time"};
+ fmt->dump_stream("start time") << all_members_time.start_time;
+ fmt->dump_stream("end time")
+ << all_members_time.end_time << " (" << sec << " seconds total)";
+ fmt->dump_unsigned("aggregated time (sec)", all_members_time.aggregated_time.tv.tv_sec);
+ }
+
+ return all_members_time.end_time;
+ }
+
+ //---------------------------------------------------------------------------
+ static void show_dedup_ratio_estimate_fmt(const worker_stats_t &wrk_stats_sum,
+ const md5_stats_t &md5_stats_sum,
+ Formatter *fmt)
+ {
+ uint64_t s3_bytes_before = wrk_stats_sum.ingress_obj_bytes;
+ uint64_t s3_dedup_bytes = md5_stats_sum.big_objs_stat.dedup_bytes_estimate;
+ uint64_t s3_bytes_after = s3_bytes_before - s3_dedup_bytes;
+ Formatter::ObjectSection section{*fmt, "dedup_ratio_estimate"};
+ fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
+ fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
+ fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes_estimate);
+
+ if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
+ double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
+ fmt->dump_float("dedup_ratio", dedup_ratio);
+ }
+ else {
+ fmt->dump_float("dedup_ratio", 0);
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ static void show_dedup_ratio_actual_fmt(const worker_stats_t &wrk_stats_sum,
+ const md5_stats_t &md5_stats_sum,
+ Formatter *fmt)
+ {
+ uint64_t s3_bytes_before = wrk_stats_sum.ingress_obj_bytes;
+ uint64_t s3_dedup_bytes = (md5_stats_sum.deduped_objects_bytes +
+ md5_stats_sum.shared_manifest_dedup_bytes);
+ uint64_t s3_bytes_after = s3_bytes_before - s3_dedup_bytes;
+
+ Formatter::ObjectSection section{*fmt, "dedup_ratio_actual"};
+ fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
+ fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
+ fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes);
+ if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
+ double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
+ fmt->dump_float("dedup_ratio", dedup_ratio);
+ }
+ else {
+ fmt->dump_float("dedup_ratio", 0);
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ // command-line called from radosgw-admin.cc
+ int cluster::collect_all_shard_stats(rgw::sal::RadosStore *store,
+ Formatter *fmt,
+ const DoutPrefixProvider *dpp)
+ {
+ dedup_epoch_t epoch;
+ int ret = get_epoch(store, dpp, &epoch, nullptr);
+ if (ret != 0) {
+ return ret;
+ }
+
+ Formatter::ObjectSection section{*fmt, "DEDUP STAT COUNTERS"};
+ work_shard_t num_work_shards = epoch.num_work_shards;
+ md5_shard_t num_md5_shards = epoch.num_md5_shards;
+
+ unsigned completed_work_shards_count = 0;
+ unsigned completed_md5_shards_count = 0;
+ utime_t md5_start_time;
+ worker_stats_t wrk_stats_sum;
+ {
+ std::map<std::string, member_time_t> owner_map;
+ bool show_time = true;
+ bufferlist bl_arr[num_work_shards];
+ shard_progress_t sp_arr[num_work_shards];
+ int cnt = collect_shard_stats(store, dpp, epoch.time, num_work_shards,
+ WORKER_SHARD_PREFIX, bl_arr, sp_arr);
+ if (cnt != num_work_shards && 0) {
+ std::cerr << ">>>Partial work shard stats recived " << cnt << " / "
+ << num_work_shards << "\n" << std::endl;
+ }
+ bool has_incomplete_shards = false;
+ for (unsigned shard = 0; shard < num_work_shards; shard++) {
+ if (bl_arr[shard].length() == 0) {
+ has_incomplete_shards = true;
+ continue;
+ }
+ completed_work_shards_count++;
+ worker_stats_t stats;
+ try {
+ auto p = bl_arr[shard].cbegin();
+ decode(stats, p);
+ wrk_stats_sum += stats;
+ }catch (const buffer::error&) {
+ // TBD: can we use std::cerr or should we use formatter ??
+ std::cerr << __func__ << "::(2)failed worker_stats_t decode #" << shard << std::endl;
+ continue;
+ }
+ collect_single_shard_stats(dpp, owner_map, sp_arr, shard, &show_time, "WORKER");
+ }
+ Formatter::ObjectSection worker_stats(*fmt, "worker_stats");
+ wrk_stats_sum.dump(fmt);
+ show_incomplete_shards_fmt(has_incomplete_shards, num_work_shards, sp_arr, fmt);
+ md5_start_time = show_time_func_fmt(epoch.time, show_time, owner_map, fmt);
+ }
+
+ if (completed_work_shards_count == num_work_shards) {
+ std::map<std::string, member_time_t> owner_map;
+ bool show_time = true;
+ md5_stats_t md5_stats_sum;
+ bufferlist bl_arr[num_md5_shards];
+ shard_progress_t sp_arr[num_md5_shards];
+ int cnt = collect_shard_stats(store, dpp, epoch.time, num_md5_shards,
+ MD5_SHARD_PREFIX, bl_arr, sp_arr);
+ if (cnt != num_md5_shards && 0) {
+ std::cerr << ">>>Partial MD5_SHARD stats recived " << cnt << " / "
+ << num_md5_shards << "\n" << std::endl;
+ }
+ bool has_incomplete_shards = false;
+ for (unsigned shard = 0; shard < num_md5_shards; shard++) {
+ if (bl_arr[shard].length() == 0) {
+ has_incomplete_shards = true;
+ continue;
+ }
+ completed_md5_shards_count++;
+ md5_stats_t stats;
+ try {
+ auto p = bl_arr[shard].cbegin();
+ decode(stats, p);
+ md5_stats_sum += stats;
+ }catch (const buffer::error&) {
+ // TBD: can we use std::cerr or should we use formatter ??
+ std::cerr << __func__ << "::failed md5_stats_t decode #" << shard << std::endl;
+ continue;
+ }
+ collect_single_shard_stats(dpp, owner_map, sp_arr, shard, &show_time, "MD5");
+ }
+ {
+ Formatter::ObjectSection outer(*fmt, "md5_stats");
+ md5_stats_sum.dump(fmt);
+ show_incomplete_shards_fmt(has_incomplete_shards, num_md5_shards, sp_arr, fmt);
+ show_time_func_fmt(md5_start_time, show_time, owner_map, fmt);
+ }
+ show_dedup_ratio_estimate_fmt(wrk_stats_sum, md5_stats_sum, fmt);
+ show_dedup_ratio_actual_fmt(wrk_stats_sum, md5_stats_sum, fmt);
+ }
+
+ fmt->dump_bool("completed", (completed_md5_shards_count == num_md5_shards));
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int cluster::watch_reload(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider* dpp,
+ uint64_t *p_watch_handle,
+ librados::WatchCtx2 *ctx)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ const std::string & oid = DEDUP_WATCH_OBJ;
+ // create the object to watch (object may already exist)
+ bool exclusive = true;
+ ret = ctl_ioctx.create(oid, exclusive);
+ if (ret >= 0) {
+ ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid
+ << " was created!" << dendl;
+ }
+ else if (ret == -EEXIST) {
+ ldpp_dout(dpp, 5) << __func__ << "::"<< oid << " exists" << dendl;
+ }
+ else {
+ ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed ctl_ioctx.create("
+ << oid << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ ret = ctl_ioctx.watch2(oid, p_watch_handle, ctx);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed watch2() " << oid
+ << ". error: " << cpp_strerror(-ret) << dendl;
+ *p_watch_handle = 0;
+ return ret;
+ }
+ ldpp_dout(dpp, 5) << "dedup_bg::watch_reload(): Started watching "
+ << oid << "::watch_handle=" << *p_watch_handle << dendl;
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int cluster::unwatch_reload(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider* dpp,
+ uint64_t watch_handle)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ ret = ctl_ioctx.unwatch2(watch_handle);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload() failed unwatch2() "
+ << DEDUP_WATCH_OBJ << "::" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ int cluster::ack_notify(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ const control_t *p_ctl,
+ uint64_t notify_id,
+ uint64_t cookie,
+ int status)
+ {
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ ldpp_dout(dpp, 5) << __func__ << "::status=" << status << dendl;
+ bufferlist reply_bl;
+ ceph::encode(status, reply_bl);
+ encode(*p_ctl, reply_bl);
+ ctl_ioctx.notify_ack(DEDUP_WATCH_OBJ, notify_id, cookie, reply_bl);
+
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ // command-line called from radosgw-admin.cc
+ int cluster::dedup_control(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ urgent_msg_t urgent_msg)
+ {
+ ldpp_dout(dpp, 10) << __func__ << "::dedup_control req = "
+ << get_urgent_msg_names(urgent_msg) << dendl;
+ if (urgent_msg != URGENT_MSG_RESUME &&
+ urgent_msg != URGENT_MSG_PASUE &&
+ urgent_msg != URGENT_MSG_RESTART &&
+ urgent_msg != URGENT_MSG_ABORT) {
+ ldpp_dout(dpp, 1) << __func__ << "::illegal urgent_msg="<< urgent_msg << dendl;
+ return -EINVAL;
+ }
+
+ librados::IoCtx ctl_ioctx;
+ int ret = get_control_ioctx(store, dpp, ctl_ioctx);
+ if (unlikely(ret != 0)) {
+ return ret;
+ }
+
+ // 10 seconds timeout
+ const uint64_t timeout_ms = 10*1000;
+ bufferlist reply_bl, urgent_msg_bl;
+ ceph::encode(urgent_msg, urgent_msg_bl);
+ ret = rgw_rados_notify(dpp, ctl_ioctx, DEDUP_WATCH_OBJ, urgent_msg_bl,
+ timeout_ms, &reply_bl, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify("
+ << DEDUP_WATCH_OBJ << ")::err="<<cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ std::vector<librados::notify_ack_t> acks;
+ std::vector<librados::notify_timeout_t> timeouts;
+ ctl_ioctx.decode_notify_response(reply_bl, &acks, &timeouts);
+ if (timeouts.size() > 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify("
+ << DEDUP_WATCH_OBJ << ")::timeout error" << dendl;
+ return -EAGAIN;
+ }
+
+ for (auto& ack : acks) {
+ try {
+ ldpp_dout(dpp, 20) << __func__ << "::ACK: notifier_id=" << ack.notifier_id
+ << "::cookie=" << ack.cookie << dendl;
+ auto iter = ack.payload_bl.cbegin();
+ ceph::decode(ret, iter);
+ struct rgw::dedup::control_t ctl;
+ decode(ctl, iter);
+ ldpp_dout(dpp, 10) << __func__ << "::++ACK::ctl=" << ctl << "::ret=" << ret << dendl;
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 1) << __func__ << "::failed decoding notify acks" << dendl;
+ return -EINVAL;
+ }
+ if (ret != 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::Bad notify ack, ret=" << ret
+ << "::err=" << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ }
+ ldpp_dout(dpp, 10) << __func__ << "::" << get_urgent_msg_names(urgent_msg)
+ << " finished successfully!" << dendl;
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ // command-line called from radosgw-admin.cc
+ int cluster::dedup_restart_scan(rgw::sal::RadosStore *store,
+ dedup_req_type_t dedup_type,
+ const DoutPrefixProvider *dpp)
+ {
+ ldpp_dout(dpp, 1) << __func__ << "::dedup_type = " << dedup_type << dendl;
+
+ dedup_epoch_t old_epoch;
+ // store the previous epoch for cmp-swap
+ int ret = get_epoch(store, dpp, &old_epoch, __func__);
+ if (ret != 0) {
+ // generate an empty epoch with zero counters
+ std::string cluster_id("NULL_CLUSTER_ID");
+ ldpp_dout(dpp, 1) << __func__ << "::set empty EPOCH using cluster_id: "
+ << cluster_id << dendl;
+ set_epoch(store, cluster_id, dpp, 0, 0);
+ ret = get_epoch(store, dpp, &old_epoch, __func__);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ // first abort all dedup work!
+ ret = dedup_control(store, dpp, URGENT_MSG_ABORT);
+ if (ret != 0) {
+ return ret;
+ }
+#if 0
+ // then delete dedup-pool to ensure a clean start
+ const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
+ auto rados_handle = store->getRados()->get_rados_handle();
+ ldpp_dout(dpp, 5) <<__func__ << "::delete pool: " << dedup_pool.name << dendl;
+ rados_handle->pool_delete(dedup_pool.name.c_str());
+#endif
+
+ ldpp_dout(dpp, 10) << __func__ << dedup_type << dendl;
+#ifdef FULL_DEDUP_SUPPORT
+ ceph_assert(dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE ||
+ dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL);
+#else
+ ceph_assert(dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
+#endif
+ ret = swap_epoch(store, dpp, &old_epoch, dedup_type, 0, 0);
+ if (ret == 0) {
+ ldpp_dout(dpp, 10) << __func__ << "::Epoch object was reset" << dendl;
+ return dedup_control(store, dpp, URGENT_MSG_RESTART);
+ }
+ else {
+ return ret;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ bool cluster::can_start_new_scan(rgw::sal::RadosStore *store)
+ {
+ ldpp_dout(dpp, 10) << __func__ << "::epoch=" << d_epoch_time << dendl;
+ dedup_epoch_t new_epoch;
+ if (get_epoch(store, dpp, &new_epoch, nullptr) != 0) {
+ ldpp_dout(dpp, 1) << __func__ << "::No Epoch Object::"
+ << "::scan can be restarted!\n\n\n" << dendl;
+ // no epoch object exists -> we should start a new scan
+ return true;
+ }
+
+ if (new_epoch.time <= d_epoch_time) {
+ if (new_epoch.time == d_epoch_time) {
+ ldpp_dout(dpp, 10) << __func__ << "::Epoch hasn't change - > Do not restart scan!!" << dendl;
+ }
+ else {
+ ldpp_dout(dpp, 1) << __func__ << " ::Do not restart scan!\n epoch="
+ << d_epoch_time << "\nnew_epoch="<< new_epoch.time <<dendl;
+ }
+ return false;
+ }
+ // allow members to join within a 30 sec limit
+ utime_t limit = {30, 0};
+ utime_t now = ceph_clock_now();
+ ldpp_dout(dpp, 1) << __func__ << "\n::new_epoch=" << new_epoch.time
+ << "\n::now =" << now << dendl;
+ if ((now > new_epoch.time) && ((now - new_epoch.time) < limit)) {
+ ldpp_dout(dpp, 1) << __func__ << "::Epoch is less than 30 seconds old!"
+ << " Restart scan\n\n\n" << dendl;
+ return true;
+ }
+ ldpp_dout(dpp, 1) << "\n::new_epoch - now = " << (new_epoch.time - now)
+ << "\n::limit = " << limit << dendl;
+
+ if (new_epoch.time > now) {
+ ldpp_dout(dpp, 1) << ":new_epoch > now = TRUE " << dendl;
+ }
+ return false;
+ }
+} // namespace rgw::dedup
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+#include "common/dout.h"
+#include "rgw_dedup_utils.h"
+#include "rgw_dedup_store.h"
+#include <string>
+
+namespace rgw::dedup {
+ static constexpr const char* WORKER_SHARD_PREFIX = "WRK.SHRD.TK.";
+ static constexpr const char* MD5_SHARD_PREFIX = "MD5.SHRD.TK.";
+ struct control_t;
+ struct dedup_epoch_t;
+
+ class cluster{
+ public:
+ //==================================================================================
+ class shard_token_oid {
+ public:
+ //---------------------------------------------------------------------------
+ shard_token_oid(const char *prefix) {
+ this->prefix_len = snprintf(this->buff, BUFF_SIZE, "%s", prefix);
+ this->total_len = this->prefix_len;
+ }
+
+ //---------------------------------------------------------------------------
+ shard_token_oid(const char *prefix, uint16_t shard) {
+ this->prefix_len = snprintf(this->buff, BUFF_SIZE, "%s", prefix);
+ set_shard(shard);
+ }
+
+ //---------------------------------------------------------------------------
+ void set_shard(uint16_t shard) {
+ int n = snprintf(this->buff + this->prefix_len, BUFF_SIZE, "%03x", shard);
+ this->total_len = this->prefix_len + n;
+ }
+
+ //---------------------------------------------------------------------------
+ static bool legal_oid_name(const std::string& oid) {
+ return ((oid.length() <= BUFF_SIZE) &&
+ (oid.starts_with(WORKER_SHARD_PREFIX)||oid.starts_with(MD5_SHARD_PREFIX)));
+ }
+ inline const char* get_buff() { return this->buff; }
+ inline unsigned get_buff_size() { return this->total_len; }
+ private:
+ static const unsigned BUFF_SIZE = 15;
+ unsigned total_len = 0;
+ unsigned prefix_len = 0;
+ char buff[BUFF_SIZE];
+ };
+
+ //==================================================================================
+ cluster(const DoutPrefixProvider *_dpp,
+ CephContext* cct,
+ rgw::sal::Driver* driver);
+ int reset(rgw::sal::RadosStore *store,
+ struct dedup_epoch_t*,
+ work_shard_t num_work_shards,
+ md5_shard_t num_md5_shards);
+
+ utime_t get_epoch_time() { return d_epoch_time; }
+ work_shard_t get_next_work_shard_token(rgw::sal::RadosStore *store,
+ work_shard_t num_work_shards);
+ md5_shard_t get_next_md5_shard_token(rgw::sal::RadosStore *store,
+ md5_shard_t num_md5_shards);
+ bool can_start_new_scan(rgw::sal::RadosStore *store);
+ static int collect_all_shard_stats(rgw::sal::RadosStore *store,
+ Formatter *p_formatter,
+ const DoutPrefixProvider *dpp);
+ static int watch_reload(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider* dpp,
+ uint64_t *p_watch_handle,
+ librados::WatchCtx2 *ctx);
+ static int unwatch_reload(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider* dpp,
+ uint64_t watch_handle);
+ static int ack_notify(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ const struct control_t *p_ctl,
+ uint64_t notify_id,
+ uint64_t cookie,
+ int status);
+ static int dedup_control(rgw::sal::RadosStore *store,
+ const DoutPrefixProvider *dpp,
+ urgent_msg_t urgent_msg);
+ static int dedup_restart_scan(rgw::sal::RadosStore *store,
+ dedup_req_type_t dedup_type,
+ const DoutPrefixProvider *dpp);
+
+ //---------------------------------------------------------------------------
+ int mark_work_shard_token_completed(rgw::sal::RadosStore *store,
+ work_shard_t work_shard,
+ const worker_stats_t *p_stats)
+ {
+ ceph::bufferlist bl;
+ encode(*p_stats, bl);
+ d_num_completed_workers++;
+ d_completed_workers[work_shard] = TOKEN_STATE_COMPLETED;
+
+ return mark_shard_token_completed(store, work_shard, p_stats->ingress_obj,
+ WORKER_SHARD_PREFIX, bl);
+ }
+
+ //---------------------------------------------------------------------------
+ int mark_md5_shard_token_completed(rgw::sal::RadosStore *store,
+ md5_shard_t md5_shard,
+ const md5_stats_t *p_stats)
+ {
+ ceph::bufferlist bl;
+ encode(*p_stats, bl);
+ d_num_completed_md5++;
+ d_completed_md5[md5_shard] = TOKEN_STATE_COMPLETED;
+ return mark_shard_token_completed(store, md5_shard, p_stats->loaded_objects,
+ MD5_SHARD_PREFIX, bl);
+ }
+
+ int update_shard_token_heartbeat(rgw::sal::RadosStore *store,
+ unsigned shard,
+ uint64_t count_a,
+ uint64_t count_b,
+ const char *prefix);
+
+ //---------------------------------------------------------------------------
+ int all_work_shard_tokens_completed(rgw::sal::RadosStore *store,
+ work_shard_t num_work_shards)
+ {
+ return all_shard_tokens_completed(store, num_work_shards, WORKER_SHARD_PREFIX,
+ &d_num_completed_workers, d_completed_workers);
+ }
+
+ //---------------------------------------------------------------------------
+ int all_md5_shard_tokens_completed(rgw::sal::RadosStore *store,
+ md5_shard_t num_md5_shards)
+ {
+ return all_shard_tokens_completed(store, num_md5_shards, MD5_SHARD_PREFIX,
+ &d_num_completed_md5, d_completed_md5);
+ }
+
+ private:
+ static constexpr unsigned TOKEN_STATE_PENDING = 0x00;
+ static constexpr unsigned TOKEN_STATE_CORRUPTED = 0xCC;
+ static constexpr unsigned TOKEN_STATE_TIMED_OUT = 0xDD;
+ static constexpr unsigned TOKEN_STATE_COMPLETED = 0xFF;
+
+ void clear();
+ int all_shard_tokens_completed(rgw::sal::RadosStore *store,
+ unsigned shards_count,
+ const char *prefix,
+ uint16_t *p_num_completed,
+ uint8_t completed_arr[]);
+ int cleanup_prev_run(rgw::sal::RadosStore *store);
+ int32_t get_next_shard_token(rgw::sal::RadosStore *store,
+ uint16_t start_shard,
+ uint16_t max_count,
+ const char *prefix);
+ int create_shard_tokens(rgw::sal::RadosStore *store,
+ unsigned shards_count,
+ const char *prefix);
+ int verify_all_shard_tokens(rgw::sal::RadosStore *store,
+ unsigned shards_count,
+ const char *prefix);
+ int mark_shard_token_completed(rgw::sal::RadosStore *store,
+ unsigned shard,
+ uint64_t obj_count,
+ const char *prefix,
+ const bufferlist &bl);
+
+ const DoutPrefixProvider *dpp;
+ std::string d_lock_cookie;
+ std::string d_cluster_id;
+ md5_shard_t d_curr_md5_shard = 0;
+ work_shard_t d_curr_worker_shard = 0;
+ utime_t d_epoch_time;
+ utime_t d_token_creation_time;
+ uint8_t d_completed_workers[MAX_WORK_SHARD];
+ uint8_t d_completed_md5[MAX_MD5_SHARD];
+ uint16_t d_num_completed_workers = 0;
+ uint16_t d_num_completed_md5 = 0;
+ };
+
+} //namespace rgw::dedup
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "common/Clock.h" // for ceph_clock_now()
+#include "common/dout.h"
+#include "rgw_dedup_utils.h"
+
+#include <string>
+
+namespace rgw::dedup {
+ constexpr const char* RGW_DEDUP_ATTR_EPOCH = "rgw.dedup.attr.epoch";
+ //===========================================================================
+
+ struct dedup_epoch_t {
+ uint32_t serial;
+ dedup_req_type_t dedup_type;
+ utime_t time;
+ uint32_t num_work_shards = 0;
+ uint32_t num_md5_shards = 0;
+ };
+
+ //---------------------------------------------------------------------------
+ inline void encode(const dedup_epoch_t& o, ceph::bufferlist& bl)
+ {
+ ENCODE_START(1, 1, bl);
+ encode(o.serial, bl);
+ encode(static_cast<int32_t>(o.dedup_type), bl);
+ encode(o.time, bl);
+ encode(o.num_work_shards, bl);
+ encode(o.num_md5_shards, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ //---------------------------------------------------------------------------
+ inline void decode(dedup_epoch_t& o, ceph::bufferlist::const_iterator& bl)
+ {
+ DECODE_START(1, bl);
+ decode(o.serial, bl);
+ int32_t dedup_type;
+ decode(dedup_type, bl);
+ o.dedup_type = static_cast<dedup_req_type_t> (dedup_type);
+ decode(o.time, bl);
+ decode(o.num_work_shards, bl);
+ decode(o.num_md5_shards, bl);
+ DECODE_FINISH(bl);
+ }
+
+ //---------------------------------------------------------------------------
+ inline std::ostream& operator<<(std::ostream &out, const dedup_epoch_t &ep)
+ {
+ utime_t elapsed = ceph_clock_now() - ep.time;
+ out << "EPOCH::Time={" << ep.time.tv.tv_sec <<":"<< ep.time.tv.tv_nsec << "}::";
+ out << "Elapsed={" << elapsed.tv.tv_sec <<":"<< elapsed.tv.tv_nsec << "}::";
+ out << ep.dedup_type << "::serial=" << ep.serial;
+ out << "::num_work_shards=" << ep.num_work_shards;
+ out << "::num_md5_shards=" << ep.num_md5_shards;
+ return out;
+ }
+
+} //namespace rgw::dedup
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+#include "common/dout.h"
+#include <unordered_map>
+#include <cstring>
+#include <string>
+
+
+namespace rgw::dedup {
+ class remapper_t
+ {
+ public:
+ static inline constexpr uint8_t NULL_IDX = 0xFF;
+ remapper_t(uint32_t max_entries) : d_max_entries(max_entries) {}
+ uint8_t remap(const std::string &key,
+ const DoutPrefixProvider* dpp,
+ uint64_t *p_overflow_count) { // IN-OUT
+ uint8_t idx;
+
+ auto itr = d_map.find(key);
+ if (itr != d_map.end()) {
+ idx = itr->second;
+ ldpp_dout(dpp, 20) << __func__ << "::Existing key: " << key
+ << " is mapped to idx=" << (int)idx << dendl;
+ }
+ else if (d_num_entries < d_max_entries) {
+ // assign it the next entry
+ idx = d_num_entries++;
+ d_map[key] = idx;
+ ldpp_dout(dpp, 20) << __func__ << "::New key: " << key
+ << " was mapped to idx=" << (int)idx << dendl;
+ }
+ else {
+ (*p_overflow_count) ++;
+ ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed adding key: "
+ << key << dendl;
+ idx = NULL_IDX;
+ }
+
+ return idx;
+ }
+
+ private:
+ uint32_t d_num_entries = 0;
+ const uint32_t d_max_entries;
+ std::unordered_map<std::string, uint8_t> d_map;
+ };
+
+} //namespace rgw::dedup
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/rados_types.hpp"
+#include "include/rados/buffer.h"
+#include "include/rados/librados.hpp"
+#include "svc_zone.h"
+#include "common/config.h"
+#include "common/Cond.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "rgw_common.h"
+#include "include/denc.h"
+#include "rgw_sal.h"
+#include "driver/rados/rgw_sal_rados.h"
+#include "rgw_dedup_utils.h"
+#include "rgw_dedup.h"
+#include "rgw_dedup_store.h"
+#include "fmt/ranges.h"
+#include <span>
+
+namespace rgw::dedup {
+
+ //---------------------------------------------------------------------------
+ disk_record_t::disk_record_t(const rgw::sal::Bucket *p_bucket,
+ const std::string &obj_name,
+ const parsed_etag_t *p_parsed_etag,
+ uint64_t obj_size,
+ const std::string &storage_class)
+ {
+ this->s.rec_version = 0;
+ this->s.flags = 0;
+ this->s.num_parts = p_parsed_etag->num_parts;
+ this->obj_name = obj_name;
+ this->s.obj_name_len = this->obj_name.length();
+ this->bucket_name = p_bucket->get_name();
+ this->s.bucket_name_len = this->bucket_name.length();
+
+ this->s.md5_high = p_parsed_etag->md5_high;
+ this->s.md5_low = p_parsed_etag->md5_low;
+ this->s.obj_bytes_size = obj_size;
+ this->s.object_version = 0;
+
+ this->bucket_id = p_bucket->get_bucket_id();
+ this->s.bucket_id_len = this->bucket_id.length();
+ this->tenant_name = p_bucket->get_tenant();
+ this->s.tenant_name_len = this->tenant_name.length();
+ this->stor_class = storage_class;
+ this->s.stor_class_len = storage_class.length();
+
+ this->s.ref_tag_len = 0;
+ this->s.manifest_len = 0;
+
+ this->s.shared_manifest = 0;
+ memset(this->s.hash, 0, sizeof(this->s.hash));
+ this->ref_tag = "";
+ this->manifest_bl.clear();
+ }
+
+ //---------------------------------------------------------------------------
+ disk_record_t::disk_record_t(const char *buff)
+ {
+ disk_record_t *p_rec = (disk_record_t*)buff;
+ this->s.rec_version = p_rec->s.rec_version;
+ // wrong version, bail out
+ if (unlikely(p_rec->s.rec_version != 0)) {
+ return;
+ }
+
+ this->s.flags = p_rec->s.flags;
+ this->s.num_parts = CEPHTOH_16(p_rec->s.num_parts);
+ this->s.obj_name_len = CEPHTOH_16(p_rec->s.obj_name_len);
+ this->s.bucket_name_len = CEPHTOH_16(p_rec->s.bucket_name_len);
+
+ this->s.md5_high = CEPHTOH_64(p_rec->s.md5_high);
+ this->s.md5_low = CEPHTOH_64(p_rec->s.md5_low);
+ this->s.obj_bytes_size = CEPHTOH_64(p_rec->s.obj_bytes_size);
+ this->s.object_version = CEPHTOH_64(p_rec->s.object_version);
+
+ this->s.bucket_id_len = CEPHTOH_16(p_rec->s.bucket_id_len);
+ this->s.tenant_name_len = CEPHTOH_16(p_rec->s.tenant_name_len);
+ this->s.stor_class_len = CEPHTOH_16(p_rec->s.stor_class_len);
+ this->s.ref_tag_len = CEPHTOH_16(p_rec->s.ref_tag_len);
+ this->s.manifest_len = CEPHTOH_16(p_rec->s.manifest_len);
+
+ const char *p = buff + sizeof(this->s);
+ this->obj_name = std::string(p, this->s.obj_name_len);
+ p += p_rec->s.obj_name_len;
+
+ this->bucket_name = std::string(p, this->s.bucket_name_len);
+ p += p_rec->s.bucket_name_len;
+
+ this->bucket_id = std::string(p, this->s.bucket_id_len);
+ p += p_rec->s.bucket_id_len;
+
+ this->tenant_name = std::string(p, this->s.tenant_name_len);
+ p += p_rec->s.tenant_name_len;
+
+ this->stor_class = std::string(p, this->s.stor_class_len);
+ p += p_rec->s.stor_class_len;
+
+ if (p_rec->s.flags.is_fastlane()) {
+ // TBD:: remove asserts
+ ceph_assert(this->s.ref_tag_len == 0);
+ ceph_assert(this->s.manifest_len == 0);
+ }
+ else {
+ this->s.shared_manifest = CEPHTOH_64(p_rec->s.shared_manifest);
+ // BLAKE3 hash has 256 bit splitted into multiple 64bit units
+ const unsigned units = (256 / (sizeof(uint64_t)*8));
+ static_assert(units == 4);
+ for (unsigned i = 0; i < units; i++) {
+ this->s.hash[i] = CEPHTOH_64(p_rec->s.hash[i]);
+ }
+ this->ref_tag = std::string(p, this->s.ref_tag_len);
+ p += p_rec->s.ref_tag_len;
+
+ this->manifest_bl.append(p, this->s.manifest_len);
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ size_t disk_record_t::serialize(char *buff) const
+ {
+ ceph_assert(this->s.rec_version == 0);
+ disk_record_t *p_rec = (disk_record_t*)buff;
+ p_rec->s.rec_version = 0;
+ p_rec->s.flags = this->s.flags;
+ p_rec->s.num_parts = HTOCEPH_16(this->s.num_parts);
+ p_rec->s.obj_name_len = HTOCEPH_16(this->obj_name.length());
+ p_rec->s.bucket_name_len = HTOCEPH_16(this->bucket_name.length());
+
+ p_rec->s.md5_high = HTOCEPH_64(this->s.md5_high);
+ p_rec->s.md5_low = HTOCEPH_64(this->s.md5_low);
+ p_rec->s.obj_bytes_size = HTOCEPH_64(this->s.obj_bytes_size);
+ p_rec->s.object_version = HTOCEPH_64(this->s.object_version);
+
+ p_rec->s.bucket_id_len = HTOCEPH_16(this->bucket_id.length());
+ p_rec->s.tenant_name_len = HTOCEPH_16(this->tenant_name.length());
+ p_rec->s.stor_class_len = HTOCEPH_16(this->stor_class.length());
+ p_rec->s.ref_tag_len = HTOCEPH_16(this->ref_tag.length());
+ p_rec->s.manifest_len = HTOCEPH_16(this->manifest_bl.length());
+ char *p = buff + sizeof(this->s);
+ unsigned len = this->obj_name.length();
+ std::memcpy(p, this->obj_name.data(), len);
+ p += len;
+
+ len = this->bucket_name.length();
+ std::memcpy(p, this->bucket_name.data(), len);
+ p += len;
+
+ len = this->bucket_id.length();
+ std::memcpy(p, this->bucket_id.data(), len);
+ p += len;
+
+ len = this->tenant_name.length();
+ std::memcpy(p, this->tenant_name.data(), len);
+ p += len;
+
+ len = this->stor_class.length();
+ std::memcpy(p, this->stor_class.data(), len);
+ p += len;
+
+ if (this->s.flags.is_fastlane()) {
+ // TBD:: remove asserts
+ ceph_assert(this->s.ref_tag_len == 0);
+ ceph_assert(this->s.manifest_len == 0);
+ }
+ else {
+ p_rec->s.shared_manifest = HTOCEPH_64(this->s.shared_manifest);
+ // BLAKE3 hash has 256 bit splitted into multiple 64bit units
+ const unsigned units = (256 / (sizeof(uint64_t)*8));
+ static_assert(units == 4);
+ for (unsigned i = 0; i < units; i++) {
+ p_rec->s.hash[i] = HTOCEPH_64(this->s.hash[i]);
+ }
+ len = this->ref_tag.length();
+ std::memcpy(p, this->ref_tag.data(), len);
+ p += len;
+
+ len = this->manifest_bl.length();
+ const char *p_manifest = const_cast<disk_record_t*>(this)->manifest_bl.c_str();
+ std::memcpy(p, p_manifest, len);
+ p += len;
+ }
+ return (p - buff);
+ }
+
+ //---------------------------------------------------------------------------
+ size_t disk_record_t::length() const
+ {
+ return (sizeof(this->s) +
+ this->obj_name.length() +
+ this->bucket_name.length() +
+ this->bucket_id.length() +
+ this->tenant_name.length() +
+ this->stor_class.length() +
+ this->ref_tag.length() +
+ this->manifest_bl.length());
+ }
+
+ //---------------------------------------------------------------------------
+ int disk_record_t::validate(const char *caller,
+ const DoutPrefixProvider* dpp,
+ disk_block_id_t block_id,
+ record_id_t rec_id) const
+ {
+ // optimistic approach
+ if (likely((this->s.rec_version == 0) && (this->length() <= MAX_REC_SIZE))) {
+ ldpp_dout(dpp, 20) << __func__ << "::success" << dendl;
+ return 0;
+ }
+
+ // wrong version
+ if (this->s.rec_version != 0) {
+ // TBD
+ //p_stats->failed_wrong_ver++;
+ ldpp_dout(dpp, 5) << __func__ << "::" << caller << "::ERR: Bad record version: "
+ << this->s.rec_version
+ << "::block_id=" << block_id
+ << "::rec_id=" << rec_id
+ << dendl;
+ return -EPROTO; // Protocol error
+ }
+
+ // if arrived here record size is too large
+ // TBD
+ //p_stats->failed_rec_overflow++;
+ ldpp_dout(dpp, 5) << __func__ << "::" << caller << "::ERR: record size too big: "
+ << this->length()
+ << "::block_id=" << block_id
+ << "::rec_id=" << rec_id
+ << dendl;
+ return -EOVERFLOW; // maybe should use -E2BIG ??
+ }
+
+ //---------------------------------------------------------------------------
+ std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec)
+ {
+ stream << rec.obj_name << "::" << rec.s.obj_name_len << "\n";
+ stream << rec.bucket_name << "::" << rec.s.bucket_name_len << "\n";
+ stream << rec.bucket_id << "::" << rec.s.bucket_id_len << "\n";
+ stream << rec.tenant_name << "::" << rec.s.tenant_name_len << "\n";
+ stream << rec.stor_class << "::" << rec.s.stor_class_len << "\n";
+ stream << rec.ref_tag << "::" << rec.s.ref_tag_len << "\n";
+ stream << "num_parts = " << rec.s.num_parts << "\n";
+ stream << "obj_size = " << rec.s.obj_bytes_size/1024 <<" KiB" << "\n";
+ stream << "MD5 = " << std::hex << rec.s.md5_high << rec.s.md5_low << "\n";
+ stream << "HASH = ";
+ // BLAKE3 hash has 256 bit splitted into multiple 64bit units
+ const unsigned units = (256 / (sizeof(uint64_t)*8));
+ static_assert(units == 4);
+ for (unsigned i = 0; i < units; i++) {
+ stream << rec.s.hash[i];
+ }
+ stream << "\n";
+
+ if (rec.has_shared_manifest()) {
+ stream << "Shared Manifest Object\n";
+ }
+ else {
+ stream << "Dedicated Manifest Object\n";
+ }
+ stream << "Manifest len=" << rec.s.manifest_len << "\n";
+ return stream;
+ }
+
+ //---------------------------------------------------------------------------
+ void disk_block_t::init(work_shard_t worker_id, uint32_t seq_number)
+ {
+ disk_block_header_t *p_header = get_header();
+ p_header->offset = sizeof(disk_block_header_t);
+ p_header->rec_count = 0;
+ p_header->block_id = disk_block_id_t(worker_id, seq_number);
+ }
+
+ //---------------------------------------------------------------------------
+ int disk_block_header_t::verify(disk_block_id_t expected_block_id, const DoutPrefixProvider* dpp)
+ {
+ if (unlikely(offset != BLOCK_MAGIC && offset != LAST_BLOCK_MAGIC)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR::bad magic number (0x" << std::hex << offset << std::dec << ")" << dendl;
+ return -EINVAL;
+ }
+
+ if (unlikely(rec_count > MAX_REC_IN_BLOCK) ) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR::rec_count=" << rec_count << " > MAX_REC_IN_BLOCK" << dendl;
+ return -EINVAL;
+ }
+
+ if (unlikely(this->block_id != expected_block_id)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR::block_id=" << block_id
+ << "!= expected_block_id=" << expected_block_id << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ record_id_t disk_block_t::add_record(const disk_record_t *p_rec,
+ const DoutPrefixProvider *dpp)
+ {
+ disk_block_header_t *p_header = get_header();
+ if (unlikely(p_header->rec_count >= MAX_REC_IN_BLOCK)) {
+ ldpp_dout(dpp, 20) << __func__ << "::rec_count=" << p_header->rec_count
+ << ", MAX_REC_IN_BLOCK=" << MAX_REC_IN_BLOCK << dendl;
+ return MAX_REC_IN_BLOCK;
+ }
+
+ if ((DISK_BLOCK_SIZE - p_header->offset) >= p_rec->length()) {
+ p_header->rec_offsets[p_header->rec_count] = p_header->offset;
+ unsigned rec_id = p_header->rec_count;
+ p_header->rec_count ++;
+ p_rec->serialize(data+p_header->offset);
+ p_header->offset += p_rec->length();
+ return rec_id;
+ }
+ else {
+ return MAX_REC_IN_BLOCK;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ void disk_block_t::close_block(const DoutPrefixProvider* dpp, bool has_more)
+ {
+ disk_block_header_t *p_header = get_header();
+ ldpp_dout(dpp, 20) << __func__ << "::rec_count=" << p_header->rec_count
+ << ", has_more=" << (has_more? "TRUE" : "FALSE") << dendl;
+
+ memset(data + p_header->offset, 0, (DISK_BLOCK_SIZE - p_header->offset));
+ if (has_more) {
+ p_header->offset = HTOCEPH_16(BLOCK_MAGIC);
+ }
+ else {
+ p_header->offset = HTOCEPH_16(LAST_BLOCK_MAGIC);
+ }
+ for (unsigned i = 0; i < p_header->rec_count; i++) {
+ p_header->rec_offsets[i] = HTOCEPH_16(p_header->rec_offsets[i]);
+ }
+ p_header->rec_count = HTOCEPH_16(p_header->rec_count);
+ p_header->block_id = HTOCEPH_32((uint32_t)p_header->block_id);
+ // TBD: CRC
+ }
+
+ //---------------------------------------------------------------------------
+ void disk_block_header_t::deserialize()
+ {
+ this->offset = CEPHTOH_16(this->offset);
+ this->rec_count = CEPHTOH_16(this->rec_count);
+ this->block_id = CEPHTOH_32((uint32_t)this->block_id);
+ for (unsigned i = 0; i < this->rec_count; i++) {
+ this->rec_offsets[i] = CEPHTOH_16(this->rec_offsets[i]);
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ disk_block_seq_t::disk_block_seq_t(const DoutPrefixProvider* dpp_in,
+ disk_block_t *p_arr_in,
+ work_shard_t worker_id,
+ md5_shard_t md5_shard,
+ worker_stats_t *p_stats_in)
+ {
+ activate(dpp_in, p_arr_in, worker_id, md5_shard, p_stats_in);
+ }
+
+ //---------------------------------------------------------------------------
+ void disk_block_seq_t::activate(const DoutPrefixProvider* dpp_in,
+ disk_block_t *p_arr_in,
+ work_shard_t worker_id,
+ md5_shard_t md5_shard,
+ worker_stats_t *p_stats_in)
+ {
+ dpp = dpp_in;
+ p_arr = p_arr_in;
+ d_worker_id = worker_id;
+ d_md5_shard = md5_shard;
+ p_stats = p_stats_in;
+ p_curr_block = nullptr;
+ d_seq_number = 0;
+
+ memset(p_arr, 0, sizeof(disk_block_t));
+ slab_reset();
+ }
+
+ //---------------------------------------------------------------------------
+ [[maybe_unused]]static int print_manifest(const DoutPrefixProvider *dpp,
+ RGWRados *rados,
+ const bufferlist &manifest_bl)
+ {
+ RGWObjManifest manifest;
+ try {
+ auto bl_iter = manifest_bl.cbegin();
+ decode(manifest, bl_iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERROR: unable to decode manifest" << dendl;
+ return -EINVAL;
+ }
+
+ unsigned idx = 0;
+ for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
+ rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
+ ldpp_dout(dpp, 20) << idx << "] " << raw_obj.oid << dendl;
+ }
+ ldpp_dout(dpp, 20) << "==============================================" << dendl;
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ std::ostream& operator<<(std::ostream& out, const disk_block_id_t& block_id)
+ {
+ std::ios_base::fmtflags flags = out.flags();
+ out << std::hex << "0x"
+ << (uint32_t)block_id.get_work_shard_id() << "::"
+ << (uint32_t)block_id.get_slab_id() << "::"
+ << (uint32_t)block_id.get_block_offset();
+
+ if (flags & std::ios::dec) {
+ out << std::dec;
+ }
+ return out;
+ }
+
+ //---------------------------------------------------------------------------
+ std::string disk_block_id_t::get_slab_name(md5_shard_t md5_shard) const
+ {
+ // SLAB.MD5_ID.WORKER_ID.SLAB_SEQ_ID
+ const char *SLAB_NAME_FORMAT = "SLB.%03X.%02X.%04X";
+ static constexpr uint32_t SLAB_NAME_SIZE = 16;
+ char name_buf[SLAB_NAME_SIZE];
+ slab_id_t slab_id = get_slab_id();
+ work_shard_t work_id = get_work_shard_id();
+ unsigned n = snprintf(name_buf, sizeof(name_buf), SLAB_NAME_FORMAT,
+ md5_shard, work_id, slab_id);
+ std::string oid(name_buf, n);
+ return oid;
+ }
+
+ //---------------------------------------------------------------------------
+ int load_record(librados::IoCtx &ioctx,
+ const disk_record_t *p_tgt_rec,
+ disk_record_t *p_src_rec, /* OUT */
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ md5_shard_t md5_shard,
+ const DoutPrefixProvider *dpp)
+ {
+ std::string oid(block_id.get_slab_name(md5_shard));
+ int read_len = DISK_BLOCK_SIZE;
+ static_assert(sizeof(disk_block_t) == DISK_BLOCK_SIZE);
+ int byte_offset = block_id.get_block_offset() * DISK_BLOCK_SIZE;
+ bufferlist bl;
+ int ret = ioctx.read(oid, bl, read_len, byte_offset);
+ if (unlikely(ret != read_len)) {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read block from " << oid
+ << "::ret=" << ret << "::err=" << cpp_strerror(-ret)<<dendl;
+ return ret;
+ }
+ else {
+ ldpp_dout(dpp, 20) << __func__ << "::oid=" << oid << "::ret=" << ret
+ << "::len=" << bl.length() << dendl;
+ }
+
+ const char *p = bl.c_str();
+ disk_block_t *p_disk_block = (disk_block_t*)p;
+ disk_block_header_t *p_header = p_disk_block->get_header();
+ p_header->deserialize();
+ ret = p_header->verify(block_id, dpp);
+ if (ret != 0) {
+ return ret;
+ }
+
+ unsigned offset = p_header->rec_offsets[rec_id];
+ // We deserialize the record inside the CTOR
+ disk_record_t rec(p + offset);
+ ret = rec.validate(__func__, dpp, block_id, rec_id);
+ if (unlikely(ret != 0)) {
+ //p_stats->failed_rec_load++;
+ return ret;
+ }
+
+ if (rec.s.md5_high == p_tgt_rec->s.md5_high &&
+ rec.s.md5_low == p_tgt_rec->s.md5_low &&
+ rec.s.num_parts == p_tgt_rec->s.num_parts &&
+ rec.s.obj_bytes_size == p_tgt_rec->s.obj_bytes_size &&
+ rec.stor_class == p_tgt_rec->stor_class) {
+
+ *p_src_rec = rec;
+ return 0;
+ }
+ else {
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: Bad record in block=" << block_id
+ << ", rec_id=" << rec_id << dendl;
+ return -EIO;
+ }
+
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ [[maybe_unused]]static void
+ copy_bl_multi_parts(const bufferlist &bl_in, bufferlist &bl_out,
+ const DoutPrefixProvider* dpp)
+ {
+ const size_t MAX = 260*1024;
+ char buff[MAX];
+ std::srand(std::time({}));
+
+ std::vector<int> vec;
+ auto bl_itr = bl_in.cbegin();
+ size_t len = bl_in.length();
+ while (len) {
+ const int random_value = std::rand();
+ size_t req_len = std::min((random_value % MAX), len);
+ if (len < MAX) {
+ req_len = len;
+ }
+ vec.push_back(req_len);
+ const char *p = get_next_data_ptr(bl_itr, buff, req_len, dpp);
+ bufferptr ptr(p, req_len);
+ bl_out.append(ptr);
+ len -= req_len;
+ }
+ ldpp_dout(dpp, 20) << __func__ << "::req_len=" << vec << dendl;
+ }
+
+ //---------------------------------------------------------------------------
+ int load_slab(librados::IoCtx &ioctx,
+ bufferlist &bl_out,
+ md5_shard_t md5_shard,
+ work_shard_t worker_id,
+ uint32_t seq_number,
+ const DoutPrefixProvider* dpp)
+ {
+ disk_block_id_t block_id(worker_id, seq_number);
+ std::string oid(block_id.get_slab_name(md5_shard));
+ ldpp_dout(dpp, 20) << __func__ << "::worker_id=" << (uint32_t)worker_id
+ << ", md5_shard=" << (uint32_t)md5_shard
+ << ", seq_number=" << seq_number
+ << ":: oid=" << oid << dendl;
+#ifndef DEBUG_FRAGMENTED_BUFFERLIST
+ int ret = ioctx.read(oid, bl_out, 0, 0);
+ if (ret > 0) {
+ ldpp_dout(dpp, 20) << __func__ << "::oid=" << oid << ", len="
+ << bl_out.length() << dendl;
+ }
+#else
+ // DEBUG MODE to test with fragmented bufferlist
+ bufferlist bl_in;
+ // read full object
+ int ret = ioctx.read(oid, bl_in, 0, 0);
+ if (ret > 0) {
+ ldpp_dout(dpp, 20) << __func__ << "::oid=" << oid << ", len="
+ << bl_in.length() << dendl;
+ copy_bl_multi_parts(bl_in, bl_out, dpp);
+ }
+#endif
+ else {
+ if (ret == 0) {
+ // no error reported, but we read nothing which should never happen
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: Empty SLAB " << oid << dendl;
+ ret = -ENODATA;
+ }
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: failed to read " << oid
+ << ", error is " << cpp_strerror(-ret) << dendl;
+ }
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int store_slab(librados::IoCtx &ioctx,
+ bufferlist &bl,
+ md5_shard_t md5_shard,
+ work_shard_t worker_id,
+ uint32_t seq_number,
+ const DoutPrefixProvider* dpp)
+ {
+ disk_block_id_t block_id(worker_id, seq_number);
+ std::string oid(block_id.get_slab_name(md5_shard));
+ ldpp_dout(dpp, 20) << __func__ << "::oid=" << oid << ", len="
+ << bl.length() << dendl;
+ ceph_assert(bl.length());
+
+ int ret = ioctx.write_full(oid, bl);
+ if (ret == (int)bl.length()) {
+ ldpp_dout(dpp, 20) << __func__ << "::wrote " << bl.length() << " bytes to "
+ << oid << dendl;
+ }
+ else {
+ if (ret == 0) {
+ // no error reported, but we wrote nothing which should never happen
+ ldpp_dout(dpp, 5) << __func__ << "::ERR: No Data was written to " << oid
+ << ", bl.length()=" << bl.length() << dendl;
+ ret = -ENODATA;
+ }
+ ldpp_dout(dpp, 1) << "ERROR: failed to write " << oid
+ << " with: " << cpp_strerror(-ret) << dendl;
+ }
+
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int disk_block_seq_t::flush(librados::IoCtx &ioctx)
+ {
+ unsigned len = (p_curr_block + 1 - p_arr) * sizeof(disk_block_t);
+ bufferlist bl = bufferlist::static_from_mem((char*)p_arr, len);
+ int ret = store_slab(ioctx, bl, d_md5_shard, d_worker_id, d_seq_number, dpp);
+ // Need to make sure the call to rgw_put_system_obj was fully synchronous
+
+ // d_seq_number++ must be called **after** flush!!
+ d_seq_number++;
+ p_stats->egress_slabs++;
+ slab_reset();
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int disk_block_seq_t::flush_disk_records(librados::IoCtx &ioctx)
+ {
+ ceph_assert(p_arr);
+ ldpp_dout(dpp, 20) << __func__ << "::worker_id=" << (uint32_t)d_worker_id
+ << ", md5_shard=" << (uint32_t)d_md5_shard << dendl;
+
+ // we need to force flush at the end of a cycle even if there was no work done
+ // it is used as a signal to worker in the next step
+ if (p_curr_block == &p_arr[0] && p_curr_block->is_empty()) {
+ ldpp_dout(dpp, 20) << __func__ << "::Empty buffers, generate terminating block" << dendl;
+ }
+ p_stats->egress_blocks++;
+ p_curr_block->close_block(dpp, false);
+
+ int ret = flush(ioctx);
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ int disk_block_seq_t::add_record(librados::IoCtx &ioctx,
+ const disk_record_t *p_rec, // IN-OUT
+ record_info_t *p_rec_info) // OUT-PARAM
+ {
+ disk_block_id_t null_block_id;
+ int ret = p_rec->validate(__func__, dpp, null_block_id, MAX_REC_IN_BLOCK);
+ if (unlikely(ret != 0)) {
+ // TBD
+ //p_stats->failed_rec_store++;
+ return ret;
+ }
+
+ p_stats->egress_records ++;
+ // first, try and add the record to the current open block
+ p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp);
+ if (p_rec_info->rec_id < MAX_REC_IN_BLOCK) {
+ p_rec_info->block_id = p_curr_block->get_block_id();
+ return 0;
+ }
+ else {
+ // Not enough space left in current block, close it and open the next block
+ ldpp_dout(dpp, 20) << __func__ << "::Block is full-> close and move to next" << dendl;
+ p_stats->egress_blocks++;
+ p_curr_block->close_block(dpp, true);
+ }
+
+ // Do we have more Blocks in the block-array ?
+ if (p_curr_block < last_block()) {
+ p_curr_block ++;
+ d_seq_number ++;
+ p_curr_block->init(d_worker_id, d_seq_number);
+ p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp);
+ }
+ else {
+ ldpp_dout(dpp, 20) << __func__ << "::calling flush()" << dendl;
+ ret = flush(ioctx);
+ p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp);
+ }
+
+ p_rec_info->block_id = p_curr_block->get_block_id();
+ return ret;
+ }
+
+ //---------------------------------------------------------------------------
+ disk_block_array_t::disk_block_array_t(const DoutPrefixProvider* dpp,
+ uint8_t *raw_mem,
+ uint64_t raw_mem_size,
+ work_shard_t worker_id,
+ worker_stats_t *p_stats,
+ md5_shard_t num_md5_shards)
+ {
+ d_num_md5_shards = num_md5_shards;
+ d_worker_id = worker_id;
+ disk_block_t *p = (disk_block_t *)raw_mem;
+ disk_block_t *p_end = (disk_block_t *)(raw_mem + raw_mem_size);
+
+ for (unsigned md5_shard = 0; md5_shard < d_num_md5_shards; md5_shard++) {
+ ldpp_dout(dpp, 20) << __func__ << "::p=" << p << "::p_end=" << p_end << dendl;
+ if (p + DISK_BLOCK_COUNT <= p_end) {
+ d_disk_arr[md5_shard].activate(dpp, p, d_worker_id, md5_shard, p_stats);
+ p += DISK_BLOCK_COUNT;
+ }
+ else {
+ ldpp_dout(dpp, 1) << __func__ << "::ERR: buffer overflow! "
+ << "::md5_shard=" << md5_shard << "/" << d_num_md5_shards
+ << "::raw_mem_size=" << raw_mem_size << dendl;
+ ldpp_dout(dpp, 1) << __func__
+ << "::sizeof(disk_block_t)=" << sizeof(disk_block_t)
+ << "::DISK_BLOCK_COUNT=" << DISK_BLOCK_COUNT << dendl;
+ ceph_abort();
+ }
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ void disk_block_array_t::flush_output_buffers(const DoutPrefixProvider* dpp,
+ librados::IoCtx &ioctx)
+ {
+ for (md5_shard_t md5_shard = 0; md5_shard < d_num_md5_shards; md5_shard++) {
+ ldpp_dout(dpp, 20) <<__func__ << "::flush buffers:: worker_id="
+ << d_worker_id<< ", md5_shard=" << md5_shard << dendl;
+ d_disk_arr[md5_shard].flush_disk_records(ioctx);
+ }
+ }
+} // namespace rgw::dedup
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+#include "common/dout.h"
+#include "rgw_common.h"
+#include "rgw_realm_reloader.h"
+#include <string>
+#include <unordered_map>
+#include <variant>
+#include <iostream>
+#include <ostream>
+#include <cstring>
+#include <string>
+#include "include/rados/rados_types.hpp"
+#include "include/rados/buffer.h"
+#include "include/rados/librados.hpp"
+#include "rgw_dedup_utils.h"
+#include "BLAKE3/c/blake3.h"
+
+namespace rgw::dedup {
+ struct key_t;
+#define CEPHTOH_16 le16toh
+#define CEPHTOH_32 le32toh
+#define CEPHTOH_64 le64toh
+#define HTOCEPH_16 htole16
+#define HTOCEPH_32 htole32
+#define HTOCEPH_64 htole64
+
+ static inline constexpr unsigned DISK_BLOCK_SIZE = 8*1024;
+ // we use 16 bit offset
+ static_assert(DISK_BLOCK_SIZE < 64*1024);
+ static constexpr unsigned DISK_BLOCK_COUNT = 256;
+ static_assert(DISK_BLOCK_COUNT <= (4*1024*1024/DISK_BLOCK_SIZE));
+ static constexpr unsigned MAX_REC_IN_BLOCK = 32;
+ // we use 8bit record indices
+ static_assert(MAX_REC_IN_BLOCK < 0xFF);
+ using slab_id_t = uint16_t;
+ using block_offset_t = uint8_t;
+ using record_id_t = uint8_t;
+
+ // disk_block_id_t is a 32 bits concataion of shard_id, slab_id and block_off
+ // ---8---- | -------16------- | ---8----
+ // shard_id | slab_id | block_off
+ struct __attribute__ ((packed)) disk_block_id_t
+ {
+ public:
+ disk_block_id_t() {
+ block_id = 0;
+ }
+
+ disk_block_id_t(work_shard_t shard_id, uint32_t seq_number) {
+ ceph_assert((seq_number & SEQ_NUMBER_MASK) == seq_number);
+ ceph_assert(shard_id <= MAX_WORK_SHARD);
+ block_id = (uint32_t)shard_id << OBJ_SHARD_SHIFT | seq_number;
+ }
+
+ disk_block_id_t& operator =(const disk_block_id_t &other) {
+ this->block_id = other.block_id;
+ return *this;
+ }
+
+ inline disk_block_id_t& operator =(uint32_t val) {
+ this->block_id = val;
+ return *this;
+ }
+
+ inline bool operator ==(const disk_block_id_t &other) const {
+ return (this->block_id == other.block_id);
+ }
+
+ inline explicit operator uint32_t() const {
+ return this->block_id;
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const disk_block_id_t& block_id);
+
+ std::string get_slab_name(md5_shard_t md5_shard) const;
+
+ static inline slab_id_t seq_num_to_slab_id(uint32_t seq_number) {
+ return (seq_number & SLAB_ID_MASK) >> SLAB_ID_SHIFT;
+ }
+
+ static inline uint32_t slab_id_to_seq_num(uint32_t slab_id) {
+ return (slab_id << SLAB_ID_SHIFT);
+ }
+
+ inline block_offset_t get_block_offset() const {
+ return get_block_offset(get_seq_num());
+ }
+
+ inline work_shard_t get_work_shard_id() const {
+ return (block_id & OBJ_SHARD_MASK) >> OBJ_SHARD_SHIFT;
+ }
+
+ private:
+ inline uint32_t get_seq_num() const {
+ return (block_id & SEQ_NUMBER_MASK);
+ }
+
+ inline slab_id_t get_slab_id() const {
+ return seq_num_to_slab_id(get_seq_num());
+ }
+
+ inline block_offset_t get_block_offset(uint32_t seq_number) const {
+ return (seq_number & BLOCK_OFF_MASK);
+ }
+
+ static constexpr uint32_t OBJ_SHARD_SHIFT = 24;
+ static constexpr uint32_t OBJ_SHARD_MASK = 0xFF000000;
+
+ static constexpr uint32_t SEQ_NUMBER_SHIFT = 0;
+ static constexpr uint32_t SEQ_NUMBER_MASK = 0x00FFFFFF;
+
+ static constexpr uint32_t SLAB_ID_SHIFT = 8;
+ static constexpr uint32_t SLAB_ID_MASK = 0x00FFFF00;
+
+ static constexpr uint32_t BLOCK_OFF_SHIFT = 0;
+ static constexpr uint32_t BLOCK_OFF_MASK = 0x000000FF;
+
+ uint32_t block_id;
+ };
+
+ struct disk_record_t
+ {
+ disk_record_t(const char *buff);
+ disk_record_t(const rgw::sal::Bucket *p_bucket,
+ const std::string &obj_name,
+ const parsed_etag_t *p_parsed_etag,
+ uint64_t obj_size,
+ const std::string &storage_class);
+ disk_record_t() {}
+ size_t serialize(char *buff) const;
+ size_t length() const;
+ int validate(const char *caller,
+ const DoutPrefixProvider* dpp,
+ disk_block_id_t block_id,
+ record_id_t rec_id) const;
+ inline bool has_shared_manifest() const { return s.flags.has_shared_manifest(); }
+ inline void set_shared_manifest() { s.flags.set_shared_manifest(); }
+
+ struct __attribute__ ((packed)) packed_rec_t
+ {
+ uint8_t rec_version; // allows changing record format
+ dedup_flags_t flags; // 1 Byte flags
+ uint16_t num_parts; // For multipart upload (AWS MAX-PART is 10,000)
+ uint16_t obj_name_len;
+ uint16_t bucket_name_len;
+
+ uint64_t md5_high; // High Bytes of the Object Data MD5
+ uint64_t md5_low; // Low Bytes of the Object Data MD5
+ uint64_t obj_bytes_size;
+ uint64_t object_version;
+
+ uint16_t bucket_id_len;
+ uint16_t tenant_name_len;
+ uint16_t stor_class_len;
+ uint16_t ref_tag_len;
+
+ uint16_t manifest_len;
+ uint8_t pad[6];
+
+ uint64_t shared_manifest; // 64bit hash of the SRC object manifest
+ uint64_t hash[4]; // 4 * 8 Bytes of BLAKE3
+ }s;
+ std::string obj_name;
+ // TBD: find pool name making it easier to get ioctx
+ std::string bucket_name;
+ std::string bucket_id;
+ std::string tenant_name;
+ std::string ref_tag;
+ std::string stor_class;
+ bufferlist manifest_bl;
+ };
+ static_assert(BLAKE3_OUT_LEN == sizeof(disk_record_t::packed_rec_t::hash));
+ std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec);
+
+ static constexpr unsigned BLOCK_MAGIC = 0xFACE;
+ static constexpr unsigned LAST_BLOCK_MAGIC = 0xCAD7;
+ struct __attribute__ ((packed)) disk_block_header_t {
+ void deserialize();
+ int verify(disk_block_id_t block_id, const DoutPrefixProvider* dpp);
+ uint16_t offset;
+ uint16_t rec_count;
+ disk_block_id_t block_id;
+ uint16_t rec_offsets[MAX_REC_IN_BLOCK];
+ };
+ static constexpr unsigned MAX_REC_SIZE = (DISK_BLOCK_SIZE - sizeof(disk_block_header_t));
+
+ struct __attribute__ ((packed)) disk_block_t
+ {
+ const disk_block_header_t* get_header() const { return (disk_block_header_t*)data; }
+ disk_block_header_t* get_header() { return (disk_block_header_t*)data; }
+ bool is_empty() const { return (get_header()->rec_count == 0); }
+
+ void init(work_shard_t worker_id, uint32_t seq_number);
+ record_id_t add_record(const disk_record_t *p_rec, const DoutPrefixProvider *dpp);
+ void close_block(const DoutPrefixProvider* dpp, bool has_more);
+ disk_block_id_t get_block_id() {
+ disk_block_header_t *p_header = get_header();
+ return p_header->block_id;
+ }
+ char data[DISK_BLOCK_SIZE];
+ };
+
+ int load_record(librados::IoCtx &ioctx,
+ const disk_record_t *p_tgt_rec,
+ disk_record_t *p_src_rec, /* OUT */
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ md5_shard_t md5_shard,
+ const DoutPrefixProvider *dpp);
+
+ int load_slab(librados::IoCtx &ioctx,
+ bufferlist &bl,
+ md5_shard_t md5_shard,
+ work_shard_t worker_id,
+ uint32_t seq_number,
+ const DoutPrefixProvider* dpp);
+
+ int store_slab(librados::IoCtx &ioctx,
+ bufferlist &bl,
+ md5_shard_t md5_shard,
+ work_shard_t worker_id,
+ uint32_t seq_number,
+ const DoutPrefixProvider* dpp);
+
+ class disk_block_array_t;
+ class disk_block_seq_t
+ {
+ friend class disk_block_array_t;
+ public:
+ struct record_info_t {
+ disk_block_id_t block_id;
+ record_id_t rec_id;
+ };
+
+ disk_block_seq_t(const DoutPrefixProvider* dpp_in,
+ disk_block_t *p_arr_in,
+ work_shard_t worker_id,
+ md5_shard_t md5_shard,
+ worker_stats_t *p_stats_in);
+ int flush_disk_records(librados::IoCtx &ioctx);
+ md5_shard_t get_md5_shard() { return d_md5_shard; }
+ int add_record(librados::IoCtx &ioctx,
+ const disk_record_t *p_rec, // IN-OUT
+ record_info_t *p_rec_info); // OUT-PARAM
+
+ private:
+ disk_block_seq_t() {;}
+ void activate(const DoutPrefixProvider* _dpp,
+ disk_block_t *_p_arr,
+ work_shard_t worker_id,
+ md5_shard_t md5_shard,
+ worker_stats_t *p_stats);
+ inline const disk_block_t* last_block() { return &p_arr[DISK_BLOCK_COUNT-1]; }
+ int flush(librados::IoCtx &ioctx);
+ void slab_reset() {
+ p_curr_block = p_arr;
+ p_curr_block->init(d_worker_id, d_seq_number);
+ }
+
+ disk_block_t *p_arr = nullptr;
+ disk_block_t *p_curr_block = nullptr;
+ worker_stats_t *p_stats = nullptr;
+ const DoutPrefixProvider *dpp = nullptr;
+ uint32_t d_seq_number = 0;
+ work_shard_t d_worker_id = NULL_WORK_SHARD;
+ md5_shard_t d_md5_shard = NULL_MD5_SHARD;
+ };
+
+ class disk_block_array_t
+ {
+ public:
+ disk_block_array_t(const DoutPrefixProvider* _dpp,
+ uint8_t *raw_mem,
+ uint64_t raw_mem_size,
+ work_shard_t worker_id,
+ worker_stats_t *p_worker_stats,
+ md5_shard_t num_md5_shards);
+ void flush_output_buffers(const DoutPrefixProvider* dpp,
+ librados::IoCtx &ioctx);
+ disk_block_seq_t* get_shard_block_seq(uint64_t md5_low) {
+ md5_shard_t md5_shard = md5_low % d_num_md5_shards;
+ return d_disk_arr + md5_shard;
+ }
+
+ //private:
+ disk_block_seq_t d_disk_arr[MAX_MD5_SHARD];
+ work_shard_t d_worker_id;
+ md5_shard_t d_num_md5_shards;
+ };
+} //namespace rgw::dedup
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_dedup_table.h"
+#include "include/ceph_assert.h"
+#include <cstring>
+#include <iostream>
+
+namespace rgw::dedup {
+
+ //---------------------------------------------------------------------------
+ dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp,
+ uint32_t _head_object_size,
+ uint8_t *p_slab,
+ uint64_t slab_size)
+ {
+ dpp = _dpp;
+ head_object_size = _head_object_size;
+ memset(p_slab, 0, slab_size);
+ hash_tab = (table_entry_t*)p_slab;
+ entries_count = slab_size/sizeof(table_entry_t);
+ values_count = 0;
+ occupied_count = 0;
+ }
+
+ //---------------------------------------------------------------------------
+ void dedup_table_t::remove_singletons_and_redistribute_keys()
+ {
+ for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) {
+ if (!hash_tab[tab_idx].val.is_occupied()) {
+ continue;
+ }
+
+ if (hash_tab[tab_idx].val.is_singleton()) {
+ hash_tab[tab_idx].val.clear_flags();
+ redistributed_clear++;
+ continue;
+ }
+
+ const key_t &key = hash_tab[tab_idx].key;
+ // This is an approximation only since size is stored in 4KB resolution
+ uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
+ if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+ hash_tab[tab_idx].val.clear_flags();
+ redistributed_clear++;
+ continue;
+ }
+
+ uint32_t key_idx = key.hash() % entries_count;
+ if (key_idx != tab_idx) {
+ uint64_t count = 1;
+ redistributed_count++;
+ uint32_t idx = key_idx;
+ while (hash_tab[idx].val.is_occupied() &&
+ !hash_tab[idx].val.is_singleton() &&
+ (hash_tab[idx].key != key)) {
+ count++;
+ idx = (idx + 1) % entries_count;
+ }
+
+ if (idx != tab_idx) {
+ if (hash_tab[idx].val.is_occupied() && hash_tab[idx].val.is_singleton() ) {
+ redistributed_clear++;
+ }
+ if (idx == key_idx) {
+ redistributed_perfect++;
+ }
+ hash_tab[idx] = hash_tab[tab_idx];
+ hash_tab[tab_idx].val.clear_flags();
+ }
+ else {
+ redistributed_loopback++;
+ }
+
+ redistributed_search_max = std::max(redistributed_search_max, count);
+ redistributed_search_total += count;
+ }
+ else {
+ redistributed_not_needed++;
+ }
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ uint32_t dedup_table_t::find_entry(const key_t *p_key) const
+ {
+ uint32_t idx = p_key->hash() % entries_count;
+
+ // search until we either find the key, or find an empty slot.
+ while (hash_tab[idx].val.is_occupied() && (hash_tab[idx].key != *p_key)) {
+ idx = (idx + 1) % entries_count;
+ }
+ return idx;
+ }
+
+ //---------------------------------------------------------------------------
+ int dedup_table_t::add_entry(key_t *p_key,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ bool shared_manifest)
+ {
+ value_t new_val(block_id, rec_id, shared_manifest);
+ uint32_t idx = find_entry(p_key);
+ value_t &val = hash_tab[idx].val;
+ if (!val.is_occupied()) {
+ if (occupied_count < entries_count) {
+ occupied_count++;
+ }
+ else {
+ return -EOVERFLOW;
+ }
+
+ hash_tab[idx].key = *p_key;
+ hash_tab[idx].val = new_val;
+ ldpp_dout(dpp, 20) << __func__ << "::add new entry" << dendl;
+ ceph_assert(val.count == 1);
+ }
+ else {
+ ceph_assert(hash_tab[idx].key == *p_key);
+ val.count ++;
+ if (!val.has_shared_manifest() && shared_manifest) {
+ // replace value!
+ ldpp_dout(dpp, 20) << __func__ << "::Replace with shared_manifest::["
+ << val.block_idx << "/" << (int)val.rec_id << "] -> ["
+ << block_id << "/" << (int)rec_id << "]" << dendl;
+ new_val.count = val.count;
+ hash_tab[idx].val = new_val;
+ }
+ ceph_assert(val.count > 1);
+ }
+ values_count++;
+ ldpp_dout(dpp, 20) << __func__ << "::COUNT="<< val.count << dendl;
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ void dedup_table_t::update_entry(key_t *p_key,
+ disk_block_id_t block_id,
+ record_id_t rec_id,
+ bool shared_manifest)
+ {
+ uint32_t idx = find_entry(p_key);
+ ceph_assert(hash_tab[idx].key == *p_key);
+ value_t &val = hash_tab[idx].val;
+ ceph_assert(val.is_occupied());
+ // we only update non-singletons since we purge singletons after the first pass
+ ceph_assert(val.count > 1);
+
+ // need to overwrite the block_idx/rec_id from the first pass
+ // unless already set with shared_manifest with the correct block-id/rec-id
+ // We only set the shared_manifest flag on the second pass where we
+ // got valid block-id/rec-id
+ if (!val.has_shared_manifest()) {
+ // replace value!
+ value_t new_val(block_id, rec_id, shared_manifest);
+ new_val.count = val.count;
+ hash_tab[idx].val = new_val;
+ ldpp_dout(dpp, 20) << __func__ << "::Replaced table entry::["
+ << val.block_idx << "/" << (int)val.rec_id << "] -> ["
+ << block_id << "/" << (int)rec_id << "]" << dendl;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ int dedup_table_t::set_shared_manifest_src_mode(const key_t *p_key,
+ disk_block_id_t block_id,
+ record_id_t rec_id)
+ {
+ uint32_t idx = find_entry(p_key);
+ value_t &val = hash_tab[idx].val;
+ if (val.is_occupied()) {
+ if (val.block_idx == block_id && val.rec_id == rec_id) {
+ val.set_shared_manifest_src();
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+ }
+
+ //---------------------------------------------------------------------------
+ int dedup_table_t::get_val(const key_t *p_key, struct value_t *p_val /*OUT*/)
+ {
+ uint32_t idx = find_entry(p_key);
+ const value_t &val = hash_tab[idx].val;
+ if (!val.is_occupied()) {
+ return -ENOENT;
+ }
+
+ *p_val = val;
+ return 0;
+ }
+
+ //---------------------------------------------------------------------------
+ void dedup_table_t::count_duplicates(dedup_stats_t *p_small_objs,
+ dedup_stats_t *p_big_objs,
+ uint64_t *p_duplicate_head_bytes)
+ {
+ for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) {
+ if (!hash_tab[tab_idx].val.is_occupied()) {
+ continue;
+ }
+
+ const key_t &key = hash_tab[tab_idx].key;
+ // This is an approximation only since size is stored in 4KB resolution
+ uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
+ uint32_t duplicate_count = (hash_tab[tab_idx].val.count -1);
+
+ // skip small single part objects which we can't dedup
+ if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
+ if (hash_tab[tab_idx].val.is_singleton()) {
+ p_small_objs->singleton_count++;
+ }
+ else {
+ p_small_objs->duplicate_count += duplicate_count;
+ p_small_objs->unique_count ++;
+ p_small_objs->dedup_bytes_estimate += (duplicate_count * byte_size_approx);
+ }
+ continue;
+ }
+
+ if (hash_tab[tab_idx].val.is_singleton()) {
+ p_big_objs->singleton_count++;
+ }
+ else {
+ ceph_assert(hash_tab[tab_idx].val.count > 1);
+ uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
+ key.num_parts,
+ byte_size_approx);
+ p_big_objs->dedup_bytes_estimate += (duplicate_count * dup_bytes_approx);
+ p_big_objs->duplicate_count += duplicate_count;
+ p_big_objs->unique_count ++;
+
+ if (!key.multipart_object()) {
+ // single part objects duplicate the head object when dedup is used
+ uint64_t dup_head_bytes = duplicate_count * head_object_size;
+ *p_duplicate_head_bytes += dup_head_bytes;
+ }
+ }
+ }
+ }
+
+} // namespace rgw::dedup
+
+#if 0
+#include <climits>
+#include <cstdlib>
+#include <iostream>
+#include <cmath>
+#include <iomanip>
+#include <random>
+
+//---------------------------------------------------------------------------
+int main()
+{
+ static constexpr unsigned MAX_ENTRIES = 1024;
+ rgw::dedup::key_t *key_tab = new rgw::dedup::key_t[MAX_ENTRIES];
+ if (!key_tab) {
+ std::cerr << "faild alloc!" << std::endl;
+ return 1;
+ }
+ rgw::dedup::key_t *p_key = key_tab;
+ //rgw::dedup::dedup_table_t tab(MAX_ENTRIES + MAX_ENTRIES/5);
+ rgw::dedup::dedup_table_t tab(MAX_ENTRIES);
+
+ std::cout << "sizeof(key)=" << sizeof(rgw::dedup::key_t) << std::endl;
+ // Seed with a real random value, if available
+ std::random_device r;
+ // Choose a random mean between 1 ULLONG_MAX
+ std::default_random_engine e1(r());
+ std::uniform_int_distribution<uint64_t> uniform_dist(1, std::numeric_limits<uint64_t>::max());
+
+ for (unsigned i = 0; i < MAX_ENTRIES; i++) {
+ uint64_t md5_high = uniform_dist(e1);
+ uint64_t md5_low = uniform_dist(e1);
+ uint32_t size_4k_units = std::rand();
+ uint16_t num_parts = std::rand();
+ //std::cout << std::hex << md5_high << "::" << md5_low << "::" << block_id << std::endl;
+ rgw::dedup::key_t key(md5_high, md5_low, size_4k_units, num_parts);
+ *p_key = key;
+ p_key++;
+ }
+ work_shard_t work_shard = 3;
+ for (unsigned i = 0; i < MAX_ENTRIES; i++) {
+ disk_block_id_t block_id(worker_id, std::rand());
+ tab.add_entry(key_tab+i, block_id, 0, false, false);
+ }
+ double avg = (double)total / MAX_ENTRIES;
+ std::cout << "Insert::num entries=" << MAX_ENTRIES << ", total=" << total
+ << ", avg=" << avg << ", max=" << max << std::endl;
+ std::cout << "==========================================\n";
+
+ total = 0;
+ max = 0;
+ for (unsigned i = 0; i < MAX_ENTRIES; i++) {
+ tab.find_entry(key_tab+i);
+ }
+ avg = (double)total / MAX_ENTRIES;
+ std::cout << "Find::num entries=" << MAX_ENTRIES << ", total=" << total
+ << ", avg=" << avg << ", max=" << max << std::endl;
+ std::cout << "==========================================\n";
+ tab.remove_singletons_and_redistribute_keys();
+ tab.print_redistribute_stats();
+ tab.stat_counters_reset();
+ std::cout << "==========================================\n";
+ total = 0;
+ max = 0;
+ uint32_t cnt = 0;
+ for (unsigned i = 0; i < MAX_ENTRIES; i++) {
+ rgw::dedup::key_t *p_key = key_tab+i;
+ tab.find_entry(p_key);
+ cnt++;
+#if 0
+ if (p_key->md5_high % 5 == 0) {
+ tab.find_entry(p_key);
+ cnt++;
+ }
+#endif
+ }
+ avg = (double)total / cnt;
+ std::cout << "num entries=" << cnt << ", total=" << total
+ << ", avg=" << avg << ", max=" << max << std::endl;
+}
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+#include <cstdint>
+#include <cstddef>
+#include <iterator>
+#include "common/dout.h"
+#include "rgw_dedup_store.h"
+namespace rgw::dedup {
+
+ // 24 Bytes key
+ struct key_t {
+ key_t() { ;}
+ key_t(uint64_t _md5_high,
+ uint64_t _md5_low,
+ uint32_t _size_4k_units,
+ uint16_t _num_parts,
+ uint8_t _stor_class_idx) {
+ md5_high = _md5_high;
+ md5_low = _md5_low;
+ size_4k_units = _size_4k_units;
+ num_parts = _num_parts;
+ stor_class_idx = _stor_class_idx;
+ pad8 = 0;
+ }
+
+ bool operator==(const struct key_t& other) const {
+ return (memcmp(this, &other, sizeof(other)) == 0);
+ }
+
+ bool operator!=(const struct key_t& other) const {
+ return !operator==(other);
+ }
+
+ uint64_t hash() const {
+ // The MD5 is already a hashing function so no need for another hash
+ return this->md5_low;
+ }
+
+ bool multipart_object() const {
+ return num_parts > 0;
+ }
+
+ uint64_t md5_high; // High Bytes of the Object Data MD5
+ uint64_t md5_low; // Low Bytes of the Object Data MD5
+ uint32_t size_4k_units; // Object size in 4KB units max out at 16TB (AWS MAX-SIZE is 5TB)
+ uint16_t num_parts; // How many parts were used in multipart upload (AWS MAX-PART is 10,000)
+ uint8_t stor_class_idx;// storage class id
+ uint8_t pad8;
+ } __attribute__((__packed__));
+ static_assert(sizeof(key_t) == 24);
+
+ class dedup_table_t {
+ public:
+ // 8 Bytes Value
+ struct value_t {
+ value_t() {
+ this->block_idx = 0xFFFFFFFF;
+ this->count = 0;
+ this->rec_id = 0xFF;
+ this->flags.clear();
+ }
+
+ value_t(disk_block_id_t block_id, record_id_t rec_id, bool shared_manifest) {
+ this->block_idx = block_id;
+ this->count = 1;
+ this->rec_id = rec_id;
+ this->flags.clear();
+ this->flags.set_occupied();
+ if (shared_manifest) {
+ flags.set_shared_manifest();
+ }
+ }
+
+ inline void clear_flags() { flags.clear(); }
+ inline bool has_shared_manifest() const {return flags.has_shared_manifest(); }
+ inline void set_shared_manifest_src() { this->flags.set_shared_manifest(); }
+ inline bool is_singleton() const { return (count == 1); }
+ inline bool is_occupied() const { return flags.is_occupied(); }
+ inline void set_occupied() { this->flags.set_occupied(); }
+ inline void clear_occupied() { this->flags.clear_occupied(); }
+
+ disk_block_id_t block_idx; // 32 bits
+ uint16_t count; // 16 bits
+ record_id_t rec_id; // 8 bits
+ dedup_flags_t flags; // 8 bits
+ } __attribute__((__packed__));
+ static_assert(sizeof(value_t) == 8);
+
+ dedup_table_t(const DoutPrefixProvider* _dpp,
+ uint32_t _head_object_size,
+ uint8_t *p_slab,
+ uint64_t slab_size);
+ int add_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id,
+ bool shared_manifest);
+ void update_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id,
+ bool shared_manifest);
+
+ int get_val(const key_t *p_key, struct value_t *p_val /*OUT*/);
+
+ int set_shared_manifest_src_mode(const key_t *p_key,
+ disk_block_id_t block_id,
+ record_id_t rec_id);
+
+ void count_duplicates(dedup_stats_t *p_small_objs_stat,
+ dedup_stats_t *p_big_objs_stat,
+ uint64_t *p_duplicate_head_bytes);
+
+ void remove_singletons_and_redistribute_keys();
+ private:
+ // 32 Bytes unified entries
+ struct table_entry_t {
+ key_t key;
+ value_t val;
+ } __attribute__((__packed__));
+ static_assert(sizeof(table_entry_t) == 32);
+
+ uint32_t find_entry(const key_t *p_key) const;
+ uint32_t values_count = 0;
+ uint32_t entries_count = 0;
+ uint32_t occupied_count = 0;
+ uint32_t head_object_size = (4ULL * 1024 * 1024);
+ table_entry_t *hash_tab = nullptr;
+
+ // stat counters
+ uint64_t redistributed_count = 0;
+ uint64_t redistributed_search_total = 0;
+ uint64_t redistributed_search_max = 0;
+ uint64_t redistributed_loopback = 0;
+ uint64_t redistributed_perfect = 0;
+ uint64_t redistributed_clear = 0;
+ uint64_t redistributed_not_needed = 0;
+ const DoutPrefixProvider* dpp;
+ };
+
+} //namespace rgw::dedup
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_dedup_utils.h"
+#include "common/ceph_crypto.h"
+
+namespace rgw::dedup {
+ //---------------------------------------------------------------------------
+ std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type)
+ {
+ if (dedup_type == dedup_req_type_t::DEDUP_TYPE_NONE) {
+ out << "DEDUP_TYPE_NONE";
+ }
+ else if (dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE) {
+ out << "DEDUP_TYPE_ESTIMATE";
+ }
+ else if (dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL) {
+ out << "DEDUP_TYPE_FULL";
+ }
+ else {
+ out << "\n*** unexpected dedup_type ***\n";
+ }
+
+ return out;
+ }
+
+ //---------------------------------------------------------------------------
+ dedup_stats_t& dedup_stats_t::operator+=(const dedup_stats_t& other)
+ {
+ this->singleton_count += other.singleton_count;
+ this->unique_count += other.unique_count;
+ this->duplicate_count += other.duplicate_count;
+ this->dedup_bytes_estimate += other.dedup_bytes_estimate;
+ return *this;
+ }
+
+ //---------------------------------------------------------------------------
+ std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats)
+ {
+ out << "::singleton_count=" << stats.singleton_count
+ << "::unique_count=" << stats.unique_count
+ << "::duplicate_count=" << stats.duplicate_count
+ << "::duplicated_bytes=" << stats.dedup_bytes_estimate;
+ return out;
+ }
+
+ //---------------------------------------------------------------------------
+ void encode(const dedup_stats_t& ds, ceph::bufferlist& bl)
+ {
+ ENCODE_START(1, 1, bl);
+ encode(ds.singleton_count, bl);
+ encode(ds.unique_count, bl);
+ encode(ds.duplicate_count, bl);
+ encode(ds.dedup_bytes_estimate, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ //---------------------------------------------------------------------------
+ void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl)
+ {
+ DECODE_START(1, bl);
+ decode(ds.singleton_count, bl);
+ decode(ds.unique_count, bl);
+ decode(ds.duplicate_count, bl);
+ decode(ds.dedup_bytes_estimate, bl);
+ DECODE_FINISH(bl);
+ }
+
+ // convert a hex-string to a 64bit integer (max 16 hex digits)
+ //---------------------------------------------------------------------------
+ bool hex2int(const char *p, const char *p_end, uint64_t *p_val)
+ {
+ if (p_end - p <= (int)(sizeof(uint64_t) * 2)) {
+ uint64_t val = 0;
+ while (p < p_end) {
+ // get current character then increment
+ uint8_t byte = *p++;
+ // transform hex character to the 4bit equivalent number, using the ASCII table indexes
+ if (byte >= '0' && byte <= '9') {
+ byte = byte - '0';
+ }
+ else if (byte >= 'a' && byte <='f') {
+ byte = byte - 'a' + 10;
+ }
+ else if (byte >= 'A' && byte <='F') {
+ byte = byte - 'A' + 10;
+ }
+ else {
+ // terminate on the first non hex char
+ return false;
+ }
+ // shift 4 to make space for new digit, and add the 4 bits of the new digit
+ val = (val << 4) | (byte & 0xF);
+ }
+ *p_val = val;
+ return true;
+ }
+ else {
+ return false;
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ bool dec2int(const char *p, const char* p_end, uint16_t *p_val)
+ {
+ uint16_t val = 0;
+ while (p < p_end) {
+ uint8_t byte = *p++;
+ if (byte >= '0' && byte <= '9') {
+ val = val * 10 + (byte - '0');
+ }
+ else {
+ // terminate on the first non hex char
+ return false;
+ }
+ }
+ *p_val = val;
+ return true;
+ }
+
+ // 16Bytes MD5 takes 32 chars
+ const unsigned MD5_LENGTH = 32;
+
+ //---------------------------------------------------------------------------
+ static bool get_num_parts(const std::string & etag, uint16_t *p_num_parts)
+ {
+ // Amazon S3 multipart upload Maximum number = 10,000
+ const unsigned MAX_PARTS = 10000;
+ if (etag.length() <= MD5_LENGTH) {
+ // i.e. no multipart
+ *p_num_parts = 0;
+ return true;
+ }
+
+ // Amazon S3 multipart upload Maximum number = 10,000 (5 decimal digits)
+ // We need 1 extra byte for the '-' delimiter and 1 extra byte for '"' at the end
+ // 7 Bytes should suffice, but we roundup to 8 Bytes
+ const unsigned MAX_PART_LEN = 8;
+ if (unlikely(etag.length() > MD5_LENGTH + MAX_PART_LEN)) {
+ // illegal ETAG
+ return false;
+ }
+
+ std::string::size_type n = etag.find('-', etag.length() - MAX_PART_LEN);
+ if (n != std::string::npos) {
+ char buff[MAX_PART_LEN];
+ // again, 1 extra byte for the '-' delimiter
+ unsigned copy_size = etag.length() - (n + 1);
+ if (copy_size <= MAX_PART_LEN) {
+ unsigned nbytes = etag.copy(buff, copy_size, n+1);
+ uint16_t num_parts;
+ const unsigned MAX_UINT16_DIGITS = 5; // 65536
+ if (nbytes <= MAX_UINT16_DIGITS) {
+ if (dec2int(buff, buff+nbytes, &num_parts) && num_parts <= MAX_PARTS) {
+ *p_num_parts = num_parts;
+ return true;
+ } // else, not all digits are legal
+ } // else, more than 5 digits
+ } // else, copy len too large
+ } // else, '-' delimiter was not found
+
+ // illegal number of parts
+ return false;
+ }
+
+ //---------------------------------------------------------------------------
+ bool parse_etag_string(const std::string& etag, parsed_etag_t *parsed_etag)
+ {
+ char buff[MD5_LENGTH*2];
+ uint16_t num_parts = 0;
+ if (get_num_parts(etag, &num_parts)) {
+ etag.copy(buff, MD5_LENGTH, 0);
+ uint64_t high, low;
+ if (hex2int(buff, buff+16, &high)) {
+ if (hex2int(buff+16, buff+32, &low)) {
+ parsed_etag->md5_high = high; // High Bytes of the Object Data MD5
+ parsed_etag->md5_low = low; // Low Bytes of the Object Data MD5
+ parsed_etag->num_parts = num_parts; // How many parts were used in multipart upload
+ return true;
+ }
+ }
+ }
+
+ // an illegal etag string
+ return false;
+ }
+
+ //---------------------------------------------------------------------------
+ void etag_to_bufferlist(uint64_t md5_high, uint64_t md5_low, uint16_t num_parts,
+ ceph::bufferlist *bl)
+ {
+ char buff[64];
+ int n = snprintf(buff, sizeof(buff), "%016lx%016lx", md5_high, md5_low);
+ if (num_parts >= 1) {
+ n += snprintf(buff + n, sizeof(buff) - n, "-%u", num_parts);
+ }
+ bl->append(buff, n);
+ }
+
+ //---------------------------------------------------------------------------
+ const char* get_next_data_ptr(bufferlist::const_iterator &bl_itr,
+ char data_buff[],
+ size_t len,
+ const DoutPrefixProvider* dpp)
+ {
+ const char *p = nullptr;
+ size_t n = bl_itr.get_ptr_and_advance(len, &p);
+ if (n == len) {
+ // we got a zero-copy raw pointer to contiguous data on the buffer-list
+ return p;
+ }
+
+ std::vector<int> vec;
+ // otherwise - copy the data to the @data_buff
+ char *p_buff = data_buff;
+ do {
+ vec.push_back(n);
+ std::memcpy(p_buff, p, n);
+ p_buff += n;
+ len -= n;
+ if (len > 0) {
+ n = bl_itr.get_ptr_and_advance(len, &p);
+ }
+ } while (len > 0);
+
+ ldpp_dout(dpp, 20) << __func__ << "::vec=" << vec << dendl;
+ return data_buff;
+ }
+
+ static const char* s_urgent_msg_names[] = {
+ "URGENT_MSG_NONE",
+ "URGENT_MSG_ABORT",
+ "URGENT_MSG_PASUE",
+ "URGENT_MSG_RESUME",
+ "URGENT_MSG_RESTART",
+ "URGENT_MSG_INVALID"
+ };
+
+ //---------------------------------------------------------------------------
+ const char* get_urgent_msg_names(int msg)
+ {
+ if (msg <= URGENT_MSG_INVALID && msg >= URGENT_MSG_NONE) {
+ return s_urgent_msg_names[msg];
+ }
+ else {
+ return s_urgent_msg_names[URGENT_MSG_INVALID];
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ worker_stats_t& worker_stats_t::operator+=(const worker_stats_t& other)
+ {
+ this->ingress_obj += other.ingress_obj;
+ this->ingress_obj_bytes += other.ingress_obj_bytes;
+ this->egress_records += other.egress_records;
+ this->egress_blocks += other.egress_blocks;
+ this->egress_slabs += other.egress_slabs;
+ this->single_part_objs += other.single_part_objs;
+ this->multipart_objs += other.multipart_objs;
+ this->small_multipart_obj += other.small_multipart_obj;
+ this->default_storage_class_objs += other.default_storage_class_objs;
+ this->default_storage_class_objs_bytes += other.default_storage_class_objs_bytes;
+ this->non_default_storage_class_objs += other.non_default_storage_class_objs;
+ this->non_default_storage_class_objs_bytes += other.non_default_storage_class_objs_bytes;
+ this->ingress_corrupted_etag += other.ingress_corrupted_etag;
+ this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes;
+ this->ingress_skip_too_small += other.ingress_skip_too_small;
+ this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes;
+ this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB;
+
+ return *this;
+ }
+ //---------------------------------------------------------------------------
+ void worker_stats_t::dump(Formatter *f) const
+ {
+ // main section
+ {
+ Formatter::ObjectSection main(*f, "main");
+
+ f->dump_unsigned("Ingress Objs count", this->ingress_obj);
+ f->dump_unsigned("Accum byte size Ingress Objs", this->ingress_obj_bytes);
+ f->dump_unsigned("Egress Records count", this->egress_records);
+ f->dump_unsigned("Egress Blocks count", this->egress_blocks);
+ f->dump_unsigned("Egress Slabs count", this->egress_slabs);
+ f->dump_unsigned("Single part obj count", this->single_part_objs);
+ f->dump_unsigned("Multipart obj count", this->multipart_objs);
+ if (this->small_multipart_obj) {
+ f->dump_unsigned("Small Multipart obj count", this->small_multipart_obj);
+ }
+ }
+
+ {
+ Formatter::ObjectSection notify(*f, "notify");
+
+ if(this->non_default_storage_class_objs) {
+ f->dump_unsigned("non default storage class objs",
+ this->non_default_storage_class_objs);
+ f->dump_unsigned("non default storage class objs bytes",
+ this->non_default_storage_class_objs_bytes);
+ }
+ else {
+ ceph_assert(this->default_storage_class_objs == this->ingress_obj);
+ ceph_assert(this->default_storage_class_objs_bytes == this->ingress_obj_bytes);
+ }
+ }
+
+ {
+ Formatter::ObjectSection skipped(*f, "skipped");
+ if(this->ingress_skip_too_small) {
+ f->dump_unsigned("Ingress skip: too small objs",
+ this->ingress_skip_too_small);
+ f->dump_unsigned("Ingress skip: too small bytes",
+ this->ingress_skip_too_small_bytes);
+
+ if(this->ingress_skip_too_small_64KB) {
+ f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Obj",
+ this->ingress_skip_too_small_64KB);
+ f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Bytes",
+ this->ingress_skip_too_small_64KB_bytes);
+ }
+ }
+ }
+
+ {
+ Formatter::ObjectSection failed(*f, "failed");
+ if(this->ingress_corrupted_etag) {
+ f->dump_unsigned("Corrupted ETAG", this->ingress_corrupted_etag);
+ }
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ std::ostream& operator<<(std::ostream &out, const worker_stats_t &s)
+ {
+ JSONFormatter formatter(false);
+ s.dump(&formatter);
+ std::stringstream sstream;
+ formatter.flush(sstream);
+ out << sstream.str();
+ return out;
+ }
+
+ //---------------------------------------------------------------------------
+ void encode(const worker_stats_t& w, ceph::bufferlist& bl)
+ {
+ ENCODE_START(1, 1, bl);
+ encode(w.ingress_obj, bl);
+ encode(w.ingress_obj_bytes, bl);
+ encode(w.egress_records, bl);
+ encode(w.egress_blocks, bl);
+ encode(w.egress_slabs, bl);
+
+ encode(w.single_part_objs, bl);
+ encode(w.multipart_objs, bl);
+ encode(w.small_multipart_obj, bl);
+
+ encode(w.default_storage_class_objs, bl);
+ encode(w.default_storage_class_objs_bytes, bl);
+ encode(w.non_default_storage_class_objs, bl);
+ encode(w.non_default_storage_class_objs_bytes, bl);
+
+ encode(w.ingress_corrupted_etag, bl);
+
+ encode(w.ingress_skip_too_small_bytes, bl);
+ encode(w.ingress_skip_too_small, bl);
+
+ encode(w.ingress_skip_too_small_64KB_bytes, bl);
+ encode(w.ingress_skip_too_small_64KB, bl);
+
+ encode(w.duration, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ //---------------------------------------------------------------------------
+ void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl)
+ {
+ DECODE_START(1, bl);
+ decode(w.ingress_obj, bl);
+ decode(w.ingress_obj_bytes, bl);
+ decode(w.egress_records, bl);
+ decode(w.egress_blocks, bl);
+ decode(w.egress_slabs, bl);
+ decode(w.single_part_objs, bl);
+ decode(w.multipart_objs, bl);
+ decode(w.small_multipart_obj, bl);
+ decode(w.default_storage_class_objs, bl);
+ decode(w.default_storage_class_objs_bytes, bl);
+ decode(w.non_default_storage_class_objs, bl);
+ decode(w.non_default_storage_class_objs_bytes, bl);
+ decode(w.ingress_corrupted_etag, bl);
+ decode(w.ingress_skip_too_small_bytes, bl);
+ decode(w.ingress_skip_too_small, bl);
+ decode(w.ingress_skip_too_small_64KB_bytes, bl);
+ decode(w.ingress_skip_too_small_64KB, bl);
+
+ decode(w.duration, bl);
+ DECODE_FINISH(bl);
+ }
+
+ //---------------------------------------------------------------------------
+ md5_stats_t& md5_stats_t::operator+=(const md5_stats_t& other)
+ {
+ this->small_objs_stat += other.small_objs_stat;
+ this->big_objs_stat += other.big_objs_stat;
+ this->ingress_failed_load_bucket += other.ingress_failed_load_bucket;
+ this->ingress_failed_get_object += other.ingress_failed_get_object;
+ this->ingress_failed_get_obj_attrs += other.ingress_failed_get_obj_attrs;
+ this->ingress_corrupted_etag += other.ingress_corrupted_etag;
+ this->ingress_corrupted_obj_attrs += other.ingress_corrupted_obj_attrs;
+ this->ingress_skip_encrypted += other.ingress_skip_encrypted;
+ this->ingress_skip_encrypted_bytes += other.ingress_skip_encrypted_bytes;
+ this->ingress_skip_compressed += other.ingress_skip_compressed;
+ this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes;
+ this->ingress_skip_changed_objs += other.ingress_skip_changed_objs;
+ this->shared_manifest_dedup_bytes += other.shared_manifest_dedup_bytes;
+
+ this->skipped_shared_manifest += other.skipped_shared_manifest;
+ this->skipped_purged_small += other.skipped_purged_small;
+ this->skipped_singleton += other.skipped_singleton;
+ this->skipped_singleton_bytes += other.skipped_singleton_bytes;
+ this->skipped_source_record += other.skipped_source_record;
+ this->duplicate_records += other.duplicate_records;
+ this->size_mismatch += other.size_mismatch;
+ this->hash_mismatch += other.hash_mismatch;
+ this->failed_src_load += other.failed_src_load;
+ this->failed_rec_load += other.failed_rec_load;
+ this->failed_block_load += other.failed_block_load;
+
+ this->valid_hash_attrs += other.valid_hash_attrs;
+ this->invalid_hash_attrs += other.invalid_hash_attrs;
+ this->set_hash_attrs += other.set_hash_attrs;
+ this->skip_hash_cmp += other.skip_hash_cmp;
+
+ this->set_shared_manifest_src += other.set_shared_manifest_src;
+ this->loaded_objects += other.loaded_objects;
+ this->processed_objects += other.processed_objects;
+ this->dup_head_bytes_estimate += other.dup_head_bytes_estimate;
+ this->deduped_objects += other.deduped_objects;
+ this->deduped_objects_bytes += other.deduped_objects_bytes;
+ this->dup_head_bytes += other.dup_head_bytes;
+
+ this->failed_dedup += other.failed_dedup;
+ this->failed_table_load += other.failed_table_load;
+ this->failed_map_overflow += other.failed_map_overflow;
+ return *this;
+ }
+
+ //---------------------------------------------------------------------------
+ std::ostream& operator<<(std::ostream &out, const md5_stats_t &s)
+ {
+ JSONFormatter formatter(false);
+ s.dump(&formatter);
+ std::stringstream sstream;
+ formatter.flush(sstream);
+ out << sstream.str();
+ return out;
+ }
+
+ //---------------------------------------------------------------------------
+ void md5_stats_t::dump(Formatter *f) const
+ {
+ // main section
+ {
+ Formatter::ObjectSection main(*f, "main");
+
+ f->dump_unsigned("Total processed objects", this->processed_objects);
+ f->dump_unsigned("Loaded objects", this->loaded_objects);
+ f->dump_unsigned("Set Shared-Manifest SRC", this->set_shared_manifest_src);
+ f->dump_unsigned("Deduped Obj (this cycle)", this->deduped_objects);
+ f->dump_unsigned("Deduped Bytes(this cycle)", this->deduped_objects_bytes);
+ f->dump_unsigned("Dup head bytes (not dedup)", this->dup_head_bytes);
+ f->dump_unsigned("Already Deduped bytes (prev cycles)",
+ this->shared_manifest_dedup_bytes);
+
+ const dedup_stats_t &ds = this->big_objs_stat;
+ f->dump_unsigned("Singleton Obj", ds.singleton_count);
+ f->dump_unsigned("Unique Obj", ds.unique_count);
+ f->dump_unsigned("Duplicate Obj", ds.duplicate_count);
+ f->dump_unsigned("Dedup Bytes Estimate", ds.dedup_bytes_estimate);
+ }
+
+ // Potential Dedup Section:
+ // What could be gained by allowing dedup for smaller objects (64KB-4MB)
+ // Space wasted because of duplicated head-object (4MB)
+ {
+ Formatter::ObjectSection potential(*f, "Potential Dedup");
+ const dedup_stats_t &ds = this->small_objs_stat;
+ f->dump_unsigned("Singleton Obj (64KB-4MB)", ds.singleton_count);
+ f->dump_unsigned("Unique Obj (64KB-4MB)", ds.unique_count);
+ f->dump_unsigned("Duplicate Obj (64KB-4MB)", ds.duplicate_count);
+ f->dump_unsigned("Dedup Bytes Estimate (64KB-4MB)", ds.dedup_bytes_estimate);
+ f->dump_unsigned("Duplicated Head Bytes Estimate",
+ this->dup_head_bytes_estimate);
+ f->dump_unsigned("Duplicated Head Bytes", this->dup_head_bytes);
+ }
+
+ {
+ Formatter::ObjectSection notify(*f, "notify");
+ if (this->failed_table_load) {
+ f->dump_unsigned("Failed Table Load", this->failed_table_load);
+ }
+ if (this->failed_map_overflow) {
+ f->dump_unsigned("Failed Remap Overflow", this->failed_map_overflow);
+ }
+
+ f->dump_unsigned("Valid HASH attrs", this->valid_hash_attrs);
+ f->dump_unsigned("Invalid HASH attrs", this->invalid_hash_attrs);
+
+ if (this->set_hash_attrs) {
+ f->dump_unsigned("Set HASH", this->set_hash_attrs);
+ }
+
+ if (this->skip_hash_cmp) {
+ f->dump_unsigned("Can't run HASH compare", this->skip_hash_cmp);
+ }
+ }
+
+ {
+ Formatter::ObjectSection skipped(*f, "skipped");
+ f->dump_unsigned("Skipped shared_manifest", this->skipped_shared_manifest);
+ f->dump_unsigned("Skipped purged small objs", this->skipped_purged_small);
+ f->dump_unsigned("Skipped singleton objs", this->skipped_singleton);
+ if (this->skipped_singleton) {
+ f->dump_unsigned("Skipped singleton Bytes", this->skipped_singleton_bytes);
+ }
+ f->dump_unsigned("Skipped source record", this->skipped_source_record);
+
+ if (this->ingress_skip_encrypted) {
+ f->dump_unsigned("Skipped Encrypted objs", this->ingress_skip_encrypted);
+ f->dump_unsigned("Skipped Encrypted Bytes",this->ingress_skip_encrypted_bytes);
+ }
+ if (this->ingress_skip_compressed) {
+ f->dump_unsigned("Skipped Compressed objs", this->ingress_skip_compressed);
+ f->dump_unsigned("Skipped Compressed Bytes", this->ingress_skip_compressed_bytes);
+ }
+ if (this->ingress_skip_changed_objs) {
+ f->dump_unsigned("Skipped Changed Object", this->ingress_skip_changed_objs);
+ }
+ }
+
+ {
+ Formatter::ObjectSection sys_failures(*f, "system failures");
+ if (this->ingress_failed_load_bucket) {
+ f->dump_unsigned("Failed load_bucket()", this->ingress_failed_load_bucket);
+ }
+ if (this->ingress_failed_get_object) {
+ f->dump_unsigned("Failed get_object()", this->ingress_failed_get_object);
+ }
+ if (this->ingress_failed_get_obj_attrs) {
+ f->dump_unsigned("Failed get_obj_attrs", this->ingress_failed_get_obj_attrs);
+ }
+ if (this->ingress_corrupted_etag) {
+ f->dump_unsigned("Corrupted ETAG", this->ingress_corrupted_etag);
+ }
+ if (this->ingress_corrupted_obj_attrs) {
+ f->dump_unsigned("Corrupted obj attributes", this->ingress_corrupted_obj_attrs);
+ }
+ if (this->failed_src_load) {
+ f->dump_unsigned("Failed SRC-Load ", this->failed_src_load);
+ }
+ if (this->failed_rec_load) {
+ f->dump_unsigned("Failed Record-Load ", this->failed_rec_load);
+ }
+ if (this->failed_block_load) {
+ f->dump_unsigned("Failed Block-Load ", this->failed_block_load);
+ }
+ if (this->failed_dedup) {
+ f->dump_unsigned("Failed Dedup", this->failed_dedup);
+ }
+ }
+
+ {
+ Formatter::ObjectSection logical_failures(*f, "logical failures");
+ if (this->hash_mismatch) {
+ f->dump_unsigned("HASH mismatch", this->hash_mismatch);
+ }
+ if (this->duplicate_records) {
+ f->dump_unsigned("Duplicate SRC/TGT", this->duplicate_records);
+ }
+ if (this->size_mismatch) {
+ f->dump_unsigned("Size mismatch SRC/TGT", this->size_mismatch);
+ }
+ }
+ }
+
+ //---------------------------------------------------------------------------
+ void encode(const md5_stats_t& m, ceph::bufferlist& bl)
+ {
+ ENCODE_START(1, 1, bl);
+
+ encode(m.small_objs_stat, bl);
+ encode(m.big_objs_stat, bl);
+ encode(m.ingress_failed_load_bucket, bl);
+ encode(m.ingress_failed_get_object, bl);
+ encode(m.ingress_failed_get_obj_attrs, bl);
+ encode(m.ingress_corrupted_etag, bl);
+ encode(m.ingress_corrupted_obj_attrs, bl);
+ encode(m.ingress_skip_encrypted, bl);
+ encode(m.ingress_skip_encrypted_bytes, bl);
+ encode(m.ingress_skip_compressed, bl);
+ encode(m.ingress_skip_compressed_bytes, bl);
+ encode(m.ingress_skip_changed_objs, bl);
+ encode(m.shared_manifest_dedup_bytes, bl);
+
+ encode(m.skipped_shared_manifest, bl);
+ encode(m.skipped_purged_small, bl);
+ encode(m.skipped_singleton, bl);
+ encode(m.skipped_singleton_bytes, bl);
+ encode(m.skipped_source_record, bl);
+ encode(m.duplicate_records, bl);
+ encode(m.size_mismatch, bl);
+ encode(m.hash_mismatch, bl);
+ encode(m.failed_src_load, bl);
+ encode(m.failed_rec_load, bl);
+ encode(m.failed_block_load, bl);
+
+ encode(m.valid_hash_attrs, bl);
+ encode(m.invalid_hash_attrs, bl);
+ encode(m.set_hash_attrs, bl);
+ encode(m.skip_hash_cmp, bl);
+ encode(m.set_shared_manifest_src, bl);
+
+ encode(m.loaded_objects, bl);
+ encode(m.processed_objects, bl);
+ encode(m.dup_head_bytes_estimate, bl);
+ encode(m.deduped_objects, bl);
+ encode(m.deduped_objects_bytes, bl);
+ encode(m.dup_head_bytes, bl);
+ encode(m.failed_dedup, bl);
+ encode(m.failed_table_load, bl);
+ encode(m.failed_map_overflow, bl);
+
+ encode(m.duration, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ //---------------------------------------------------------------------------
+ void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl)
+ {
+ DECODE_START(1, bl);
+ decode(m.small_objs_stat, bl);
+ decode(m.big_objs_stat, bl);
+ decode(m.ingress_failed_load_bucket, bl);
+ decode(m.ingress_failed_get_object, bl);
+ decode(m.ingress_failed_get_obj_attrs, bl);
+ decode(m.ingress_corrupted_etag, bl);
+ decode(m.ingress_corrupted_obj_attrs, bl);
+ decode(m.ingress_skip_encrypted, bl);
+ decode(m.ingress_skip_encrypted_bytes, bl);
+ decode(m.ingress_skip_compressed, bl);
+ decode(m.ingress_skip_compressed_bytes, bl);
+ decode(m.ingress_skip_changed_objs, bl);
+ decode(m.shared_manifest_dedup_bytes, bl);
+
+ decode(m.skipped_shared_manifest, bl);
+ decode(m.skipped_purged_small, bl);
+ decode(m.skipped_singleton, bl);
+ decode(m.skipped_singleton_bytes, bl);
+ decode(m.skipped_source_record, bl);
+ decode(m.duplicate_records, bl);
+ decode(m.size_mismatch, bl);
+ decode(m.hash_mismatch, bl);
+ decode(m.failed_src_load, bl);
+ decode(m.failed_rec_load, bl);
+ decode(m.failed_block_load, bl);
+
+ decode(m.valid_hash_attrs, bl);
+ decode(m.invalid_hash_attrs, bl);
+ decode(m.set_hash_attrs, bl);
+ decode(m.skip_hash_cmp, bl);
+ decode(m.set_shared_manifest_src, bl);
+
+ decode(m.loaded_objects, bl);
+ decode(m.processed_objects, bl);
+ decode(m.dup_head_bytes_estimate, bl);
+ decode(m.deduped_objects, bl);
+ decode(m.deduped_objects_bytes, bl);
+ decode(m.dup_head_bytes, bl);
+ decode(m.failed_dedup, bl);
+ decode(m.failed_table_load, bl);
+ decode(m.failed_map_overflow, bl);
+
+ decode(m.duration, bl);
+ DECODE_FINISH(bl);
+ }
+} //namespace rgw::dedup
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2;
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+#include <string>
+#include "include/rados/buffer.h"
+#include "include/encoding.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include <time.h>
+#include "include/utime.h"
+#include "include/encoding.h"
+#include "common/dout.h"
+
+#define FULL_DEDUP_SUPPORT
+namespace rgw::dedup {
+ using work_shard_t = uint16_t;
+ using md5_shard_t = uint16_t;
+
+ // settings to help debug small systems
+ const work_shard_t MIN_WORK_SHARD = 2;
+ const md5_shard_t MIN_MD5_SHARD = 4;
+
+ // Those are the correct values for production system
+ const work_shard_t MAX_WORK_SHARD = 255;
+ const md5_shard_t MAX_MD5_SHARD = 512;
+
+ const work_shard_t NULL_WORK_SHARD = 0xFFFF;
+ const md5_shard_t NULL_MD5_SHARD = 0xFFFF;
+ const unsigned NULL_SHARD = 0xFFFF;
+
+ // work_shard is an 8 bits int with 255 legal values for the first iteration
+ // and one value (0xFF) reserved for second iteration
+ const unsigned WORK_SHARD_HARD_LIMIT = 0x0FF;
+ // md5_shard_t is a 12 bits int with 4096 possible values
+ const unsigned MD5_SHARD_HARD_LIMIT = 0xFFF;
+
+ static_assert(MAX_WORK_SHARD < NULL_WORK_SHARD);
+ static_assert(MAX_WORK_SHARD < NULL_SHARD);
+ static_assert(MAX_WORK_SHARD <= WORK_SHARD_HARD_LIMIT);
+ static_assert(MAX_MD5_SHARD < NULL_MD5_SHARD);
+ static_assert(MAX_MD5_SHARD < NULL_SHARD);
+ static_assert(MAX_MD5_SHARD <= MD5_SHARD_HARD_LIMIT);
+
+ //---------------------------------------------------------------------------
+ enum dedup_req_type_t {
+ DEDUP_TYPE_NONE = 0,
+ DEDUP_TYPE_ESTIMATE = 1,
+ DEDUP_TYPE_FULL = 2
+ };
+
+ std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type);
+ struct __attribute__ ((packed)) dedup_flags_t {
+ private:
+ static constexpr uint8_t RGW_DEDUP_FLAG_HASH_CALCULATED = 0x01; // REC
+ static constexpr uint8_t RGW_DEDUP_FLAG_SHARED_MANIFEST = 0x02; // REC + TAB
+ static constexpr uint8_t RGW_DEDUP_FLAG_OCCUPIED = 0x04; // TAB
+ static constexpr uint8_t RGW_DEDUP_FLAG_FASTLANE = 0x08; // REC
+
+ public:
+ dedup_flags_t() : flags(0) {}
+ dedup_flags_t(uint8_t _flags) : flags(_flags) {}
+ inline void clear() { this->flags = 0; }
+ inline bool hash_calculated() const { return ((flags & RGW_DEDUP_FLAG_HASH_CALCULATED) != 0); }
+ inline void set_hash_calculated() { flags |= RGW_DEDUP_FLAG_HASH_CALCULATED; }
+ inline bool has_shared_manifest() const { return ((flags & RGW_DEDUP_FLAG_SHARED_MANIFEST) != 0); }
+ inline void set_shared_manifest() { flags |= RGW_DEDUP_FLAG_SHARED_MANIFEST; }
+ inline bool is_occupied() const {return ((this->flags & RGW_DEDUP_FLAG_OCCUPIED) != 0); }
+ inline void set_occupied() {this->flags |= RGW_DEDUP_FLAG_OCCUPIED; }
+ inline void clear_occupied() { this->flags &= ~RGW_DEDUP_FLAG_OCCUPIED; }
+ inline bool is_fastlane() const { return ((flags & RGW_DEDUP_FLAG_FASTLANE) != 0); }
+ inline void set_fastlane() { flags |= RGW_DEDUP_FLAG_FASTLANE; }
+ private:
+ uint8_t flags;
+ };
+
+ struct dedup_stats_t {
+ dedup_stats_t& operator+=(const dedup_stats_t& other);
+
+ uint64_t singleton_count = 0;
+ uint64_t unique_count = 0;
+ uint64_t duplicate_count = 0;
+ uint64_t dedup_bytes_estimate = 0;
+ };
+
+ std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats);
+ void encode(const dedup_stats_t& ds, ceph::bufferlist& bl);
+ void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl);
+
+ struct worker_stats_t {
+ worker_stats_t& operator +=(const worker_stats_t& other);
+ void dump(Formatter *f) const;
+
+ uint64_t ingress_obj = 0;
+ uint64_t ingress_obj_bytes = 0;
+ uint64_t egress_records = 0;
+ uint64_t egress_blocks = 0;
+ uint64_t egress_slabs = 0;
+
+ uint64_t single_part_objs = 0;
+ uint64_t multipart_objs = 0;
+ uint64_t small_multipart_obj = 0;
+
+ uint64_t default_storage_class_objs = 0;
+ uint64_t default_storage_class_objs_bytes = 0;
+
+ uint64_t non_default_storage_class_objs = 0;
+ uint64_t non_default_storage_class_objs_bytes = 0;
+
+ uint64_t ingress_corrupted_etag = 0;
+
+ uint64_t ingress_skip_too_small_bytes = 0;
+ uint64_t ingress_skip_too_small = 0;
+
+ uint64_t ingress_skip_too_small_64KB_bytes = 0;
+ uint64_t ingress_skip_too_small_64KB = 0;
+
+ utime_t duration = {0, 0};
+ };
+ std::ostream& operator<<(std::ostream &out, const worker_stats_t &s);
+ void encode(const worker_stats_t& w, ceph::bufferlist& bl);
+ void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl);
+
+
+ struct md5_stats_t {
+ md5_stats_t& operator +=(const md5_stats_t& other);
+ void dump(Formatter *f) const;
+
+ dedup_stats_t small_objs_stat;
+ dedup_stats_t big_objs_stat;
+ uint64_t ingress_failed_load_bucket = 0;
+ uint64_t ingress_failed_get_object = 0;
+ uint64_t ingress_failed_get_obj_attrs = 0;
+ uint64_t ingress_corrupted_etag = 0;
+ uint64_t ingress_corrupted_obj_attrs = 0;
+ uint64_t ingress_skip_encrypted = 0;
+ uint64_t ingress_skip_encrypted_bytes = 0;
+ uint64_t ingress_skip_compressed = 0;
+ uint64_t ingress_skip_compressed_bytes = 0;
+ uint64_t ingress_skip_changed_objs = 0;
+
+ uint64_t shared_manifest_dedup_bytes = 0;
+ uint64_t skipped_shared_manifest = 0;
+ uint64_t skipped_purged_small = 0;
+ uint64_t skipped_singleton = 0;
+ uint64_t skipped_singleton_bytes = 0;
+ uint64_t skipped_source_record = 0;
+ uint64_t duplicate_records = 0;
+ uint64_t size_mismatch = 0;
+ uint64_t hash_mismatch = 0;
+ uint64_t failed_src_load = 0;
+ uint64_t failed_rec_load = 0;
+ uint64_t failed_block_load = 0;
+
+ uint64_t valid_hash_attrs = 0;
+ uint64_t invalid_hash_attrs = 0;
+ uint64_t set_hash_attrs = 0;
+ uint64_t skip_hash_cmp = 0;
+
+ uint64_t set_shared_manifest_src = 0;
+ uint64_t loaded_objects = 0;
+ uint64_t processed_objects = 0;
+ // counter is using on-disk size affected by block-size
+ uint64_t dup_head_bytes_estimate = 0; //duplicate_head_bytes
+ uint64_t deduped_objects = 0;
+ // counter is using s3 byte size disregarding the on-disk size affected by block-size
+ uint64_t deduped_objects_bytes = 0;
+ uint64_t dup_head_bytes = 0;
+ uint64_t failed_dedup = 0;
+ uint64_t failed_table_load = 0;
+ uint64_t failed_map_overflow = 0;
+ utime_t duration = {0, 0};
+ };
+ std::ostream &operator<<(std::ostream &out, const md5_stats_t &s);
+ void encode(const md5_stats_t& m, ceph::bufferlist& bl);
+ void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl);
+
+ struct parsed_etag_t {
+ uint64_t md5_high; // High Bytes of the Object Data MD5
+ uint64_t md5_low; // Low Bytes of the Object Data MD5
+ uint16_t num_parts; // How many parts were used in multipart upload
+ // Setting num_parts to zero when multipart is not used
+ };
+
+#define DIV_UP(a, b) ( ((a)+(b-1)) / b)
+ // CEPH min allocation unit on disk is 4KB
+ // TBD: take from config
+ static constexpr uint64_t DISK_ALLOC_SIZE = 4*1024;
+ // 16 bytes hexstring -> 8 Byte uint64_t
+ static inline constexpr unsigned HEX_UNIT_SIZE = 16;
+
+ //---------------------------------------------------------------------------
+ static inline uint64_t byte_size_to_disk_blocks(uint64_t byte_size) {
+ return DIV_UP(byte_size, DISK_ALLOC_SIZE);
+ }
+
+ //---------------------------------------------------------------------------
+ static inline uint64_t disk_blocks_to_byte_size(uint64_t disk_blocks) {
+ return disk_blocks * DISK_ALLOC_SIZE;
+ }
+
+ //---------------------------------------------------------------------------
+ // ceph store full blocks so need to round up and multiply by block_size
+ static inline uint64_t calc_on_disk_byte_size(uint64_t byte_size) {
+ uint64_t size_4k_units = byte_size_to_disk_blocks(byte_size);
+ return disk_blocks_to_byte_size(size_4k_units);
+ }
+
+ enum urgent_msg_t {
+ URGENT_MSG_NONE = 0,
+ URGENT_MSG_ABORT = 1,
+ URGENT_MSG_PASUE = 2,
+ URGENT_MSG_RESUME = 3,
+ URGENT_MSG_RESTART = 4,
+ URGENT_MSG_INVALID = 5
+ };
+
+ const char* get_urgent_msg_names(int msg);
+ bool hex2int(const char *p, const char *p_end, uint64_t *p_val);
+ bool parse_etag_string(const std::string& etag, parsed_etag_t *parsed_etag);
+ void etag_to_bufferlist(uint64_t md5_high, uint64_t md5_low, uint16_t num_parts,
+ ceph::bufferlist *bl);
+ const char* get_next_data_ptr(bufferlist::const_iterator &bl_itr,
+ char data_buff[],
+ size_t len,
+ const DoutPrefixProvider* dpp);
+
+ //---------------------------------------------------------------------------
+ static inline void build_oid(const std::string &bucket_id,
+ const std::string &obj_name,
+ std::string *oid)
+ {
+ *oid = bucket_id + "_" + obj_name;
+ }
+
+ //---------------------------------------------------------------------------
+ static inline uint64_t calc_deduped_bytes(uint64_t head_obj_size,
+ uint16_t num_parts,
+ uint64_t size_bytes)
+ {
+ if (num_parts > 0) {
+ // multipart objects with an empty head i.e. we achive full dedup
+ return size_bytes;
+ }
+ else {
+ // reduce the head size
+ if (size_bytes > head_obj_size) {
+ return size_bytes - head_obj_size;
+ }
+ else {
+ return 0;
+ }
+ }
+ }
+
+} //namespace rgw::dedup
#include "rgw_asio_frontend.h"
#include "rgw_dmclock_scheduler_ctx.h"
#include "rgw_lua.h"
+#ifdef WITH_RADOSGW_RADOS
#include "rgw_dedup.h"
+#endif
#ifdef WITH_RADOSGW_DBSTORE
#include "rgw_sal_dbstore.h"
#endif
#endif
} /* init_lua */
+#ifdef WITH_RADOSGW_RADOS
void rgw::AppMain::init_dedup()
{
rgw::sal::Driver* driver = env.driver;
}
}
}
+#endif
void rgw::AppMain::shutdown(std::function<void(void)> finalize_async_signals)
{
ldh.reset(nullptr); // deletes ldap helper if it was created
rgw_log_usage_finalize();
+#ifdef WITH_RADOSGW_RADOS
if (dedup_background) {
dedup_background->shutdown();
}
+#endif
if (lua_background) {
lua_background->shutdown();
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "include/rados/rados_types.hpp"
-#include "include/rados/buffer.h"
-#include "include/rados/librados.hpp"
-#include "rgw_tools.h"
-#include "svc_zone.h"
-#include "common/config.h"
-#include "common/Cond.h"
-#include "common/debug.h"
-#include "common/errno.h"
-#include "rgw_common.h"
-#include "rgw_sal.h"
-#include "rgw_zone.h"
-#include "rgw_cache.h"
-#include "rgw_acl.h"
-#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
-#include "rgw_aio_throttle.h"
-#include "driver/rados/rgw_bucket.h"
-#include "rgw_sal_config.h"
-#include "rgw_lib.h"
-#include "rgw_placement_types.h"
-#include "driver/rados/rgw_bucket.h"
-#include "driver/rados/rgw_sal_rados.h"
-#include "cls/rgw/cls_rgw_ops.h"
-#include "cls/rgw/cls_rgw_client.h"
-#include "cls/rgw/cls_rgw_const.h"
-#include "cls/refcount/cls_refcount_client.h"
-#include "cls/version/cls_version_client.h"
-#include "fmt/ranges.h"
-#include "osd/osd_types.h"
-#include "common/ceph_crypto.h"
-
-#include <filesystem>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <stdlib.h>
-#include <time.h>
-#include <sstream>
-#include <errno.h>
-#include <dirent.h>
-#include <stdexcept>
-#include <limits>
-#include <climits>
-#include <cinttypes>
-#include <cstring>
-#include <span>
-#include <mutex>
-#include <thread>
-
-//using namespace std::chrono_literals;
-using namespace librados;
-using namespace std;
-using namespace rgw::dedup;
-
-#include "rgw_dedup_remap.h"
-#include "rgw_sal_rados.h"
-#include "rgw_dedup_table.h"
-#include "rgw_dedup_utils.h"
-#include "rgw_dedup.h"
-#include "rgw_dedup_store.h"
-#include "rgw_dedup_cluster.h"
-#include "rgw_dedup_epoch.h"
-#include "rgw_perf_counters.h"
-#include "include/ceph_assert.h"
-
-static constexpr auto dout_subsys = ceph_subsys_rgw_dedup;
-
-namespace rgw::dedup {
- static inline constexpr unsigned MAX_STORAGE_CLASS_IDX = 128;
- using storage_class_idx_t = uint8_t;
-
- //---------------------------------------------------------------------------
- void Background::DedupWatcher::handle_notify(uint64_t notify_id, uint64_t cookie,
- uint64_t notifier_id, bufferlist &bl)
- {
- ldpp_dout(parent->dpp, 10) << __func__ << "::notify_id=" << notify_id
- << "::cookie=" << cookie
- << "::notifier_id=" << notifier_id << dendl;
- if (parent->d_watch_handle != cookie) {
- ldpp_dout(parent->dpp, 1) << __func__ << "::ERR: wrong cookie=" << cookie
- << "::d_watch_handle=" << parent->d_watch_handle
- << dendl;
- return;
- }
- parent->handle_notify(notify_id, cookie, bl);
- }
-
- //---------------------------------------------------------------------------
- void Background::DedupWatcher::handle_error(uint64_t cookie, int err)
- {
- if (parent->d_watch_handle != cookie) {
- ldpp_dout(parent->dpp, 1) << __func__ << "::ERR: wrong cookie=" << cookie
- << "::d_watch_handle=" << parent->d_watch_handle
- << dendl;
- return;
- }
- ldpp_dout(parent->dpp, 1) << __func__ << "::error=" << err << dendl;
-
- parent->unwatch_reload(parent->dpp);
- parent->watch_reload(parent->dpp);
- }
-
- //---------------------------------------------------------------------------
- void control_t::reset()
- {
- this->dedup_type = dedup_req_type_t::DEDUP_TYPE_NONE;
- this->started = false;
- this->dedup_exec = false;
- this->shutdown_req = false;
- this->shutdown_done = false;
- this->local_pause_req = false;
- this->local_paused = false;
- this->remote_abort_req = false;
- this->remote_aborted = false;
- this->remote_pause_req = false;
- this->remote_paused = false;
- this->remote_restart_req = false;
- }
-
- //---------------------------------------------------------------------------
- void encode(const control_t& ctl, ceph::bufferlist& bl)
- {
- ENCODE_START(1, 1, bl);
- encode(static_cast<int32_t>(ctl.dedup_type), bl);
- encode(ctl.started, bl);
- encode(ctl.dedup_exec, bl);
- encode(ctl.shutdown_req, bl);
- encode(ctl.shutdown_done, bl);
- encode(ctl.local_pause_req, bl);
- encode(ctl.local_paused, bl);
- encode(ctl.remote_abort_req, bl);
- encode(ctl.remote_aborted, bl);
- encode(ctl.remote_pause_req, bl);
- encode(ctl.remote_paused, bl);
- encode(ctl.remote_restart_req, bl);
- ENCODE_FINISH(bl);
- }
-
- //---------------------------------------------------------------------------
- void decode(control_t& ctl, ceph::bufferlist::const_iterator& bl)
- {
- DECODE_START(1, bl);
- int32_t dedup_type;
- decode(dedup_type, bl);
- ctl.dedup_type = static_cast<dedup_req_type_t> (dedup_type);
- decode(ctl.started, bl);
- decode(ctl.dedup_exec, bl);
- decode(ctl.shutdown_req, bl);
- decode(ctl.shutdown_done, bl);
- decode(ctl.local_pause_req, bl);
- decode(ctl.local_paused, bl);
- decode(ctl.remote_abort_req, bl);
- decode(ctl.remote_aborted, bl);
- decode(ctl.remote_pause_req, bl);
- decode(ctl.remote_paused, bl);
- decode(ctl.remote_restart_req, bl);
- DECODE_FINISH(bl);
- }
-
- //---------------------------------------------------------------------------
- std::ostream& operator<<(std::ostream &out, const control_t &ctl)
- {
- out << ctl.dedup_type;
- if (ctl.started) {
- out << "::started";
- }
- if (ctl.dedup_exec) {
- out << "::dedup_exec";
- }
- if (ctl.shutdown_req) {
- out << "::shutdown_req";
- }
- if (ctl.shutdown_done) {
- out << "::shutdown_done";
- }
- if (ctl.local_pause_req) {
- out << "::local_pause_req";
- }
- if (ctl.local_paused) {
- out << "::local_paused";
- }
- if (ctl.remote_abort_req) {
- out << "::remote_abort_req";
- }
- if (ctl.remote_aborted) {
- out << "::remote_aborted";
- }
- if (ctl.remote_pause_req) {
- out << "::remote_pause_req";
- }
- if (ctl.remote_paused) {
- out << "::remote_paused";
- }
- if (ctl.remote_restart_req) {
- out << "::remote_restart_req";
- }
-
- return out;
- }
-
- //===========================================================================
- // rgw::dedup::Background
- //===========================================================================
- //---------------------------------------------------------------------------
- static void display_ioctx_state(const DoutPrefixProvider *dpp,
- const librados::IoCtx &ioctx,
- const char *caller)
- {
- if (ioctx.is_valid()) {
- ldpp_dout(dpp, 5) << caller << "::valid ioctx, instance_id="
- << ioctx.get_instance_id() << dendl;
- }
- else {
- ldpp_dout(dpp, 5) << caller << "::invalid ioctx" << dendl;
- }
- }
-
- //---------------------------------------------------------------------------
- static int safe_pool_delete(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- int64_t expected_pool_id)
- {
- const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
- auto rados_handle = store->getRados()->get_rados_handle();
- int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
- if (pool_id < 0) {
- int err = pool_id;
- if (err == ENOENT) {
- ldpp_dout(dpp, 10) <<__func__ << "::pool doesn't exist (probably was removed by other RGW)::"
- << dedup_pool.name << "::expected_pool_id="
- << expected_pool_id << dendl;
- }
- else {
- ldpp_dout(dpp, 5) <<__func__ << "::failed pool_lookup(" << dedup_pool.name
- << ") err=" << cpp_strerror(-err) << dendl;
- }
- return err;
- }
-
- if (pool_id != expected_pool_id) {
- ldpp_dout(dpp, 5) << __func__ << "::ERR: pool_id was changed from: "
- << expected_pool_id << " to: " << pool_id
- << " abort pool_delete() request!" << dendl;
- // report Stale file handle
- return -ESTALE;
- }
-
- ldpp_dout(dpp, 10) <<__func__ << "::calling delete pool(" << dedup_pool.name
- << ") pool_id=" << pool_id << dendl;
- return rados_handle->pool_delete(dedup_pool.name.c_str());
- }
-
- //---------------------------------------------------------------------------
- static int64_t create_pool(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- const std::string &pool_name)
- {
-#if 0
- // using Replica-1 for the intermediate data
- // since it can be regenerated in case of a failure
- std::string replica_count(std::to_string(1));
-#else
- // temporary solution until we find a way to disable the health warn on replica1
- std::string replica_count(std::to_string(2));
-#endif
- librados::bufferlist inbl;
- std::string output;
- std::string command = R"(
- {
- "prefix": "osd pool create",
- "pool": ")" + pool_name +
- R"(",
- "pool_type": "replicated",
- "size": )" + replica_count +
- R"(
- })";
-
- auto rados_handle = store->getRados()->get_rados_handle();
- int ret = rados_handle->mon_command(command, inbl, nullptr, &output);
- if (output.length()) {
- if (output != "pool 'rgw_dedup_pool' already exists") {
- ldpp_dout(dpp, 10) << __func__ << "::" << output << dendl;
- }
- }
- if (ret != 0 && ret != -EEXIST) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to create pool "
- << pool_name << " with: "
- << cpp_strerror(-ret) << ", ret=" << ret << dendl;
- return ret;
- }
- const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
- return rados_handle->pool_lookup(dedup_pool.name.c_str());
- }
-
- //---------------------------------------------------------------------------
- static int init_dedup_pool_ioctx(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- bool create,
- librados::IoCtx &ioctx)
- {
- const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
- std::string pool_name(dedup_pool.name.c_str());
- auto rados_handle = store->getRados()->get_rados_handle();
- int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
- if (pool_id >= 0) {
- // TBD: what to do when create option is passed
- ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
- << " already exists, pool_id=" << pool_id << dendl;
- }
- else if (create) {
- pool_id = create_pool(store, dpp, pool_name);
- if (pool_id >= 0) {
- ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
- << " was created, pool_id=" << pool_id << dendl;
- }
- else {
- return pool_id;
- }
- }
- else {
- ldpp_dout(dpp, 1) << __func__
- << "::ERR: pool doesn't exist and no create option" << dendl;
- return -ENOENT;
- }
-
- int ret = rgw_init_ioctx(dpp, rados_handle, dedup_pool, ioctx);
- if (unlikely(ret < 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() ret=" << ret
- << "::" << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- ret = ioctx.application_enable("rgw_dedup", false);
- if (ret == 0) {
- ldpp_dout(dpp, 10) << __func__ << "::pool " << dedup_pool.name
- << " was associated with dedup app" << dendl;
- }
- else {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to enable pool "
- << dedup_pool.name << " with: "
- << cpp_strerror(-ret) << ", ret=" << ret << dendl;
- }
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int Background::init_rados_access_handles(bool init_pool)
- {
- store = dynamic_cast<rgw::sal::RadosStore*>(driver);
- if (!store) {
- ldpp_dout(dpp, 0) << "ERR: failed dynamic_cast to RadosStore" << dendl;
- // this is the return code used in rgw_bucket.cc
- return -ENOTSUP;
- }
-
- rados = store->getRados();
- rados_handle = rados->get_rados_handle();
- if (init_pool) {
- int ret = init_dedup_pool_ioctx(store, dpp, true, d_dedup_cluster_ioctx);
- display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
- return ret;
- }
- return 0;
- }
-
- //---------------------------------------------------------------------------
- Background::Background(rgw::sal::Driver* _driver, CephContext* _cct) :
- driver(_driver),
- dp(_cct, dout_subsys, "dedup background: "),
- dpp(&dp),
- cct(_cct),
- d_cluster(dpp, cct, driver),
- d_watcher_ctx(this)
- {
- d_min_obj_size_for_dedup = cct->_conf->rgw_max_chunk_size;
- d_head_object_size = cct->_conf->rgw_max_chunk_size;
- //ceph_assert(4*1024*1024 == d_head_object_size);
-
- int ret = init_rados_access_handles(false);
- if (ret != 0) {
- derr << __func__ << "::ERR: failed init_rados_access_handles() ret="
- << ret << "::" << cpp_strerror(-ret) << dendl;
- throw std::runtime_error("Failed init_rados_access_handles()");
- }
-
- d_heart_beat_last_update = ceph_clock_now();
- d_heart_beat_max_elapsed_sec = 3;
- }
-
- //---------------------------------------------------------------------------
- int Background::add_disk_rec_from_bucket_idx(disk_block_array_t &disk_arr,
- const rgw::sal::Bucket *p_bucket,
- const parsed_etag_t *p_parsed_etag,
- const std::string &obj_name,
- uint64_t obj_size,
- const std::string &storage_class)
- {
- disk_record_t rec(p_bucket, obj_name, p_parsed_etag, obj_size, storage_class);
- // First pass using only ETAG and size taken from bucket-index
- rec.s.flags.set_fastlane();
-
- auto p_disk = disk_arr.get_shard_block_seq(p_parsed_etag->md5_low);
- disk_block_seq_t::record_info_t rec_info;
- int ret = p_disk->add_record(d_dedup_cluster_ioctx, &rec, &rec_info);
- if (unlikely(ret != 0)) {
- return ret;
- }
- ldpp_dout(dpp, 20) << __func__ << "::" << p_bucket->get_name() << "/"
- << obj_name << " was written to block_idx="
- << rec_info.block_id << " rec_id=" << rec_info.rec_id << dendl;
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int Background::add_record_to_dedup_table(dedup_table_t *p_table,
- const disk_record_t *p_rec,
- disk_block_id_t block_id,
- record_id_t rec_id,
- md5_stats_t *p_stats,
- remapper_t *remapper)
- {
- uint32_t size_4k_units = byte_size_to_disk_blocks(p_rec->s.obj_bytes_size);
- storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
- &p_stats->failed_map_overflow);
- if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
- // TBD: need stat counters
- return -EOVERFLOW;
- }
- key_t key(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units,
- p_rec->s.num_parts, sc_idx);
- bool has_shared_manifest = p_rec->has_shared_manifest();
- ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_rec->bucket_name
- << ", obj=" << p_rec->obj_name << ", block_id="
- << (uint32_t)block_id << ", rec_id=" << (uint32_t)rec_id
- << ", shared_manifest=" << has_shared_manifest
- << "::num_parts=" << p_rec->s.num_parts
- << "::size_4k_units=" << key.size_4k_units
- << "::ETAG=" << std::hex << p_rec->s.md5_high
- << p_rec->s.md5_low << std::dec << dendl;
-
- int ret = p_table->add_entry(&key, block_id, rec_id, has_shared_manifest);
- if (ret == 0) {
- p_stats->loaded_objects ++;
- ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/"
- << p_rec->obj_name << " was added successfully to table"
- << "::loaded_objects=" << p_stats->loaded_objects << dendl;
- }
- else {
- // We allocate memory for the dedup on startup based on the existing obj count
- // If the system grew significantly since that point we won't be able to
- // accommodate all the objects in the hash-table.
- // Please keep in mind that it is very unlikely since duplicates objects will
- // consume a single entry and since we skip small objects so in reality
- // I expect the allocation to be more than sufficient.
- //
- // However, if we filled up the system there is still value is continuing
- // with this process since we might find duplicates to existing object (which
- // don't take extra space)
-
- int level = 15;
- if (p_stats->failed_table_load % 0x10000 == 0) {
- level = 5;
- }
- else if (p_stats->failed_table_load % 0x100 == 0) {
- level = 10;
- }
- ldpp_dout(dpp, level) << __func__ << "::Failed p_table->add_entry (overflow)"
- << "::loaded_objects=" << p_stats->loaded_objects
- << "::failed_table_load=" << p_stats->failed_table_load
- << dendl;
-
- p_stats->failed_table_load++;
- }
- return ret;
- }
-
-#ifdef FULL_DEDUP_SUPPORT
-
- static constexpr uint64_t cost = 1; // 1 throttle unit per request
- static constexpr uint64_t id = 0; // ids unused
- //---------------------------------------------------------------------------
- [[maybe_unused]]static void show_ref_tags(const DoutPrefixProvider* dpp, std::string &oid, rgw_rados_ref &obj)
- {
- unsigned idx = 0;
- std::list<std::string> refs;
- std::string wildcard_tag;
- int ret = cls_refcount_read(obj.ioctx, oid, &refs, true);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ << "::ERR: manifest::failed cls_refcount_read()"
- << " idx=" << idx << dendl;
- return;
- }
-
- for (list<string>::iterator iter = refs.begin(); iter != refs.end(); ++iter) {
- ldpp_dout(dpp, 20) << __func__ << "::manifest::" << oid << "::" << idx
- << "::TAG=" << *iter << dendl;
- }
- }
-
- //---------------------------------------------------------------------------
- int Background::free_tail_objs_by_manifest(const string &ref_tag,
- const string &oid,
- RGWObjManifest &tgt_manifest)
- {
- unsigned idx = 0;
- for (auto p = tgt_manifest.obj_begin(dpp); p != tgt_manifest.obj_end(dpp); ++p, ++idx) {
- rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
- if (oid == raw_obj.oid) {
- ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl;
- continue;
- }
-
- rgw_rados_ref obj;
- int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << __func__ << "ERR: manifest::failed to open context "
- << obj << dendl;
- continue;
- }
- librados::IoCtx ioctx = obj.ioctx;
- ldpp_dout(dpp, 20) << __func__ << "::removing tail object: " << raw_obj.oid
- << dendl;
- ret = ioctx.remove(raw_obj.oid);
- }
-
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int Background::rollback_ref_by_manifest(const string &ref_tag,
- const string &oid,
- RGWObjManifest &manifest)
- {
- unsigned idx = 0;
- int ret_code = 0;
- std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
- for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
- rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
- if (oid == raw_obj.oid) {
- ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: "
- << raw_obj.oid << dendl;
- continue;
- }
-
- rgw_rados_ref obj;
- int local_ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
- if (local_ret < 0) {
- ret_code = local_ret;
- ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context "
- << obj << dendl;
- // skip bad objects, nothing we can do
- continue;
- }
-
- ObjectWriteOperation op;
- cls_refcount_put(op, ref_tag, true);
- rgw::AioResultList completed = aio->get(obj.obj,
- rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
- cost, id);
- }
- rgw::AioResultList completed = aio->drain();
- return ret_code;
- }
-
- //---------------------------------------------------------------------------
- int Background::inc_ref_count_by_manifest(const string &ref_tag,
- const string &oid,
- RGWObjManifest &manifest)
- {
- std::unique_ptr<rgw::Aio> aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, null_yield);
- rgw::AioResultList all_results;
- int ret = 0;
- unsigned idx = 0;
- for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
- rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
- if (oid == raw_obj.oid) {
- ldpp_dout(dpp, 20) << __func__ << "::[" << idx <<"] Skip HEAD OBJ: " << raw_obj.oid << dendl;
- continue;
- }
-
- rgw_rados_ref obj;
- ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest::failed to open context "
- << obj << dendl;
- break;
- }
-
- ObjectWriteOperation op;
- cls_refcount_get(op, ref_tag, true);
- ldpp_dout(dpp, 20) << __func__ << "::inc ref-count on tail object: " << raw_obj.oid << dendl;
- rgw::AioResultList completed = aio->get(obj.obj,
- rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
- cost, id);
- ret = rgw::check_for_errors(completed);
- all_results.splice(all_results.end(), completed);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to copy obj=" << obj
- << ", the error code = " << ret << dendl;
- break;
- }
- }
-
- if (ret == 0) {
- rgw::AioResultList completed = aio->drain();
- int ret = rgw::check_for_errors(completed);
- all_results.splice(all_results.end(), completed);
- if (ret == 0) {
- return 0;
- }
- else {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: manifest: failed to drain ios ret="
- << ret <<dendl;
- }
- }
-
- // if arrived here we failed somewhere -> rollback all ref-inc operations
- /* wait all pending op done */
- rgw::AioResultList completed = aio->drain();
- all_results.splice(all_results.end(), completed);
- int ret2 = 0;
- for (auto& aio_res : all_results) {
- if (aio_res.result < 0) {
- continue; // skip errors
- }
- rgw_rados_ref obj;
- ret2 = rgw_get_rados_ref(dpp, rados_handle, aio_res.obj, &obj);
- if (ret2 < 0) {
- continue;
- }
-
- ObjectWriteOperation op;
- cls_refcount_put(op, ref_tag, true);
- rgw::AioResultList completed = aio->get(obj.obj,
- rgw::Aio::librados_op(obj.ioctx, std::move(op), null_yield),
- cost, id);
- ret2 = rgw::check_for_errors(completed);
- if (ret2 < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: cleanup after error failed to drop reference on obj=" << aio_res.obj << dendl;
- }
- }
- completed = aio->drain();
- ret2 = rgw::check_for_errors(completed);
- if (ret2 < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to drain rollback ios, ret="
- << ret2 <<dendl;
- }
-
- return ret;
- }
-
- //---------------------------------------------------------------------------
- static int get_ioctx(const DoutPrefixProvider* const dpp,
- rgw::sal::Driver* driver,
- RGWRados* rados,
- const disk_record_t *p_rec,
- librados::IoCtx *p_ioctx,
- std::string *oid)
- {
- unique_ptr<rgw::sal::Bucket> bucket;
- {
- rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
- int ret = driver->load_bucket(dpp, b, &bucket, null_yield);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: driver->load_bucket(): "
- << cpp_strerror(-ret) << dendl;
- return ret;
- }
- }
-
- build_oid(p_rec->bucket_id, p_rec->obj_name, oid);
- //ldpp_dout(dpp, 0) << __func__ << "::OID=" << oid << " || bucket_id=" << bucket_id << dendl;
- rgw_pool data_pool;
- rgw_obj obj{bucket->get_key(), *oid};
- if (!rados->get_obj_data_pool(bucket->get_placement_rule(), obj, &data_pool)) {
- ldpp_dout(dpp, 1) << __func__ << "::failed to get data pool for bucket "
- << bucket->get_name() << dendl;
- return -EIO;
- }
- int ret = rgw_init_ioctx(dpp, rados->get_rados_handle(), data_pool, *p_ioctx);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to get ioctx from data pool:"
- << data_pool.to_str() << dendl;
- return -EIO;
- }
-
- return 0;
- }
-
- //---------------------------------------------------------------------------
- static void init_cmp_pairs(const disk_record_t *p_rec,
- const bufferlist &etag_bl,
- bufferlist &hash_bl, // OUT PARAM
- librados::ObjectWriteOperation *p_op)
- {
- p_op->cmpxattr(RGW_ATTR_ETAG, CEPH_OSD_CMPXATTR_OP_EQ, etag_bl);
- // TBD: do we really need the secondary compare using the full manifest?
- // Can replace it with something cheaper like size/version?
- p_op->cmpxattr(RGW_ATTR_MANIFEST, CEPH_OSD_CMPXATTR_OP_EQ, p_rec->manifest_bl);
-
- // BLAKE3 hash has 256 bit splitted into multiple 64bit units
- const unsigned units = (256 / (sizeof(uint64_t)*8));
- static_assert(units == 4);
- for (unsigned i = 0; i < units; i++) {
- ceph::encode(p_rec->s.hash[i], hash_bl);
- }
-
- if (!p_rec->s.flags.hash_calculated()) {
- p_op->cmpxattr(RGW_ATTR_BLAKE3, CEPH_OSD_CMPXATTR_OP_EQ, hash_bl);
- }
- }
-
- //---------------------------------------------------------------------------
- int Background::dedup_object(const disk_record_t *p_src_rec,
- const disk_record_t *p_tgt_rec,
- md5_stats_t *p_stats,
- bool has_shared_manifest_src)
- {
- RGWObjManifest src_manifest;
- try {
- auto bl_iter = p_src_rec->manifest_bl.cbegin();
- decode(src_manifest, bl_iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest" << dendl;
- return -EINVAL;
- }
- RGWObjManifest tgt_manifest;
- try {
- auto bl_iter = p_tgt_rec->manifest_bl.cbegin();
- decode(tgt_manifest, bl_iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad tgt manifest" << dendl;
- return -EINVAL;
- }
- ldpp_dout(dpp, 20) << __func__ << "::DEDUP From: "
- << p_src_rec->bucket_name << "/" << p_src_rec->obj_name << " -> "
- << p_tgt_rec->bucket_name << "/" << p_tgt_rec->obj_name << dendl;
-
- bufferlist etag_bl;
- etag_to_bufferlist(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, p_tgt_rec->s.num_parts, &etag_bl);
- ldpp_dout(dpp, 20) << __func__ << "::num_parts=" << p_tgt_rec->s.num_parts
- << "::ETAG=" << etag_bl.to_str() << dendl;
-
- bufferlist hash_bl, manifest_hash_bl, tgt_hash_bl;
- crypto::digest<crypto::SHA1>(p_src_rec->manifest_bl).encode(hash_bl);
- // Use a shorter hash (64bit instead of 160bit)
- hash_bl.splice(0, 8, &manifest_hash_bl);
- librados::ObjectWriteOperation tgt_op;
- init_cmp_pairs(p_tgt_rec, etag_bl, tgt_hash_bl, &tgt_op);
- tgt_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
- tgt_op.setxattr(RGW_ATTR_MANIFEST, p_src_rec->manifest_bl);
- if (p_tgt_rec->s.flags.hash_calculated()) {
- tgt_op.setxattr(RGW_ATTR_BLAKE3, tgt_hash_bl);
- p_stats->set_hash_attrs++;
- }
-
- std::string src_oid, tgt_oid;
- librados::IoCtx src_ioctx, tgt_ioctx;
- int ret1 = get_ioctx(dpp, driver, rados, p_src_rec, &src_ioctx, &src_oid);
- int ret2 = get_ioctx(dpp, driver, rados, p_tgt_rec, &tgt_ioctx, &tgt_oid);
- if (unlikely(ret1 != 0 || ret2 != 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed get_ioctx()" << dendl;
- return (ret1 ? ret1 : ret2);
- }
-
- // TBD: Do we need to remove target RGW_ATTR_TAIL_TAG??
- string ref_tag = p_tgt_rec->ref_tag;
- ldpp_dout(dpp, 20) << __func__ << "::ref_tag=" << ref_tag << dendl;
- int ret = inc_ref_count_by_manifest(ref_tag, src_oid, src_manifest);
- if (ret == 0) {
- ldpp_dout(dpp, 20) << __func__ << "::send TGT CLS (Shared_Manifest)" << dendl;
- ret = tgt_ioctx.operate(tgt_oid, &tgt_op);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed tgt_ioctx.operate("
- << tgt_oid << "), err is " << cpp_strerror(-ret) << dendl;
- rollback_ref_by_manifest(ref_tag, src_oid, src_manifest);
- return ret;
- }
-
- // free tail objects based on TGT manifest
- free_tail_objs_by_manifest(ref_tag, tgt_oid, tgt_manifest);
-
- if (!has_shared_manifest_src) {
- // When SRC OBJ A has two or more dups (B, C) we set SHARED_MANIFEST
- // after deduping B and update it in dedup_table, but don't update the
- // disk-record (as require an expensive random-disk-write).
- // When deduping C we can trust the shared_manifest state in the table and
- // skip a redundant update to SRC object attribute
- bufferlist src_hash_bl;
- librados::ObjectWriteOperation src_op;
- init_cmp_pairs(p_src_rec, etag_bl, src_hash_bl, &src_op);
- src_op.setxattr(RGW_ATTR_SHARE_MANIFEST, manifest_hash_bl);
- if (p_src_rec->s.flags.hash_calculated()) {
- src_op.setxattr(RGW_ATTR_BLAKE3, src_hash_bl);
- p_stats->set_hash_attrs++;
- }
-
- ldpp_dout(dpp, 20) << __func__ <<"::send SRC CLS (Shared_Manifest)"<< dendl;
- ret = src_ioctx.operate(src_oid, &src_op);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed src_ioctx.operate("
- << src_oid << "), err is " << cpp_strerror(-ret)<<dendl;
- return ret;
- }
- }
- }
-
- // do we need to set compression on the head object or is it set on tail?
- // RGW_ATTR_COMPRESSION
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int Background::calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash)
- {
- ldpp_dout(dpp, 20) << __func__ << "::obj_name=" << p_rec->obj_name << dendl;
- RGWObjManifest manifest;
- try {
- auto bl_iter = p_rec->manifest_bl.cbegin();
- decode(manifest, bl_iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad src manifest for: "
- << p_rec->obj_name << dendl;
- return -EINVAL;
- }
-
- blake3_hasher hmac;
- blake3_hasher_init(&hmac);
- for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p) {
- rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
- rgw_rados_ref obj;
- int ret = rgw_get_rados_ref(dpp, rados_handle, raw_obj, &obj);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::failed rgw_get_rados_ref() for oid: "
- << raw_obj.oid << ", err is " << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- bufferlist bl;
- librados::IoCtx ioctx = obj.ioctx;
- // read full object
- ret = ioctx.read(raw_obj.oid, bl, 0, 0);
- if (ret > 0) {
- for (const auto& bptr : bl.buffers()) {
- blake3_hasher_update(&hmac, (const unsigned char *)bptr.c_str(), bptr.length());
- }
- }
- else {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read " << raw_obj.oid
- << ", error is " << cpp_strerror(-ret) << dendl;
- return ret;
- }
- }
-
- blake3_hasher_finalize(&hmac, p_hash, BLAKE3_OUT_LEN);
- return 0;
- }
-
- //---------------------------------------------------------------------------
- [[maybe_unused]]static void __attribute__ ((noinline))
- print_record(const DoutPrefixProvider* dpp,
- const disk_record_t *p_tgt_rec,
- disk_block_id_t block_id,
- record_id_t rec_id,
- md5_shard_t md5_shard)
- {
- ldpp_dout(dpp, 20) << __func__ << "::bucket=" << p_tgt_rec->bucket_name
- << ", obj=" << p_tgt_rec->obj_name
- << ", block_id=" << block_id
- << ", rec_id=" << (int)rec_id
- << ", md5_shard=" << (int)md5_shard << dendl;
-
- ldpp_dout(dpp, 20) << __func__ << "::md5_shard=" << (int)md5_shard
- << "::" << p_tgt_rec->bucket_name
- << "/" << p_tgt_rec->obj_name
- << "::num_parts=" << p_tgt_rec->s.num_parts
- << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high
- << p_tgt_rec->s.md5_low << std::dec << dendl;
- }
-
- //---------------------------------------------------------------------------
- int Background::add_obj_attrs_to_record(rgw_bucket *p_rb,
- disk_record_t *p_rec,
- const rgw::sal::Attrs &attrs,
- dedup_table_t *p_table,
- md5_stats_t *p_stats) /*IN-OUT*/
- {
- // if TAIL_TAG exists -> use it as ref-tag, eitherwise take ID_TAG
- auto itr = attrs.find(RGW_ATTR_TAIL_TAG);
- if (itr != attrs.end()) {
- p_rec->ref_tag = itr->second.to_str();
- }
- else {
- itr = attrs.find(RGW_ATTR_ID_TAG);
- if (itr != attrs.end()) {
- p_rec->ref_tag = itr->second.to_str();
- }
- else {
- ldpp_dout(dpp, 5) << __func__ << "::No TAIL_TAG and no ID_TAG" << dendl;
- return -EINVAL;
- }
- }
- p_rec->s.ref_tag_len = p_rec->ref_tag.length();
-
- // clear bufferlist first
- p_rec->manifest_bl.clear();
-
- itr = attrs.find(RGW_ATTR_MANIFEST);
- if (itr != attrs.end()) {
- const bufferlist &bl = itr->second;
- RGWObjManifest manifest;
- try {
- auto bl_iter = bl.cbegin();
- decode(manifest, bl_iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__
- << "::ERROR: unable to decode manifest" << dendl;
- return -EINVAL;
- }
-
- // force explicit tail_placement as the dedup could be on another bucket
- const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
- if (tail_placement.bucket.name.empty()) {
- ldpp_dout(dpp, 20) << __func__ << "dedup::updating tail placement" << dendl;
- manifest.set_tail_placement(tail_placement.placement_rule, *p_rb);
- encode(manifest, p_rec->manifest_bl);
- }
- else {
- p_rec->manifest_bl = bl;
- }
- p_rec->s.manifest_len = p_rec->manifest_bl.length();
- }
- else {
- ldpp_dout(dpp, 5) << __func__ << "::ERROR: no manifest" << dendl;
- return -EINVAL;
- }
-
- itr = attrs.find(RGW_ATTR_SHARE_MANIFEST);
- if (itr != attrs.end()) {
- uint64_t hash = 0;
- try {
- auto bl_iter = itr->second.cbegin();
- ceph::decode(hash, bl_iter);
- p_rec->s.shared_manifest = hash;
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad shared_manifest" << dendl;
- return -EINVAL;
- }
- ldpp_dout(dpp, 20) << __func__ << "::Set Shared_Manifest::OBJ_NAME="
- << p_rec->obj_name << "::shared_manifest=0x" << std::hex
- << p_rec->s.shared_manifest << std::dec << dendl;
- p_rec->s.flags.set_shared_manifest();
- }
- else {
- memset(&p_rec->s.shared_manifest, 0, sizeof(p_rec->s.shared_manifest));
- }
-
- itr = attrs.find(RGW_ATTR_BLAKE3);
- if (itr != attrs.end()) {
- try {
- auto bl_iter = itr->second.cbegin();
- // BLAKE3 hash 256 bit splitted into multiple 64bit units
- const unsigned units = (256 / (sizeof(uint64_t)*8));
- static_assert(units == 4);
- for (unsigned i = 0; i < units; i++) {
- uint64_t val;
- ceph::decode(val, bl_iter);
- p_rec->s.hash[i] = val;
- }
- p_stats->valid_hash_attrs++;
- return 0;
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed HASH decode" << dendl;
- return -EINVAL;
- }
- }
-
- p_stats->invalid_hash_attrs++;
- // TBD: redundant memset...
- memset(p_rec->s.hash, 0, sizeof(p_rec->s.hash));
- // BLAKE3_OUT_LEN is 32 Bytes
- int ret = calc_object_blake3(p_rec, (uint8_t*)p_rec->s.hash);
- if (ret == 0) {
- p_rec->s.flags.set_hash_calculated();
- }
-
- return ret;
- }
-
- //---------------------------------------------------------------------------
- // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table
- // so all entries left are sources of dedup with multiple copies.
- // Need to read attributes from the Head-Object and output them to a new SLAB
- int Background::read_object_attribute(dedup_table_t *p_table,
- disk_record_t *p_rec,
- disk_block_id_t old_block_id,
- record_id_t old_rec_id,
- md5_shard_t md5_shard,
- md5_stats_t *p_stats /* IN-OUT */,
- disk_block_seq_t *p_disk,
- remapper_t *remapper)
- {
- bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw_dedup, 20>();
- if (unlikely(should_print_debug)) {
- print_record(dpp, p_rec, old_block_id, old_rec_id, md5_shard);
- }
- p_stats->processed_objects ++;
-
- uint32_t size_4k_units = byte_size_to_disk_blocks(p_rec->s.obj_bytes_size);
- uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
- storage_class_idx_t sc_idx = remapper->remap(p_rec->stor_class, dpp,
- &p_stats->failed_map_overflow);
- if (unlikely(sc_idx == remapper_t::NULL_IDX)) {
- // TBD: need stat counters
- return -EOVERFLOW;
- }
- key_t key_from_bucket_index(p_rec->s.md5_high, p_rec->s.md5_low, size_4k_units,
- p_rec->s.num_parts, sc_idx);
- dedup_table_t::value_t src_val;
- int ret = p_table->get_val(&key_from_bucket_index, &src_val);
- if (ret != 0) {
- if (ondisk_byte_size <= d_min_obj_size_for_dedup && p_rec->s.num_parts == 0) {
- // record has no valid entry in table because it is a too small
- // It was loaded to table for calculation and then purged
- p_stats->skipped_purged_small++;
- ldpp_dout(dpp, 20) << __func__ << "::skipped purged small obj::"
- << p_rec->obj_name << "::" << ondisk_byte_size << dendl;
- // help small object tests pass - avoid complication differentiating between
- // small objects ( < 64KB, >= 64KB <= 4MB, > 4MB
- p_stats->processed_objects--;
- }
- else {
- // record has no valid entry in table because it is a singleton
- p_stats->skipped_singleton++;
- p_stats->skipped_singleton_bytes += ondisk_byte_size;
- ldpp_dout(dpp, 20) << __func__ << "::skipped singleton::"
- << p_rec->obj_name << std::dec << dendl;
- }
- return 0;
- }
-
- // Every object after this point was counted as a dedup potential
- // If we conclude that it can't be dedup it should be accounted for
- rgw_bucket b{p_rec->tenant_name, p_rec->bucket_name, p_rec->bucket_id};
- unique_ptr<rgw::sal::Bucket> bucket;
- ret = driver->load_bucket(dpp, b, &bucket, null_yield);
- if (unlikely(ret != 0)) {
- // could happen when the bucket is removed between passes
- p_stats->ingress_failed_load_bucket++;
- ldpp_dout(dpp, 15) << __func__ << "::Failed driver->load_bucket(): "
- << cpp_strerror(-ret) << dendl;
- return 0;
- }
-
- unique_ptr<rgw::sal::Object> p_obj = bucket->get_object(p_rec->obj_name);
- if (unlikely(!p_obj)) {
- // could happen when the object is removed between passes
- p_stats->ingress_failed_get_object++;
- ldpp_dout(dpp, 15) << __func__ << "::Failed bucket->get_object("
- << p_rec->obj_name << ")" << dendl;
- return 0;
- }
-
- ret = p_obj->get_obj_attrs(null_yield, dpp);
- if (unlikely(ret < 0)) {
- p_stats->ingress_failed_get_obj_attrs++;
- ldpp_dout(dpp, 10) << __func__ << "::ERR: failed to stat object(" << p_rec->obj_name
- << "), returned error: " << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- const rgw::sal::Attrs& attrs = p_obj->get_attrs();
- if (attrs.find(RGW_ATTR_CRYPT_MODE) != attrs.end()) {
- p_stats->ingress_skip_encrypted++;
- p_stats->ingress_skip_encrypted_bytes += ondisk_byte_size;
- ldpp_dout(dpp, 20) <<__func__ << "::Skipping encrypted object "
- << p_rec->obj_name << dendl;
- return 0;
- }
-
- // TBD: We should be able to support RGW_ATTR_COMPRESSION when all copies are compressed
- if (attrs.find(RGW_ATTR_COMPRESSION) != attrs.end()) {
- p_stats->ingress_skip_compressed++;
- p_stats->ingress_skip_compressed_bytes += ondisk_byte_size;
- ldpp_dout(dpp, 20) <<__func__ << "::Skipping compressed object "
- << p_rec->obj_name << dendl;
- return 0;
- }
-
- // extract ETAG and Size and compare with values taken from the bucket-index
- parsed_etag_t parsed_etag;
- auto itr = attrs.find(RGW_ATTR_ETAG);
- if (itr != attrs.end()) {
- if (unlikely(!parse_etag_string(itr->second.to_str(), &parsed_etag))) {
- p_stats->ingress_corrupted_etag++;
- ldpp_dout(dpp, 10) << __func__ << "::ERROR: corrupted etag::" << p_rec->obj_name << dendl;
- return -EINVAL;
- }
- }
- else {
- p_stats->ingress_corrupted_etag++;
- ldpp_dout(dpp, 10) << __func__ << "::ERROR: no etag" << p_rec->obj_name << dendl;
- return -EINVAL;
- }
-
- std::string storage_class;
- itr = attrs.find(RGW_ATTR_STORAGE_CLASS);
- if (itr != attrs.end()) {
- storage_class = itr->second.to_str();
- }
- else {
- storage_class = RGW_STORAGE_CLASS_STANDARD;
- }
- // no need to check for remap success as we compare keys bellow
- sc_idx = remapper->remap(storage_class, dpp, &p_stats->failed_map_overflow);
- key_t key_from_obj(parsed_etag.md5_high, parsed_etag.md5_low,
- byte_size_to_disk_blocks(p_obj->get_size()),
- parsed_etag.num_parts, sc_idx);
- if (unlikely(key_from_obj != key_from_bucket_index ||
- p_rec->s.obj_bytes_size != p_obj->get_size())) {
- ldpp_dout(dpp, 15) <<__func__ << "::Skipping changed object "
- << p_rec->obj_name << dendl;
- p_stats->ingress_skip_changed_objs++;
- return 0;
- }
-
- // reset flags
- p_rec->s.flags.clear();
- ret = add_obj_attrs_to_record(&b, p_rec, attrs, p_table, p_stats);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 5) << __func__ << "::ERR: failed add_obj_attrs_to_record() ret="
- << ret << "::" << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- disk_block_seq_t::record_info_t rec_info;
- ret = p_disk->add_record(d_dedup_cluster_ioctx, p_rec, &rec_info);
- if (ret == 0) {
- // set the disk_block_id_t to this unless the existing disk_block_id is marked as shared-manifest
- ceph_assert(rec_info.rec_id < MAX_REC_IN_BLOCK);
- ldpp_dout(dpp, 20) << __func__ << "::" << p_rec->bucket_name << "/"
- << p_rec->obj_name << " was written to block_idx="
- << rec_info.block_id << "::rec_id=" << (int)rec_info.rec_id
- << "::shared_manifest=" << p_rec->has_shared_manifest() << dendl;
- p_table->update_entry(&key_from_bucket_index, rec_info.block_id,
- rec_info.rec_id, p_rec->has_shared_manifest());
- }
- else {
- ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed p_disk->add_record()"<< dendl;
- if (ret == -EINVAL) {
- p_stats->ingress_corrupted_obj_attrs++;
- }
- }
- return ret;
- }
-
- //---------------------------------------------------------------------------
- static int write_blake3_object_attribute(const DoutPrefixProvider* const dpp,
- rgw::sal::Driver* driver,
- RGWRados* rados,
- const disk_record_t *p_rec)
- {
- bufferlist etag_bl;
- bufferlist hash_bl;
- librados::ObjectWriteOperation op;
- etag_to_bufferlist(p_rec->s.md5_high, p_rec->s.md5_low, p_rec->s.num_parts,
- &etag_bl);
- init_cmp_pairs(p_rec, etag_bl, hash_bl /*OUT PARAM*/, &op);
- op.setxattr(RGW_ATTR_BLAKE3, hash_bl);
-
- std::string oid;
- librados::IoCtx ioctx;
- int ret = get_ioctx(dpp, driver, rados, p_rec, &ioctx, &oid);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 5) << __func__ << "::ERR: failed get_ioctx()" << dendl;
- return ret;
- }
-
- ret = ioctx.operate(oid, &op);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ioctx.operate("
- << oid << "), err is " << cpp_strerror(-ret) << dendl;
- }
- return ret;
- }
-
- //---------------------------------------------------------------------------
- // We purged all entries not marked for-dedup (i.e. singleton bit is set) from the table
- // so all entries left are sources of dedup with multiple copies.
- // If the record is marked as Shared-Manifest-Object -> skip it
- // if the record's key doesn’t exist in table -> skip it (it is a singleton and it was purged)
- // If the record block-index matches the hashtable entry -> skip it (it is the SRC object)
- // All other entries are Dedicated-Manifest-Objects with a valid SRC object
-
- // we can withstand most errors moving to the next object
- // only report an error if we recived a stop scan request!
- //
- int Background::try_deduping_record(dedup_table_t *p_table,
- const disk_record_t *p_tgt_rec,
- disk_block_id_t block_id,
- record_id_t rec_id,
- md5_shard_t md5_shard,
- md5_stats_t *p_stats, /* IN-OUT */
- remapper_t *remapper)
- {
- bool should_print_debug = cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>();
- if (unlikely(should_print_debug)) {
- print_record(dpp, p_tgt_rec, block_id, rec_id, md5_shard);
- }
-
- uint32_t size_4k_units = byte_size_to_disk_blocks(p_tgt_rec->s.obj_bytes_size);
- storage_class_idx_t sc_idx = remapper->remap(p_tgt_rec->stor_class, dpp,
- &p_stats->failed_map_overflow);
- ceph_assert(sc_idx != remapper_t::NULL_IDX);
- key_t key(p_tgt_rec->s.md5_high, p_tgt_rec->s.md5_low, size_4k_units,
- p_tgt_rec->s.num_parts, sc_idx);
- dedup_table_t::value_t src_val;
- int ret = p_table->get_val(&key, &src_val);
- if (ret != 0) {
- // record has no valid entry in table because it is a singleton
- // should never happened since we purged all singletons before
- ldpp_dout(dpp, 5) << __func__ << "::skipped singleton::" << p_tgt_rec->bucket_name
- << "/" << p_tgt_rec->obj_name << "::num_parts=" << p_tgt_rec->s.num_parts
- << "::ETAG=" << std::hex << p_tgt_rec->s.md5_high
- << p_tgt_rec->s.md5_low << std::dec << dendl;
- ceph_abort("Unexpcted singleton");
- return 0;
- }
-
- disk_block_id_t src_block_id = src_val.block_idx;
- record_id_t src_rec_id = src_val.rec_id;
- if (block_id == src_block_id && rec_id == src_rec_id) {
- // the table entry point to this record which means it is a dedup source so nothing to do
- p_stats->skipped_source_record++;
- ldpp_dout(dpp, 20) << __func__ << "::skipped source-record" << dendl;
- return 0;
- }
-
- // ceph store full blocks so need to round up and multiply by block_size
- uint64_t ondisk_byte_size = disk_blocks_to_byte_size(size_4k_units);
- uint64_t dedupable_objects_bytes = calc_deduped_bytes(d_head_object_size,
- p_tgt_rec->s.num_parts,
- ondisk_byte_size);
- if (p_tgt_rec->s.flags.has_shared_manifest()) {
- // record holds a shared_manifest object so can't be a dedup target
- p_stats->skipped_shared_manifest++;
- p_stats->shared_manifest_dedup_bytes += dedupable_objects_bytes;
- ldpp_dout(dpp, 20) << __func__ << "::skipped shared_manifest" << dendl;
- return 0;
- }
-
- // This records is a dedup target with source record on source_block_id
- disk_record_t src_rec;
- ret = load_record(d_dedup_cluster_ioctx, p_tgt_rec, &src_rec, src_block_id,
- src_rec_id, md5_shard, dpp);
- if (unlikely(ret != 0)) {
- p_stats->failed_src_load++;
- // we can withstand most errors moving to the next object
- ldpp_dout(dpp, 5) << __func__ << "::ERR: Failed load_record("
- << src_block_id << ", " << src_rec_id << ")" << dendl;
- return 0;
- }
-
- ldpp_dout(dpp, 20) << __func__ << "::SRC=" << src_rec.bucket_name
- << "/" << src_rec.obj_name << dendl;
- // verify that SRC and TGT records don't refer to the same physical object
- // This could happen in theory if we read the same objects twice
- if (src_rec.obj_name == p_tgt_rec->obj_name && src_rec.bucket_name == p_tgt_rec->bucket_name) {
- p_stats->duplicate_records++;
- ldpp_dout(dpp, 10) << __func__ << "::WARN: Duplicate records for object="
- << src_rec.obj_name << dendl;
- return 0;
- }
-
- // the hash table size is rounded to the nearest 4KB and will wrap after 16G
- if (unlikely(src_rec.s.obj_bytes_size != p_tgt_rec->s.obj_bytes_size)) {
- p_stats->size_mismatch++;
- ldpp_dout(dpp, 10) << __func__ << "::WARN: different byte size for objects::"
- << src_rec.obj_name << "::" << src_rec.s.obj_bytes_size
- << "::" << p_tgt_rec->obj_name << "::"
- << p_tgt_rec->s.obj_bytes_size << dendl;
- return 0;
- }
-
- if (memcmp(src_rec.s.hash, p_tgt_rec->s.hash, sizeof(src_rec.s.hash)) != 0) {
- p_stats->hash_mismatch++;
- ldpp_dout(dpp, 10) << __func__ << "::HASH mismatch" << dendl;
- // TBD: set hash attributes on head objects to save calc next time
- if (src_rec.s.flags.hash_calculated()) {
- write_blake3_object_attribute(dpp, driver, rados, &src_rec);
- p_stats->set_hash_attrs++;
- }
- if (p_tgt_rec->s.flags.hash_calculated()) {
- write_blake3_object_attribute(dpp, driver, rados, p_tgt_rec);
- p_stats->set_hash_attrs++;
- }
- return 0;
- }
-
- ret = dedup_object(&src_rec, p_tgt_rec, p_stats, src_val.has_shared_manifest());
- if (ret == 0) {
- p_stats->deduped_objects++;
- p_stats->deduped_objects_bytes += dedupable_objects_bytes;
- if (p_tgt_rec->s.num_parts == 0) {
- // single part objects duplicate the head object when dedup is used
- p_stats->dup_head_bytes += d_head_object_size;
- }
-
- // mark the SRC object as a providor of a shared manifest
- if (!src_val.has_shared_manifest()) {
- p_stats->set_shared_manifest_src++;
- // set the shared manifest flag in the dedup table
- p_table->set_shared_manifest_src_mode(&key, src_block_id, src_rec_id);
- }
- else {
- ldpp_dout(dpp, 20) << __func__ << "::SRC object already marked as shared_manifest" << dendl;
- }
- }
- else {
- ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed dedup for "
- << src_rec.bucket_name << "/" << src_rec.obj_name << dendl;
- p_stats->failed_dedup++;
- }
-
- return 0;
- }
-
-#endif // #ifdef FULL_DEDUP_SUPPORT
- //---------------------------------------------------------------------------
- const char* Background::dedup_step_name(dedup_step_t step)
- {
- static const char* names[] = {"STEP_NONE",
- "STEP_BUCKET_INDEX_INGRESS",
- "STEP_BUILD_TABLE",
- "STEP_READ_ATTRIBUTES",
- "STEP_REMOVE_DUPLICATES"};
- static const char* undefined_step = "UNDEFINED_STEP";
- if (step >= STEP_NONE && step <= STEP_REMOVE_DUPLICATES) {
- return names[step];
- }
- else {
- return undefined_step;
- }
- }
-
- //---------------------------------------------------------------------------
- int Background::process_all_slabs(dedup_table_t *p_table,
- dedup_step_t step,
- md5_shard_t md5_shard,
- work_shard_t worker_id,
- uint32_t *p_slab_count,
- md5_stats_t *p_stats, /* IN-OUT */
- disk_block_seq_t *p_disk_block_seq,
- remapper_t *remapper)
- {
- char block_buff[sizeof(disk_block_t)];
- const int MAX_OBJ_LOAD_FAILURE = 3;
- const int MAX_BAD_BLOCKS = 2;
- bool has_more = true;
- uint32_t seq_number = 0;
- int failure_count = 0;
- ldpp_dout(dpp, 20) << __func__ << "::" << dedup_step_name(step) << "::worker_id="
- << worker_id << ", md5_shard=" << md5_shard << dendl;
- *p_slab_count = 0;
- while (has_more) {
- bufferlist bl;
- int ret = load_slab(d_dedup_cluster_ioctx, bl, md5_shard, worker_id, seq_number, dpp);
- if (unlikely(ret < 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR::Failed loading object!! md5_shard=" << md5_shard
- << ", worker_id=" << worker_id << ", seq_number=" << seq_number
- << ", failure_count=" << failure_count << dendl;
- // skip to the next SLAB stopping after 3 bad objects
- if (failure_count++ < MAX_OBJ_LOAD_FAILURE) {
- seq_number += DISK_BLOCK_COUNT;
- continue;
- }
- else {
- return ret;
- }
- }
-
- (*p_slab_count)++;
- failure_count = 0;
- unsigned slab_rec_count = 0;
- auto bl_itr = bl.cbegin();
- for (uint32_t block_num = 0; block_num < DISK_BLOCK_COUNT; block_num++, seq_number++) {
- disk_block_id_t disk_block_id(worker_id, seq_number);
- const char *p = get_next_data_ptr(bl_itr, block_buff, sizeof(block_buff),
- dpp);
- disk_block_t *p_disk_block = (disk_block_t*)p;
- disk_block_header_t *p_header = p_disk_block->get_header();
- p_header->deserialize();
- if (unlikely(p_header->verify(disk_block_id, dpp) != 0)) {
- p_stats->failed_block_load++;
- // move to next block until reaching a valid block
- if (failure_count++ < MAX_BAD_BLOCKS) {
- continue;
- }
- else {
- ldpp_dout(dpp, 1) << __func__ << "::Skipping slab with too many bad blocks::"
- << (int)md5_shard << ", worker_id=" << (int)worker_id
- << ", seq_number=" << seq_number << dendl;
- failure_count = 0;
- break;
- }
- }
-
- if (p_header->rec_count == 0) {
- ldpp_dout(dpp, 20) << __func__ << "::Block #" << block_num
- << " has an empty header, no more blocks" << dendl;
- has_more = false;
- break;
- }
-
- for (unsigned rec_id = 0; rec_id < p_header->rec_count; rec_id++) {
- unsigned offset = p_header->rec_offsets[rec_id];
- // We deserialize the record inside the CTOR
- disk_record_t rec(p + offset);
- ret = rec.validate(__func__, dpp, disk_block_id, rec_id);
- if (unlikely(ret != 0)) {
- p_stats->failed_rec_load++;
- return ret;
- }
-
- if (step == STEP_BUILD_TABLE) {
- add_record_to_dedup_table(p_table, &rec, disk_block_id, rec_id, p_stats, remapper);
- slab_rec_count++;
- }
-#ifdef FULL_DEDUP_SUPPORT
- else if (step == STEP_READ_ATTRIBUTES) {
- read_object_attribute(p_table, &rec, disk_block_id, rec_id, md5_shard,
- p_stats, p_disk_block_seq, remapper);
- slab_rec_count++;
- }
- else if (step == STEP_REMOVE_DUPLICATES) {
- try_deduping_record(p_table, &rec, disk_block_id, rec_id, md5_shard,
- p_stats, remapper);
- slab_rec_count++;
- }
-#endif // #ifdef FULL_DEDUP_SUPPORT
- else {
- ceph_abort("unexpected step");
- }
- }
-
- check_and_update_md5_heartbeat(md5_shard, p_stats->loaded_objects,
- p_stats->processed_objects);
- if (unlikely(d_ctl.should_pause())) {
- handle_pause_req(__func__);
- }
- if (unlikely(d_ctl.should_stop())) {
- return -ECANCELED;
- }
-
- has_more = (p_header->offset == BLOCK_MAGIC);
- ceph_assert(p_header->offset == BLOCK_MAGIC || p_header->offset == LAST_BLOCK_MAGIC);
- if (!has_more) {
- ldpp_dout(dpp, 20) << __func__ << "::No more blocks! block_id=" << disk_block_id
- << ", rec_count=" << p_header->rec_count << dendl;
- break;
- }
- }
- ldpp_dout(dpp, 20) <<__func__ << "::slab seq_number=" << seq_number
- << ", rec_count=" << slab_rec_count << dendl;
- }
- return 0;
- }
-
- //---------------------------------------------------------------------------
- static void __attribute__ ((noinline))
- show_ingress_bucket_idx_obj(const DoutPrefixProvider *dpp,
- const parsed_etag_t &parsed_etag,
- const string &bucket_name,
- const string &obj_name)
- {
- ldpp_dout(dpp, 20) << __func__ << "::(1)::" << bucket_name << "/" << obj_name
- << "::num_parts=" << parsed_etag.num_parts
- << "::ETAG=" << std::hex << parsed_etag.md5_high
- << parsed_etag.md5_low << std::dec << dendl;
- }
-
- //---------------------------------------------------------------------------
- int Background::ingress_bucket_idx_single_object(disk_block_array_t &disk_arr,
- const rgw::sal::Bucket *p_bucket,
- const rgw_bucket_dir_entry &entry,
- worker_stats_t *p_worker_stats /*IN-OUT*/)
- {
- // ceph store full blocks so need to round up and multiply by block_size
- uint64_t ondisk_byte_size = calc_on_disk_byte_size(entry.meta.size);
- // count all objects including too small and non default storage_class objs
- p_worker_stats->ingress_obj++;
- p_worker_stats->ingress_obj_bytes += ondisk_byte_size;
-
- parsed_etag_t parsed_etag;
- if (unlikely(!parse_etag_string(entry.meta.etag, &parsed_etag))) {
- p_worker_stats->ingress_corrupted_etag++;
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: corrupted etag" << dendl;
- return -EINVAL;
- }
-
- if (unlikely((cct->_conf->subsys.should_gather<ceph_subsys_rgw_dedup, 20>()))) {
- show_ingress_bucket_idx_obj(dpp, parsed_etag, p_bucket->get_name(), entry.key.name);
- }
-
- // We limit dedup to objects from the same storage_class
- // TBD:
- // Should we use a skip-list of storage_classes we should skip (like glacier) ?
- const std::string& storage_class =
- rgw_placement_rule::get_canonical_storage_class(entry.meta.storage_class);
- if (storage_class == RGW_STORAGE_CLASS_STANDARD) {
- p_worker_stats->default_storage_class_objs++;
- p_worker_stats->default_storage_class_objs_bytes += ondisk_byte_size;
- }
- else {
- ldpp_dout(dpp, 20) << __func__ << "::" << entry.key.name
- << "::storage_class:" << entry.meta.storage_class << dendl;
- p_worker_stats->non_default_storage_class_objs++;
- p_worker_stats->non_default_storage_class_objs_bytes += ondisk_byte_size;
- }
-
- if (ondisk_byte_size <= d_min_obj_size_for_dedup) {
- if (parsed_etag.num_parts == 0) {
- // dedup only useful for objects bigger than 4MB
- p_worker_stats->ingress_skip_too_small++;
- p_worker_stats->ingress_skip_too_small_bytes += ondisk_byte_size;
-
- if (ondisk_byte_size >= 64*1024) {
- p_worker_stats->ingress_skip_too_small_64KB++;
- p_worker_stats->ingress_skip_too_small_64KB_bytes += ondisk_byte_size;
- }
- else {
- return 0;
- }
- }
- else {
- // multipart objects are always good candidates for dedup
- // the head object is empty and data is stored only in tail objs
- p_worker_stats->small_multipart_obj++;
- }
- }
- // multipart/single_part counters are for objects being fully processed
- if (parsed_etag.num_parts > 0) {
- p_worker_stats->multipart_objs++;
- }
- else {
- p_worker_stats->single_part_objs++;
- }
-
- return add_disk_rec_from_bucket_idx(disk_arr, p_bucket, &parsed_etag,
- entry.key.name, entry.meta.size,
- storage_class);
- }
-
- //---------------------------------------------------------------------------
- void Background::check_and_update_heartbeat(unsigned shard_id, uint64_t count_a,
- uint64_t count_b, const char *prefix)
- {
- utime_t now = ceph_clock_now();
- utime_t time_elapsed = now - d_heart_beat_last_update;
- if (unlikely(time_elapsed.tv.tv_sec >= d_heart_beat_max_elapsed_sec)) {
- ldpp_dout(dpp, 20) << __func__ << "::max_elapsed_sec="
- << d_heart_beat_max_elapsed_sec << dendl;
- d_heart_beat_last_update = now;
- d_cluster.update_shard_token_heartbeat(store, shard_id, count_a, count_b,
- prefix);
- }
- }
-
- //---------------------------------------------------------------------------
- void Background::check_and_update_worker_heartbeat(work_shard_t worker_id,
- int64_t ingress_obj_count)
- {
- check_and_update_heartbeat(worker_id, ingress_obj_count, 0, WORKER_SHARD_PREFIX);
- }
-
- //---------------------------------------------------------------------------
- void Background::check_and_update_md5_heartbeat(md5_shard_t md5_id,
- uint64_t load_count,
- uint64_t dedup_count)
- {
- check_and_update_heartbeat(md5_id, load_count, dedup_count, MD5_SHARD_PREFIX);
- }
-
- //---------------------------------------------------------------------------
- static uint32_t move_to_next_bucket_index_shard(const DoutPrefixProvider* dpp,
- unsigned current_shard,
- unsigned num_work_shards,
- const std::string &bucket_name,
- rgw_obj_index_key *p_marker /* OUT-PARAM */)
- {
- uint32_t next_shard = current_shard + num_work_shards;
- ldpp_dout(dpp, 20) << __func__ << "::" << bucket_name << "::curr_shard="
- << current_shard << ", next shard=" << next_shard << dendl;
- *p_marker = rgw_obj_index_key(); // reset marker to an empty index
- return next_shard;
- }
-
- // This function process bucket-index shards of a given @bucket
- // The bucket-index-shards are stored in a group of @oids
- // The @oids are using a simple map from the shard-id to the oid holding bucket-indices
- // We start by processing all bucket-indices owned by this @worker-id
- // Once we are done with a given bucket-index shard we skip to the next
- // bucket-index-shard owned by this worker-id
- // if (bucket_index_shard % work_id) == 0) -> read and process bucket_index_shard
- // else -> skip bucket_index_shard and don't read it
- //---------------------------------------------------------------------------
- int Background::process_bucket_shards(disk_block_array_t &disk_arr,
- const rgw::sal::Bucket *bucket,
- std::map<int, string> &oids,
- librados::IoCtx &ioctx,
- work_shard_t worker_id,
- work_shard_t num_work_shards,
- worker_stats_t *p_worker_stats /*IN-OUT*/)
- {
- const uint32_t num_shards = oids.size();
- uint32_t current_shard = worker_id;
- rgw_obj_index_key marker; // start with an empty marker
- const string null_prefix, null_delimiter;
- const bool list_versions = true;
- const int max_entries = 1000;
- uint32_t obj_count = 0;
-
- while (current_shard < num_shards ) {
- check_and_update_worker_heartbeat(worker_id, p_worker_stats->ingress_obj);
- if (unlikely(d_ctl.should_pause())) {
- handle_pause_req(__func__);
- }
- if (unlikely(d_ctl.should_stop())) {
- return -ECANCELED;
- }
-
- const string& oid = oids[current_shard];
- rgw_cls_list_ret result;
- librados::ObjectReadOperation op;
- // get bucket-indices of @current_shard
- cls_rgw_bucket_list_op(op, marker, null_prefix, null_delimiter, max_entries,
- list_versions, &result);
- int ret = rgw_rados_operate(dpp, ioctx, oid, std::move(op), nullptr, null_yield);
- if (unlikely(ret < 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_rados_operate() ret="
- << ret << "::" << cpp_strerror(-ret) << dendl;
- current_shard = move_to_next_bucket_index_shard(dpp, current_shard, num_work_shards,
- bucket->get_name(), &marker);
- continue;
- }
- obj_count += result.dir.m.size();
- for (auto& entry : result.dir.m) {
- const rgw_bucket_dir_entry& dirent = entry.second;
- if (unlikely((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty())) {
- // TBD: should we bailout ???
- ldpp_dout(dpp, 1) << __func__ << "::ERR: calling check_disk_state bucket="
- << bucket->get_name() << " entry=" << dirent.key << dendl;
- // make sure we're advancing marker
- marker = dirent.key;
- continue;
- }
- marker = dirent.key;
- ret = ingress_bucket_idx_single_object(disk_arr, bucket, dirent, p_worker_stats);
- }
- // TBD: advance marker only once here!
- if (result.is_truncated) {
- ldpp_dout(dpp, 15) << __func__ << "::[" << current_shard
- << "]result.is_truncated::count=" << obj_count << dendl;
- }
- else {
- // we reached the end of this shard -> move to the next shard
- current_shard = move_to_next_bucket_index_shard(dpp, current_shard, num_work_shards,
- bucket->get_name(), &marker);
- ldpp_dout(dpp, 15) << __func__ << "::move_to_next_bucket_index_shard::count="
- << obj_count << "::new_shard=" << current_shard << dendl;
- }
- }
- ldpp_dout(dpp, 15) << __func__ << "::Finished processing Bucket "
- << bucket->get_name() << ", num_shards=" << num_shards
- << ", obj_count=" << obj_count << dendl;
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int Background::ingress_bucket_objects_single_shard(disk_block_array_t &disk_arr,
- const rgw_bucket &bucket_rec,
- work_shard_t worker_id,
- work_shard_t num_work_shards,
- worker_stats_t *p_worker_stats /*IN-OUT*/)
- {
- unique_ptr<rgw::sal::Bucket> bucket;
- int ret = driver->load_bucket(dpp, bucket_rec, &bucket, null_yield);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: driver->load_bucket(): "
- << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- const std::string bucket_id = bucket->get_key().get_key();
- RGWBucketInfo bucket_info;
- ret = rados->get_bucket_instance_info(bucket_id, bucket_info,
- nullptr, nullptr, null_yield, dpp);
- if (unlikely(ret < 0)) {
- if (ret == -ENOENT) {
- // probably a race condition with bucket removal
- ldpp_dout(dpp, 10) << __func__ << "::ret == -ENOENT" << dendl;
- return 0;
- }
- ldpp_dout(dpp, 5) << __func__ << "::ERROR: get_bucket_instance_info(), ret="
- << ret << "::" << cpp_strerror(-ret) << dendl;
- return ret;
- }
- const rgw::bucket_index_layout_generation idx_layout = bucket_info.layout.current_index;
- librados::IoCtx ioctx;
- // objects holding the bucket-listings
- std::map<int, std::string> oids;
- ret = store->svc()->bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt,
- idx_layout, &ioctx, &oids, nullptr);
- if (ret >= 0) {
- // process all the shards in this bucket owned by the worker_id
- return process_bucket_shards(disk_arr, bucket.get(), oids, ioctx, worker_id,
- num_work_shards, p_worker_stats);
- }
- else {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: open_bucket_index() ret="
- << ret << "::" << cpp_strerror(-ret) << dendl;
- return ret;
- }
- }
-
- //---------------------------------------------------------------------------
- static void display_table_stat_counters(const DoutPrefixProvider* dpp,
- const md5_stats_t *p_stats)
- {
- uint64_t obj_count_in_shard = (p_stats->big_objs_stat.singleton_count +
- p_stats->big_objs_stat.unique_count +
- p_stats->big_objs_stat.duplicate_count);
-
- ldpp_dout(dpp, 10) << "\n>>>>>" << __func__ << "::FINISHED STEP_BUILD_TABLE\n"
- << "::total_count=" << obj_count_in_shard
- << "::loaded_objects=" << p_stats->loaded_objects
- << p_stats->big_objs_stat << dendl;
- ldpp_dout(dpp, 10) << __func__ << "::small objs::"
- << p_stats->small_objs_stat << dendl;
- }
-
- //---------------------------------------------------------------------------
- int Background::objects_dedup_single_md5_shard(dedup_table_t *p_table,
- md5_shard_t md5_shard,
- md5_stats_t *p_stats,
- work_shard_t num_work_shards)
- {
- remapper_t remapper(MAX_STORAGE_CLASS_IDX);
- // make sure that the standard storage_class is always in the mapper!
- storage_class_idx_t sc_idx = remapper.remap(RGW_STORAGE_CLASS_STANDARD, dpp,
- &p_stats->failed_map_overflow);
- ceph_assert(sc_idx == 0);
- uint32_t slab_count_arr[num_work_shards];
- // first load all etags to hashtable to find dedups
- // the entries come from bucket-index and got minimal info (etag, size)
- for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) {
- process_all_slabs(p_table, STEP_BUILD_TABLE, md5_shard, worker_id,
- slab_count_arr+worker_id, p_stats, nullptr, &remapper);
- if (unlikely(d_ctl.should_stop())) {
- ldpp_dout(dpp, 5) << __func__ << "::STEP_BUILD_TABLE::STOPPED\n" << dendl;
- return -ECANCELED;
- }
- }
- p_table->count_duplicates(&p_stats->small_objs_stat, &p_stats->big_objs_stat,
- &p_stats->dup_head_bytes_estimate);
- display_table_stat_counters(dpp, p_stats);
-
- ldpp_dout(dpp, 10) << __func__ << "::MD5 Loop::" << d_ctl.dedup_type << dendl;
- if (d_ctl.dedup_type != dedup_req_type_t::DEDUP_TYPE_FULL) {
- for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) {
- remove_slabs(worker_id, md5_shard, slab_count_arr[worker_id]);
- }
- return 0;
- }
-
-#ifndef FULL_DEDUP_SUPPORT
- // we don't support full dedup with this release
- return 0;
-#endif
-
- p_table->remove_singletons_and_redistribute_keys();
- // The SLABs holds minimal data set brought from the bucket-index
- // Objects participating in DEDUP need to read attributes from the Head-Object
- // TBD - find a better name than num_work_shards for the combined output
- {
- disk_block_t arr[DISK_BLOCK_COUNT];
- worker_stats_t wstat;
- disk_block_seq_t disk_block_seq(dpp, arr, num_work_shards, md5_shard, &wstat);
- for (work_shard_t worker_id = 0; worker_id < num_work_shards; worker_id++) {
- process_all_slabs(p_table, STEP_READ_ATTRIBUTES, md5_shard, worker_id,
- slab_count_arr+worker_id, p_stats, &disk_block_seq, &remapper);
- if (unlikely(d_ctl.should_stop())) {
- ldpp_dout(dpp, 5) << __func__ << "::STEP_READ_ATTRIBUTES::STOPPED\n" << dendl;
- return -ECANCELED;
- }
- // we finished processing output SLAB from @worker_id -> remove them
- remove_slabs(worker_id, md5_shard, slab_count_arr[worker_id]);
- }
- disk_block_seq.flush_disk_records(d_dedup_cluster_ioctx);
- }
-
- ldpp_dout(dpp, 10) << __func__ << "::STEP_REMOVE_DUPLICATES::started..." << dendl;
- uint32_t slab_count = 0;
- process_all_slabs(p_table, STEP_REMOVE_DUPLICATES, md5_shard, num_work_shards,
- &slab_count, p_stats, nullptr, &remapper);
- if (unlikely(d_ctl.should_stop())) {
- ldpp_dout(dpp, 5) << __func__ << "::STEP_REMOVE_DUPLICATES::STOPPED\n" << dendl;
- return -ECANCELED;
- }
- ldpp_dout(dpp, 10) << __func__ << "::STEP_REMOVE_DUPLICATES::finished..." << dendl;
- // remove the special SLAB holding aggragted data
- remove_slabs(num_work_shards, md5_shard, slab_count);
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int Background::read_bucket_stats(const rgw_bucket &bucket_rec,
- uint64_t *p_num_obj,
- uint64_t *p_size)
- {
- unique_ptr<rgw::sal::Bucket> bucket;
- int ret = driver->load_bucket(dpp, bucket_rec, &bucket, null_yield);
- if (unlikely(ret != 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: driver->load_bucket(): "
- << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- const auto& index = bucket->get_info().get_current_index();
- if (is_layout_indexless(index)) {
- ldpp_dout(dpp, 1) << __func__
- << "::ERR, indexless buckets do not maintain stats; bucket="
- << bucket->get_name() << dendl;
- return -EINVAL;
- }
-
- std::map<RGWObjCategory, RGWStorageStats> stats;
- std::string bucket_ver, master_ver;
- std::string max_marker;
- ret = bucket->read_stats(dpp, null_yield, index, RGW_NO_SHARD, &bucket_ver,
- &master_ver, stats, &max_marker);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR getting bucket stats bucket="
- << bucket->get_name() << " ret=" << ret << dendl;
- return ret;
- }
-
- for (auto itr = stats.begin(); itr != stats.end(); ++itr) {
- RGWStorageStats& s = itr->second;
- ldpp_dout(dpp, 20) << __func__ << "::" << bucket->get_name() << "::"
- << to_string(itr->first) << "::num_obj=" << s.num_objects
- << "::size=" << s.size << dendl;
- *p_num_obj += s.num_objects;
- *p_size += s.size;
- }
-
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int Background::collect_all_buckets_stats()
- {
- int ret = 0;
- std::string section("bucket.instance");
- std::string marker;
- void *handle = nullptr;
- ret = driver->meta_list_keys_init(dpp, section, marker, &handle);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed meta_list_keys_init: "
- << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- d_all_buckets_obj_count = 0;
- d_all_buckets_obj_size = 0;
-
- bool has_more = true;
- while (has_more) {
- std::list<std::string> entries;
- constexpr int max_keys = 1000;
- ret = driver->meta_list_keys_next(dpp, handle, max_keys, entries, &has_more);
- if (ret == 0) {
- for (auto& entry : entries) {
- ldpp_dout(dpp, 20) <<__func__ << "::bucket_name=" << entry << dendl;
- rgw_bucket bucket;
- ret = rgw_bucket_parse_bucket_key(cct, entry, &bucket, nullptr);
- if (unlikely(ret < 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed rgw_bucket_parse_bucket_key: "
- << cpp_strerror(-ret) << dendl;
- goto err;
- }
- ldpp_dout(dpp, 20) <<__func__ << "::bucket=" << bucket << dendl;
- ret = read_bucket_stats(bucket, &d_all_buckets_obj_count,
- &d_all_buckets_obj_size);
- if (unlikely(ret != 0)) {
- goto err;
- }
- }
- driver->meta_list_keys_complete(handle);
- }
- else {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed driver->meta_list_keys_next()" << dendl;
- goto err;
- }
- }
- ldpp_dout(dpp, 10) <<__func__
- << "::all_buckets_obj_count=" << d_all_buckets_obj_count
- << "::all_buckets_obj_size=" << d_all_buckets_obj_size
- << dendl;
- return 0;
-
- err:
- ldpp_dout(dpp, 1) << __func__ << "::error handler" << dendl;
- // reset counters to mark that we don't have the info
- d_all_buckets_obj_count = 0;
- d_all_buckets_obj_size = 0;
- if (handle) {
- driver->meta_list_keys_complete(handle);
- }
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int Background::objects_ingress_single_work_shard(work_shard_t worker_id,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards,
- worker_stats_t *p_worker_stats,
- uint8_t *raw_mem,
- uint64_t raw_mem_size)
- {
- int ret = 0;
- std::string section("bucket.instance");
- std::string marker;
- void *handle = nullptr;
- ret = driver->meta_list_keys_init(dpp, section, marker, &handle);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed meta_list_keys_init: "
- << cpp_strerror(-ret) << dendl;
- return ret;
- }
- disk_block_array_t disk_arr(dpp, raw_mem, raw_mem_size, worker_id,
- p_worker_stats, num_md5_shards);
- bool has_more = true;
- // iterate over all buckets
- while (ret == 0 && has_more) {
- std::list<std::string> entries;
- constexpr int max_keys = 1000;
- ret = driver->meta_list_keys_next(dpp, handle, max_keys, entries, &has_more);
- if (ret == 0) {
- ldpp_dout(dpp, 20) <<__func__ << "::entries.size()=" << entries.size() << dendl;
- for (auto& entry : entries) {
- ldpp_dout(dpp, 20) <<__func__ << "::bucket_name=" << entry << dendl;
- rgw_bucket bucket;
- ret = rgw_bucket_parse_bucket_key(cct, entry, &bucket, nullptr);
- if (unlikely(ret < 0)) {
- // bad bucket entry, skip to the next one
- ldpp_dout(dpp, 1) << __func__ << "::ERR: Failed rgw_bucket_parse_bucket_key: "
- << cpp_strerror(-ret) << dendl;
- continue;
- }
- ldpp_dout(dpp, 20) <<__func__ << "::bucket=" << bucket << dendl;
- ret = ingress_bucket_objects_single_shard(disk_arr, bucket, worker_id,
- num_work_shards, p_worker_stats);
- if (unlikely(ret != 0)) {
- if (d_ctl.should_stop()) {
- driver->meta_list_keys_complete(handle);
- return -ECANCELED;
- }
- ldpp_dout(dpp, 1) << __func__ << "::Failed ingress_bucket_objects_single_shard()" << dendl;
- // skip bad bucket and move on to the next one
- continue;
- }
- }
- driver->meta_list_keys_complete(handle);
- }
- else {
- ldpp_dout(dpp, 1) << __func__ << "::failed driver->meta_list_keys_next()" << dendl;
- driver->meta_list_keys_complete(handle);
- // TBD: what can we do here?
- break;
- }
- }
- ldpp_dout(dpp, 20) <<__func__ << "::flush_output_buffers() worker_id="
- << worker_id << dendl;
- disk_arr.flush_output_buffers(dpp, d_dedup_cluster_ioctx);
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int Background::remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count)
- {
- unsigned failure_count = 0;
-
- for (uint32_t slab_id = 0; slab_id < slab_count; slab_id++) {
- uint32_t seq_number = disk_block_id_t::slab_id_to_seq_num(slab_id);
- disk_block_id_t block_id(worker_id, seq_number);
- std::string oid(block_id.get_slab_name(md5_shard));
- ldpp_dout(dpp, 20) << __func__ << "::calling ioctx->remove(" << oid << ")" << dendl;
- int ret = d_dedup_cluster_ioctx.remove(oid);
- if (ret != 0) {
- ldpp_dout(dpp, 0) << __func__ << "::ERR Failed ioctx->remove(" << oid << ")" << dendl;
- failure_count++;
- }
- }
-
- return failure_count;
- }
-
- //---------------------------------------------------------------------------
- int Background::f_ingress_work_shard(unsigned worker_id,
- uint8_t *raw_mem,
- uint64_t raw_mem_size,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards)
- {
- ldpp_dout(dpp, 20) << __func__ << "::worker_id=" << worker_id << dendl;
- utime_t start_time = ceph_clock_now();
- worker_stats_t worker_stats;
- int ret = objects_ingress_single_work_shard(worker_id, num_work_shards, num_md5_shards,
- &worker_stats,raw_mem, raw_mem_size);
- if (ret == 0) {
- worker_stats.duration = ceph_clock_now() - start_time;
- d_cluster.mark_work_shard_token_completed(store, worker_id, &worker_stats);
- ldpp_dout(dpp, 10) << "stat counters [worker]:\n" << worker_stats << dendl;
- ldpp_dout(dpp, 10) << "Shard Process Duration = "
- << worker_stats.duration << dendl;
- }
- //ldpp_dout(dpp, 0) << __func__ << "::sleep for 2 seconds\n" << dendl;
- //std::this_thread::sleep_for(std::chrono::seconds(2));
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int Background::f_dedup_md5_shard(unsigned md5_shard,
- uint8_t *raw_mem,
- uint64_t raw_mem_size,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards)
- {
- utime_t start_time = ceph_clock_now();
- md5_stats_t md5_stats;
- //DEDUP_DYN_ALLOC
- dedup_table_t table(dpp, d_head_object_size, raw_mem, raw_mem_size);
- int ret = objects_dedup_single_md5_shard(&table, md5_shard, &md5_stats, num_work_shards);
- if (ret == 0) {
- md5_stats.duration = ceph_clock_now() - start_time;
- d_cluster.mark_md5_shard_token_completed(store, md5_shard, &md5_stats);
- ldpp_dout(dpp, 10) << "stat counters [md5]:\n" << md5_stats << dendl;
- ldpp_dout(dpp, 10) << "Shard Process Duration = "
- << md5_stats.duration << dendl;
- }
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int Background::process_all_shards(bool ingress_work_shards,
- int (Background::*func)(unsigned, uint8_t*, uint64_t, work_shard_t, md5_shard_t),
- uint8_t *raw_mem,
- uint64_t raw_mem_size,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards)
- {
- while (true) {
- d_heart_beat_last_update = ceph_clock_now();
- uint16_t shard_id;
- if (ingress_work_shards) {
- shard_id = d_cluster.get_next_work_shard_token(store, num_work_shards);
- }
- else {
- shard_id = d_cluster.get_next_md5_shard_token(store, num_md5_shards);
- }
-
- // start with a common error handler
- if (shard_id != NULL_SHARD) {
- ldpp_dout(dpp, 10) << __func__ << "::Got shard_id=" << shard_id << dendl;
- int ret = (this->*func)(shard_id, raw_mem, raw_mem_size, num_work_shards,
- num_md5_shards);
- if (unlikely(ret != 0)) {
- if (d_ctl.should_stop()) {
- ldpp_dout(dpp, 5) << __func__ << "::stop execution" << dendl;
- return -ECANCELED;
- }
- else {
- ldpp_dout(dpp, 5) << __func__ << "::Skip shard #" << shard_id << dendl;
- }
- }
- }
- else {
- ldpp_dout(dpp, 10) << __func__ << "::finished processing all shards" <<dendl;
- break;
- }
- } // while loop
- return 0;
- }
-
- //---------------------------------------------------------------------------
- [[maybe_unused]]static int collect_pool_stats(const DoutPrefixProvider* const dpp,
- RGWRados* rados,
- uint64_t *p_num_objects,
- uint64_t *p_num_objects_bytes)
- {
- *p_num_objects = 0;
- *p_num_objects_bytes = 0;
- list<string> vec;
- vec.push_back("default.rgw.buckets.data");
- map<string,librados::pool_stat_t> stats;
- auto rados_handle = rados->get_rados_handle();
- int ret = rados_handle->get_pool_stats(vec, stats);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << __func__ << ":ERROR: fetching pool stats: "
- << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- for (auto i = stats.begin(); i != stats.end(); ++i) {
- const char *pool_name = i->first.c_str();
- librados::pool_stat_t& s = i->second;
- // TBD: add support for EC
- // We need to find the user byte size without the added protection
- double replica_level = (double)s.num_object_copies / s.num_objects;
- *p_num_objects = s.num_objects;
- *p_num_objects_bytes = s.num_bytes / replica_level;
- ldpp_dout(dpp, 10) <<__func__ << "::" << pool_name << "::num_objects="
- << s.num_objects << "::num_copies=" << s.num_object_copies
- << "::num_bytes=" << s.num_bytes << "/" << *p_num_objects_bytes << dendl;
- }
- return 0;
- }
-
- //-------------------------------------------------------------------------------
- // 32B per object-entry in the hashtable
- // 2MB per shard-buffer
- //=============||==============||=========||===================================||
- // Obj Count || shard count || memory || calculation ||
- // ------------||--------------||---------||---------------------------------- ||
- // 1M || 4 || 8MB || 8MB/32 = 0.25M * 4 = 1M ||
- // 4M || 8 || 16MB || 16MB/32 = 0.50M * 8 = 4M ||
- //-------------------------------------------------------------------------------
- // 16M || 16 || 32MB || 32MB/32 = 1.00M * 16 = 16M ||
- //-------------------------------------------------------------------------------
- // 64M || 32 || 64MB || 64MB/32 = 2.00M * 32 = 64M ||
- // 256M || 64 || 128MB || 128MB/32 = 4.00M * 64 = 256M ||
- // 1024M( 1G) || 128 || 256MB || 256MB/32 = 8.00M * 128 = 1024M ||
- // 4096M( 4G) || 256 || 512MB || 512MB/32 = 16M.00 * 256 = 4096M ||
- // 16384M(16G) || 512 || 1024MB || 1024MB/32 = 32M.00 * 512 = 16384M ||
- //-------------||--------------||---------||-----------------------------------||
- static md5_shard_t calc_num_md5_shards(uint64_t obj_count)
- {
- // create headroom by allocating space for a 10% bigger system
- obj_count = obj_count + (obj_count/10);
-
- uint64_t M = 1024 * 1024;
- if (obj_count < 1*M) {
- // less than 1M objects -> use 4 shards (8MB)
- return 4;
- }
- else if (obj_count < 4*M) {
- // less than 4M objects -> use 8 shards (16MB)
- return 8;
- }
- else if (obj_count < 16*M) {
- // less than 16M objects -> use 16 shards (32MB)
- return 16;
- }
- else if (obj_count < 64*M) {
- // less than 64M objects -> use 32 shards (64MB)
- return 32;
- }
- else if (obj_count < 256*M) {
- // less than 256M objects -> use 64 shards (128MB)
- return 64;
- }
- else if (obj_count < 1024*M) {
- // less than 1024M objects -> use 128 shards (256MB)
- return 128;
- }
- else if (obj_count < 4*1024*M) {
- // less than 4096M objects -> use 256 shards (512MB)
- return 256;
- }
- else {
- return 512;
- }
- }
-
- //---------------------------------------------------------------------------
- int Background::setup(dedup_epoch_t *p_epoch)
- {
- int ret = collect_all_buckets_stats();
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- md5_shard_t num_md5_shards = calc_num_md5_shards(d_all_buckets_obj_count);
- num_md5_shards = std::min(num_md5_shards, MAX_MD5_SHARD);
- num_md5_shards = std::max(num_md5_shards, MIN_MD5_SHARD);
- work_shard_t num_work_shards = num_md5_shards;
- num_work_shards = std::min(num_work_shards, MAX_WORK_SHARD);
-
- ldpp_dout(dpp, 5) << __func__ << "::obj_count=" <<d_all_buckets_obj_count
- << "::num_md5_shards=" << num_md5_shards
- << "::num_work_shards=" << num_work_shards << dendl;
- // init handles and create the dedup_pool
- ret = init_rados_access_handles(true);
- if (ret != 0) {
- derr << "dedup_bg::resume() failed init_rados_access_handles() ret="
- << ret << "::" << cpp_strerror(-ret) << dendl;
- return ret;
- }
- display_ioctx_state(dpp, d_dedup_cluster_ioctx, __func__);
-
- ret = d_cluster.reset(store, p_epoch, num_work_shards, num_md5_shards);
- if (ret != 0) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed cluster.init()" << dendl;
- return ret;
- }
-
- if (unlikely(p_epoch->num_work_shards > MAX_WORK_SHARD)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: p_epoch->num_work_shards="
- << p_epoch->num_work_shards
- << " is larger than MAX_WORK_SHARD ("
- << MAX_WORK_SHARD << ")" << dendl;
- return -EOVERFLOW;
- }
- if (unlikely(p_epoch->num_md5_shards > MAX_MD5_SHARD)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: p_epoch->num_md5_shards="
- << p_epoch->num_md5_shards
- << " is larger than MAX_MD5_SHARD ("
- << MAX_MD5_SHARD << ")" << dendl;
- return -EOVERFLOW;
- }
-
- ldpp_dout(dpp, 10) <<__func__ << "::" << *p_epoch << dendl;
- d_ctl.dedup_type = p_epoch->dedup_type;
-#ifdef FULL_DEDUP_SUPPORT
- ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL ||
- d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
-#else
- ceph_assert(d_ctl.dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
-#endif
- ldpp_dout(dpp, 10) << __func__ << "::" << d_ctl.dedup_type << dendl;
-
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int Background::watch_reload(const DoutPrefixProvider* dpp)
- {
- return cluster::watch_reload(store, dpp, &d_watch_handle, &d_watcher_ctx);
- }
-
- //---------------------------------------------------------------------------
- int Background::unwatch_reload(const DoutPrefixProvider* dpp)
- {
- if (d_watch_handle == 0) {
- // nothing to unwatch
- ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload(): nothing to watch"
- << dendl;
- return 0;
- }
-
- ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload(): watch_handle="
- << d_watch_handle << dendl;
-
- int ret = cluster::unwatch_reload(store, dpp, d_watch_handle);
- if (ret == 0) {
- ldpp_dout(dpp, 5) << "dedup_bg::unwatch_reload():Stopped watching "
- << "::d_watch_handle=" << d_watch_handle << dendl;
- d_watch_handle = 0;
- }
- return ret;
- }
-
- //---------------------------------------------------------------------------
- void Background::handle_notify(uint64_t notify_id, uint64_t cookie, bufferlist &bl)
- {
- int ret = 0;
- int32_t urgent_msg = URGENT_MSG_NONE;
- try {
- auto bl_iter = bl.cbegin();
- ceph::decode(urgent_msg, bl_iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: bad urgent_msg" << dendl;
- ret = -EINVAL;
- }
- ldpp_dout(dpp, 5) << __func__ << "::-->" << get_urgent_msg_names(urgent_msg) << dendl;
-
- // use lock to prevent concurrent pause/resume requests
- std::unique_lock cond_lock(d_cond_mutex); // [------>open lock block
- if (unlikely(d_ctl.local_urgent_req())) {
- // can't operate when the system is paused/shutdown
- cond_lock.unlock(); // close lock block------>]
- ldpp_dout(dpp, 5) << __func__
- << "::system is paused/shutdown -> cancel notification" << dendl;
- cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, -EBUSY);
- return;
- }
-
- switch(urgent_msg) {
- case URGENT_MSG_ABORT:
- if (d_ctl.dedup_exec) {
- d_ctl.remote_abort_req = true;
- d_cond.notify_all();
- d_cond.wait(cond_lock, [this]{return d_ctl.remote_aborted || d_ctl.local_urgent_req();});
- d_ctl.remote_aborted ? ret = 0 : ret = -EBUSY;
- }
- else {
- ldpp_dout(dpp, 5) << __func__ << "::inactive dedup->nothing to do" << dendl;
- }
- break;
- case URGENT_MSG_RESTART:
- if (!d_ctl.dedup_exec) {
- d_ctl.remote_restart_req = true;
- d_cond.notify_all();
- }
- else {
- ldpp_dout(dpp, 5) << __func__ << "::\ncan't restart active dedup\n"<< dendl;
- ret = -EEXIST;
- }
- break;
- case URGENT_MSG_PASUE:
- if (d_ctl.dedup_exec && !d_ctl.remote_paused) {
- d_ctl.remote_pause_req = true;
- d_cond.notify_all();
- d_cond.wait(cond_lock, [this]{return d_ctl.remote_paused || d_ctl.local_urgent_req();});
- d_ctl.remote_paused ? ret = 0 : ret = -EBUSY;
- }
- else {
- if (d_ctl.remote_paused) {
- ldpp_dout(dpp, 5) << __func__ << "::dedup is already paused" << dendl;
- }
- else {
- ldpp_dout(dpp, 5) << __func__ << "::inactive dedup->nothing to do" << dendl;
- }
- }
- break;
- case URGENT_MSG_RESUME:
- if (d_ctl.remote_pause_req || d_ctl.remote_paused) {
- d_ctl.remote_pause_req = false;
- d_ctl.remote_paused = false;
- d_cond.notify_all();
- }
- else {
- ldpp_dout(dpp, 5) << __func__ << "::dedup is not paused->nothing to do" << dendl;
- }
- break;
- default:
- ldpp_dout(dpp, 1) << __func__ << "::unexpected urgent_msg: "
- << get_urgent_msg_names(urgent_msg) << dendl;
- ret = -EINVAL;
- }
-
- cond_lock.unlock(); // close lock block------>]
- cluster::ack_notify(store, dpp, &d_ctl, notify_id, cookie, ret);
- }
-
- //---------------------------------------------------------------------------
- void Background::start()
- {
- const DoutPrefixProvider* const dpp = &dp;
- ldpp_dout(dpp, 10) << __FILE__ << "::" <<__func__ << dendl;
- {
- std::unique_lock pause_lock(d_cond_mutex);
- if (d_ctl.started) {
- // start the thread only once
- ldpp_dout(dpp, 1) << "dedup_bg already started" << dendl;
- return;
- }
- d_ctl.started = true;
- }
- d_runner = std::thread(&Background::run, this);
- }
-
- //------------------------- --------------------------------------------------
- void Background::shutdown()
- {
- ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg shutdown()" << dendl;
- std::unique_lock cond_lock(d_cond_mutex);
- bool nested_call = false;
- if (d_ctl.shutdown_req) {
- // should never happen!
- ldpp_dout(dpp, 1) <<__func__ << "dedup_bg nested call" << dendl;
- nested_call = true;
- }
- d_ctl.shutdown_req = true;
- d_cond.notify_all();
- ldpp_dout(dpp, 1) <<__func__ << "dedup_bg shutdown waiting..." << dendl;
- d_cond.wait(cond_lock, [this]{return d_ctl.shutdown_done;});
- //cond_lock.unlock();
-
- if (nested_call) {
- ldpp_dout(dpp, 1) <<__func__ << "::nested call:: repeat notify" << dendl;
- d_cond.notify_all();
- }
-
- if (d_runner.joinable()) {
- ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg wait join()" << dendl;
- d_runner.join();
- ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg finished join()" << dendl;
- }
- else {
- ldpp_dout(dpp, 5) <<__func__ << "::dedup_bg not joinable()" << dendl;
- }
-
- d_ctl.reset();
- }
-
- //---------------------------------------------------------------------------
- void Background::pause()
- {
- display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->pause() request");
- std::unique_lock cond_lock(d_cond_mutex);
-
- if (d_ctl.local_paused || d_ctl.shutdown_done) {
- cond_lock.unlock();
- ldpp_dout(dpp, 1) << __FILE__ << "::" <<__func__
- << "::dedup_bg is already paused/stopped" << dendl;
- return;
- }
-
- bool nested_call = false;
- if (d_ctl.local_pause_req) {
- // should never happen!
- ldpp_dout(dpp, 1) <<__func__ << "::nested call" << dendl;
- nested_call = true;
- }
- d_ctl.local_pause_req = true;
- d_cond.notify_all();
- d_cond.wait(cond_lock, [this]{return d_ctl.local_paused||d_ctl.shutdown_done;});
- if (nested_call) {
- ldpp_dout(dpp, 1) << "dedup_bg::nested call:: repeat notify" << dendl;
- d_cond.notify_all();
- }
-
- // destory open watch request and pool handle before pause() is completed
- unwatch_reload(dpp);
- d_dedup_cluster_ioctx.close();
- ldpp_dout(dpp, 5) << "dedup_bg paused" << dendl;
- }
-
- //---------------------------------------------------------------------------
- void Background::resume(rgw::sal::Driver* _driver)
- {
- ldpp_dout(dpp, 5) << "dedup_bg->resume()" << dendl;
- // use lock to prevent concurrent pause/resume requests
- std::unique_lock cond_lock(d_cond_mutex);
-
- if (!d_ctl.local_paused) {
- cond_lock.unlock();
- ldpp_dout(dpp, 5) << "dedup_bg::resume thread is not paused!" << dendl;
- if (_driver != driver) {
- ldpp_dout(dpp, 1) << "dedup_bg attempt to change driver on an active system was refused" << dendl;
- }
- return;
- }
-
- driver = _driver;
- // can pool change its uid between pause/resume ???
- int ret = init_rados_access_handles(false);
- if (ret != 0) {
- derr << "dedup_bg::resume() failed init_rados_access_handles() ret="
- << ret << "::" << cpp_strerror(-ret) << dendl;
- throw std::runtime_error("Failed init_rados_access_handles()");
- }
- display_ioctx_state(dpp, d_dedup_cluster_ioctx, "dedup_bg->resume() done");
- // create new watch request using the new pool handle
- watch_reload(dpp);
- d_ctl.local_pause_req = false;
- d_ctl.local_paused = false;
-
- // wake up threads blocked after seeing pause state
- d_cond.notify_all();
- ldpp_dout(dpp, 5) << "dedup_bg was resumed" << dendl;
- }
-
- //---------------------------------------------------------------------------
- void Background::handle_pause_req(const char *caller)
- {
- ldpp_dout(dpp, 5) << __func__ << "::caller=" << caller << dendl;
- ldpp_dout(dpp, 5) << __func__ << "::" << d_ctl << dendl;
- while (d_ctl.local_pause_req || d_ctl.local_paused || d_ctl.remote_pause_req || d_ctl.remote_paused) {
- std::unique_lock cond_lock(d_cond_mutex);
- if (d_ctl.should_stop()) {
- ldpp_dout(dpp, 5) << __func__ << "::should_stop!" << dendl;
- return;
- }
-
- if (d_ctl.local_pause_req) {
- d_ctl.local_pause_req = false;
- d_ctl.local_paused = true;
- }
-
- if (d_ctl.remote_pause_req) {
- d_ctl.remote_pause_req = false;
- d_ctl.remote_paused = true;
- }
-
- d_cond.notify_all();
-
- if (d_ctl.local_paused) {
- ldpp_dout(dpp, 10) << __func__ << "::wait on d_ctl.local_paused" << dendl;
- d_cond.wait(cond_lock, [this]{return !d_ctl.local_paused || d_ctl.should_stop() ;});
- }
-
- if (d_ctl.remote_paused) {
- ldpp_dout(dpp, 10) << __func__ << "::wait on d_ctl.remote_paused" << dendl;
- d_cond.wait(cond_lock, [this]{return !d_ctl.remote_paused || d_ctl.should_stop() || d_ctl.local_pause_req;});
- }
- } // while loop
-
- ldpp_dout(dpp, 5) << "Dedup background thread resumed!" << dendl;
- }
-
- //---------------------------------------------------------------------------
- void Background::work_shards_barrier(work_shard_t num_work_shards)
- {
- // Wait for other worker to finish ingress step
- // We can move to the next step even if some token are in failed state
- const unsigned MAX_WAIT_SEC = 120; // wait 2 minutes for failing members
- unsigned ttl = 3;
- unsigned time_elapsed = 0;
-
- while (true) {
- int ret = d_cluster.all_work_shard_tokens_completed(store, num_work_shards);
- // we start incrementing time_elapsed only after all valid tokens finish
- if (ret == 0 || (time_elapsed > MAX_WAIT_SEC) ) {
- break;
- }
-
- ldpp_dout(dpp, 10) << __func__ << "::Wait for object ingress completion, ttl="
- << ttl << " seconds" << dendl;
- std::unique_lock cond_lock(d_cond_mutex);
- d_cond.wait_for(cond_lock, std::chrono::seconds(ttl),
- [this]{return d_ctl.should_stop() || d_ctl.should_pause();});
- if (unlikely(d_ctl.should_pause())) {
- handle_pause_req(__func__);
- }
- if (unlikely(d_ctl.should_stop())) {
- return;
- }
-
- if (ret != -EAGAIN) {
- // All incomplete tokens are corrupted or in time out state
- // Give them an extra 120 seconds just in case ...
- time_elapsed += ttl;
- }
- // else there are still good tokens in process, wait for them
- }
-
- ldpp_dout(dpp, 10) << "\n\n==Object Ingress step was completed on all shards==\n"
- << dendl;
- if (unlikely(d_ctl.should_pause())) {
- handle_pause_req(__func__);
- }
- }
-
- //---------------------------------------------------------------------------
- static bool all_md5_shards_completed(cluster *p_cluster,
- rgw::sal::RadosStore *store,
- md5_shard_t num_md5_shards)
- {
- return (p_cluster->all_md5_shard_tokens_completed(store, num_md5_shards) == 0);
- }
-
- //---------------------------------------------------------------------------
- void Background::md5_shards_barrier(md5_shard_t num_md5_shards)
- {
- // Wait for others to finish step
- unsigned ttl = 3;
- // require that everything completed successfully before deleting the pool
- while (!all_md5_shards_completed(&d_cluster, store, num_md5_shards)) {
- ldpp_dout(dpp, 10) << __func__ << "::Wait for md5 completion, ttl="
- << ttl << " seconds" << dendl;
- std::unique_lock cond_lock(d_cond_mutex);
- d_cond.wait_for(cond_lock, std::chrono::seconds(ttl),
- [this]{return d_ctl.should_stop() || d_ctl.should_pause();});
- if (unlikely(d_ctl.should_pause())) {
- handle_pause_req(__func__);
- }
- if (unlikely(d_ctl.should_stop())) {
- return;
- }
- }
-
- ldpp_dout(dpp, 10) << "\n\n==MD5 processing was completed on all shards!==\n"
- << dendl;
- if (unlikely(d_ctl.should_pause())) {
- handle_pause_req(__func__);
- }
- }
-
- //---------------------------------------------------------------------------
- void Background::run()
- {
- const auto rc = ceph_pthread_setname("dedup_bg");
- ldpp_dout(dpp, 5) << __func__ << "ceph_pthread_setname() ret=" << rc << dendl;
-
- // 256x8KB=2MB
- const uint64_t PER_SHARD_BUFFER_SIZE = DISK_BLOCK_COUNT *sizeof(disk_block_t);
- ldpp_dout(dpp, 20) <<__func__ << "::dedup::main loop" << dendl;
-
- while (!d_ctl.shutdown_req) {
- if (unlikely(d_ctl.should_pause())) {
- handle_pause_req(__func__);
- if (unlikely(d_ctl.should_stop())) {
- ldpp_dout(dpp, 5) <<__func__ << "::stop req after a pause" << dendl;
- d_ctl.dedup_exec = false;
- }
- }
-
- if (d_ctl.dedup_exec) {
- dedup_epoch_t epoch;
- if (setup(&epoch) != 0) {
- ldpp_dout(dpp, 1) << __func__ << "::failed setup()" << dendl;
- return;
- }
- const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
- int64_t pool_id = rados_handle->pool_lookup(dedup_pool.name.c_str());
- if (pool_id < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::bad pool_id" << dendl;
- return;
- }
- work_shard_t num_work_shards = epoch.num_work_shards;
- md5_shard_t num_md5_shards = epoch.num_md5_shards;
- const uint64_t RAW_MEM_SIZE = PER_SHARD_BUFFER_SIZE * num_md5_shards;
- ldpp_dout(dpp, 5) <<__func__ << "::RAW_MEM_SIZE=" << RAW_MEM_SIZE
- << "::num_work_shards=" << num_work_shards
- << "::num_md5_shards=" << num_md5_shards << dendl;
- // DEDUP_DYN_ALLOC
- auto raw_mem = std::make_unique<uint8_t[]>(RAW_MEM_SIZE);
- if (raw_mem == nullptr) {
- ldpp_dout(dpp, 1) << "failed slab memory allocation - size=" << RAW_MEM_SIZE << dendl;
- return;
- }
-
- process_all_shards(true, &Background::f_ingress_work_shard, raw_mem.get(),
- RAW_MEM_SIZE, num_work_shards, num_md5_shards);
- if (!d_ctl.should_stop()) {
- // Wait for all other workers to finish ingress step
- work_shards_barrier(num_work_shards);
- if (!d_ctl.should_stop()) {
- process_all_shards(false, &Background::f_dedup_md5_shard, raw_mem.get(),
- RAW_MEM_SIZE, num_work_shards, num_md5_shards);
- // Wait for all other md5 shards to finish
- md5_shards_barrier(num_md5_shards);
- safe_pool_delete(store, dpp, pool_id);
- }
- else {
- ldpp_dout(dpp, 5) <<__func__ << "::stop req from barrier" << dendl;
- }
- }
- else {
- ldpp_dout(dpp, 5) <<__func__ << "::stop req from ingress_work_shard" << dendl;
- }
- } // dedup_exec
-
- std::unique_lock cond_lock(d_cond_mutex);
- d_ctl.dedup_exec = false;
- if (d_ctl.remote_abort_req) {
- d_ctl.remote_aborted = true;
-
- d_ctl.remote_abort_req = false;
- d_ctl.remote_paused = false;
- d_cond.notify_all();
- ldpp_dout(dpp, 5) << __func__ << "::Dedup was aborted on a remote req" << dendl;
- }
- d_cond.wait(cond_lock, [this]{return d_ctl.remote_restart_req || d_ctl.should_stop() || d_ctl.should_pause();});
- if (!d_ctl.should_stop() && !d_ctl.should_pause()) {
- // TBD: should we release lock here ???
- if (d_cluster.can_start_new_scan(store)) {
- d_ctl.dedup_exec = true;
- d_ctl.remote_aborted = false;
- d_ctl.remote_paused = false;
- d_ctl.remote_restart_req = false;
- d_cond.notify_all();
- }
- }else if (d_ctl.should_stop()) {
- ldpp_dout(dpp, 5) << "main loop::should_stop::" << d_ctl << dendl;
- }
- else {
- ldpp_dout(dpp, 5) << "main loop::should_pause::" << d_ctl << dendl;
- }
- }
- d_ctl.shutdown_done = true;
- d_cond.notify_all();
- // shutdown
- ldpp_dout(dpp, 5) << __func__ << "::Dedup background thread stopped" << dendl;
- }
-
-}; //namespace rgw::dedup
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-#include "common/dout.h"
-#include "rgw_common.h"
-#include "rgw_dedup_utils.h"
-#include "rgw_dedup_table.h"
-#include "rgw_dedup_cluster.h"
-#include "rgw_realm_reloader.h"
-#include <string>
-#include <unordered_map>
-#include <variant>
-#include <iostream>
-#include <ostream>
-
-namespace rgw::dedup {
- struct dedup_epoch_t;
- struct control_t {
- control_t() {
- reset();
- }
- void reset();
- inline bool local_urgent_req() const {
- return (shutdown_req || local_pause_req);
- }
- inline bool should_stop() const {
- return (shutdown_req || remote_abort_req);
- }
- inline bool should_pause() const {
- return (local_pause_req || remote_pause_req);
- }
-
- // allow to start/pasue/resume/stop execution
- dedup_req_type_t dedup_type = dedup_req_type_t::DEDUP_TYPE_NONE;
- bool started = false;
- bool dedup_exec = false;
- bool shutdown_req = false;
- bool shutdown_done = false;
- bool local_pause_req = false;
- bool local_paused = false;
- bool remote_abort_req = false;
- bool remote_aborted = false;
- bool remote_pause_req = false;
- bool remote_paused = false;
- bool remote_restart_req = false;
- };
- std::ostream& operator<<(std::ostream &out, const control_t &ctl);
- void encode(const control_t& ctl, ceph::bufferlist& bl);
- void decode(control_t& ctl, ceph::bufferlist::const_iterator& bl);
- class remapper_t;
- class disk_block_seq_t;
- struct disk_record_t;
- struct key_t;
- //Interval between each execution of the script is set to 5 seconds
- static inline constexpr int INIT_EXECUTE_INTERVAL = 5;
- class Background : public RGWRealmReloader::Pauser {
- class DedupWatcher : public librados::WatchCtx2 {
- Background* const parent;
- public:
- DedupWatcher(Background* _parent) : parent(_parent) {}
- ~DedupWatcher() override = default;
- void handle_notify(uint64_t notify_id, uint64_t cookie,
- uint64_t notifier_id, bufferlist& bl) override;
- void handle_error(uint64_t cookie, int err) override;
- };
-
- public:
- Background(rgw::sal::Driver* _driver, CephContext* _cct);
- int watch_reload(const DoutPrefixProvider* dpp);
- int unwatch_reload(const DoutPrefixProvider* dpp);
- void handle_notify(uint64_t notify_id, uint64_t cookie, bufferlist &bl);
- void start();
- void shutdown();
- void pause() override;
- void resume(rgw::sal::Driver* _driver) override;
-
- private:
- enum dedup_step_t {
- STEP_NONE,
- STEP_BUCKET_INDEX_INGRESS,
- STEP_BUILD_TABLE,
- STEP_READ_ATTRIBUTES,
- STEP_REMOVE_DUPLICATES
- };
-
- void run();
- int setup(struct dedup_epoch_t*);
- void work_shards_barrier(work_shard_t num_work_shards);
- void md5_shards_barrier(md5_shard_t num_md5_shards);
- void handle_pause_req(const char* caller);
- const char* dedup_step_name(dedup_step_t step);
- int read_buckets();
- void check_and_update_heartbeat(unsigned shard_id, uint64_t count_a, uint64_t count_b,
- const char *prefix);
-
- inline void check_and_update_worker_heartbeat(work_shard_t worker_id, int64_t obj_count);
- inline void check_and_update_md5_heartbeat(md5_shard_t md5_id,
- uint64_t load_count,
- uint64_t dedup_count);
- int ingress_bucket_idx_single_object(disk_block_array_t &disk_arr,
- const rgw::sal::Bucket *bucket,
- const rgw_bucket_dir_entry &entry,
- worker_stats_t *p_worker_stats /*IN-OUT*/);
- int process_bucket_shards(disk_block_array_t &disk_arr,
- const rgw::sal::Bucket *bucket,
- std::map<int,std::string> &oids,
- librados::IoCtx &ioctx,
- work_shard_t shard_id,
- work_shard_t num_work_shards,
- worker_stats_t *p_worker_stats /*IN-OUT*/);
- int ingress_bucket_objects_single_shard(disk_block_array_t &disk_arr,
- const rgw_bucket &bucket_rec,
- work_shard_t worker_id,
- work_shard_t num_work_shards,
- worker_stats_t *p_worker_stats /*IN-OUT*/);
- int objects_ingress_single_work_shard(work_shard_t worker_id,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards,
- worker_stats_t *p_worker_stats,
- uint8_t *raw_mem,
- uint64_t raw_mem_size);
- int f_ingress_work_shard(unsigned shard_id,
- uint8_t *raw_mem,
- uint64_t raw_mem_size,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards);
- int f_dedup_md5_shard(unsigned shard_id,
- uint8_t *raw_mem,
- uint64_t raw_mem_size,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards);
- int process_all_shards(bool ingress_work_shards,
- int (Background::* func)(unsigned, uint8_t*, uint64_t, work_shard_t, md5_shard_t),
- uint8_t *raw_mem,
- uint64_t raw_mem_size,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards);
- int read_bucket_stats(const rgw_bucket &bucket_rec,
- uint64_t *p_num_obj,
- uint64_t *p_size);
- int collect_all_buckets_stats();
- int objects_dedup_single_md5_shard(dedup_table_t *p_table,
- md5_shard_t md5_shard,
- md5_stats_t *p_stats,
- work_shard_t num_work_shards);
- int add_disk_rec_from_bucket_idx(disk_block_array_t &disk_arr,
- const rgw::sal::Bucket *p_bucket,
- const parsed_etag_t *p_parsed_etag,
- const std::string &obj_name,
- uint64_t obj_size,
- const std::string &storage_class);
-
- int add_record_to_dedup_table(dedup_table_t *p_table,
- const struct disk_record_t *p_rec,
- disk_block_id_t block_id,
- record_id_t rec_id,
- md5_stats_t *p_stats,
- remapper_t *remapper);
-
- int process_all_slabs(dedup_table_t *p_table,
- dedup_step_t step,
- md5_shard_t md5_shard,
- work_shard_t work_shard,
- uint32_t *p_seq_count,
- md5_stats_t *p_stats /* IN-OUT */,
- disk_block_seq_t *p_disk_block_arr,
- remapper_t *remapper);
-
-#ifdef FULL_DEDUP_SUPPORT
- int calc_object_blake3(const disk_record_t *p_rec, uint8_t *p_hash);
- int add_obj_attrs_to_record(rgw_bucket *p_rb,
- disk_record_t *p_rec,
- const rgw::sal::Attrs &attrs,
- dedup_table_t *p_table,
- md5_stats_t *p_stats); /* IN-OUT */
-
- int read_object_attribute(dedup_table_t *p_table,
- disk_record_t *p_rec,
- disk_block_id_t block_id,
- record_id_t rec_id,
- md5_shard_t md5_shard,
- md5_stats_t *p_stats /* IN-OUT */,
- disk_block_seq_t *p_disk,
- remapper_t *remapper);
- int try_deduping_record(dedup_table_t *p_table,
- const disk_record_t *p_rec,
- disk_block_id_t block_id,
- record_id_t rec_id,
- md5_shard_t md5_shard,
- md5_stats_t *p_stats, /* IN-OUT */
- remapper_t *remapper);
- int inc_ref_count_by_manifest(const std::string &ref_tag,
- const std::string &oid,
- RGWObjManifest &manifest);
- int rollback_ref_by_manifest(const std::string &ref_tag,
- const std::string &oid,
- RGWObjManifest &tgt_manifest);
- int free_tail_objs_by_manifest(const std::string &ref_tag,
- const std::string &oid,
- RGWObjManifest &tgt_manifest);
- int dedup_object(const disk_record_t *p_src_rec,
- const disk_record_t *p_tgt_rec,
- md5_stats_t *p_stats,
- bool is_shared_manifest_src);
-#endif
- int remove_slabs(unsigned worker_id, unsigned md5_shard, uint32_t slab_count);
- int init_rados_access_handles(bool init_pool);
-
- // private data members
- rgw::sal::Driver* driver = nullptr;
- rgw::sal::RadosStore* store = nullptr;
- RGWRados* rados = nullptr;
- librados::Rados* rados_handle = nullptr;
- const DoutPrefix dp;
- const DoutPrefixProvider* const dpp;
- CephContext* const cct;
- cluster d_cluster;
- librados::IoCtx d_dedup_cluster_ioctx;
- utime_t d_heart_beat_last_update;
- unsigned d_heart_beat_max_elapsed_sec;
-
- // A pool with 6 billion objects has a 1/(2^64) chance for collison with a 128bit MD5
- uint64_t d_max_protected_objects = (6ULL * 1024 * 1024 * 1024);
- uint64_t d_all_buckets_obj_count = 0;
- uint64_t d_all_buckets_obj_size = 0;
- // we don't benefit from deduping RGW objects smaller than head-object size
- uint32_t d_min_obj_size_for_dedup = (4ULL * 1024 * 1024);
- uint32_t d_head_object_size = (4ULL * 1024 * 1024);
- control_t d_ctl;
- uint64_t d_watch_handle = 0;
- DedupWatcher d_watcher_ctx;
-
- std::thread d_runner;
- std::mutex d_cond_mutex;
- std::condition_variable d_cond;
- };
-
-} //namespace rgw::dedup
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "rgw_dedup_cluster.h"
-#include "rgw_dedup.h"
-#include "rgw_dedup_epoch.h"
-#include "rgw_common.h"
-#include "rgw_dedup_store.h"
-#include "include/rados/rados_types.hpp"
-#include "include/rados/buffer.h"
-#include "include/rados/librados.hpp"
-#include "svc_zone.h"
-#include "common/Clock.h" // for ceph_clock_now()
-#include "common/config.h"
-#include "common/Cond.h"
-#include "common/debug.h"
-#include "common/errno.h"
-#include "rgw_common.h"
-#include "include/denc.h"
-#include "rgw_sal.h"
-#include "driver/rados/rgw_sal_rados.h"
-#include <cstdlib>
-#include <ctime>
-#include <string>
-
-namespace rgw::dedup {
- const char* DEDUP_EPOCH_TOKEN = "EPOCH_TOKEN";
- const char* DEDUP_WATCH_OBJ = "DEDUP_WATCH_OBJ";
-
- static constexpr unsigned EPOCH_MAX_LOCK_DURATION_SEC = 30;
- struct shard_progress_t;
- static int collect_shard_stats(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- utime_t epoch_time,
- unsigned shards_count,
- const char *prefix,
- bufferlist bl_arr[],
- struct shard_progress_t *sp_arr);
-
- const uint64_t SP_ALL_OBJECTS = ULLONG_MAX;
- const uint64_t SP_NO_OBJECTS = 0ULL;
- const char* SHARD_PROGRESS_ATTR = "shard_progress";
-
- //---------------------------------------------------------------------------
- static int get_control_ioctx(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- librados::IoCtx &ctl_ioctx /* OUT-PARAM */)
- {
- const auto& control_pool = store->svc()->zone->get_zone_params().control_pool;
- auto rados_handle = store->getRados()->get_rados_handle();
- int ret = rgw_init_ioctx(dpp, rados_handle, control_pool, ctl_ioctx);
- if (unlikely(ret < 0)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed rgw_init_ioctx() for control_pool ret="
- << ret << "::" << cpp_strerror(-ret) << dendl;
- }
- return ret;
- }
-
- //---------------------------------------------------------------------------
- static int get_epoch(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- dedup_epoch_t *p_epoch, /* OUT */
- const char *caller)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- std::string oid(DEDUP_EPOCH_TOKEN);
- bufferlist bl;
- ret = ctl_ioctx.getxattr(oid, RGW_DEDUP_ATTR_EPOCH, bl);
- if (ret > 0) {
- try {
- auto p = bl.cbegin();
- decode(*p_epoch, p);
- }catch (const buffer::error&) {
- ldpp_dout(dpp, 0) << __func__ << "::failed epoch decode!" << dendl;
- return -EINVAL;
- }
- if (caller) {
- ldpp_dout(dpp, 10) << __func__ << "::"<< caller<< "::" << *p_epoch << dendl;
- }
- return 0;
- }
- else {
- // zero length read means no data
- if (ret == 0) {
- ret = -ENODATA;
- }
- ldpp_dout(dpp, 10) << __func__ << "::" << (caller ? caller : "")
- << "::failed ctl_ioctx.getxattr() with: "
- << cpp_strerror(-ret) << ", ret=" << ret << dendl;
- return ret;
- }
- }
-
- //---------------------------------------------------------------------------
- static int set_epoch(rgw::sal::RadosStore *store,
- const std::string &cluster_id,
- const DoutPrefixProvider *dpp,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- std::string oid(DEDUP_EPOCH_TOKEN);
- ldpp_dout(dpp, 10) << __func__ << "::oid=" << oid << dendl;
- bool exclusive = true; // block overwrite of old objects
- ret = ctl_ioctx.create(oid, exclusive);
- if (ret >= 0) {
- ldpp_dout(dpp, 10) << __func__ << "::successfully created Epoch object!" << dendl;
- // now try and take ownership
- }
- else if (ret == -EEXIST) {
- ldpp_dout(dpp, 10) << __func__ << "::Epoch object exists -> trying to take over" << dendl;
- // try and take ownership
- }
- else{
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to create " << oid
- <<" with: "<< cpp_strerror(-ret) << ", ret=" << ret <<dendl;
- return ret;
- }
-
- uint32_t serial = 0;
- dedup_req_type_t dedup_type = dedup_req_type_t::DEDUP_TYPE_ESTIMATE;
- dedup_epoch_t new_epoch = { serial, dedup_type, ceph_clock_now(),
- num_work_shards, num_md5_shards };
- bufferlist new_epoch_bl, empty_bl;
- encode(new_epoch, new_epoch_bl);
- librados::ObjectWriteOperation op;
- op.cmpxattr(RGW_DEDUP_ATTR_EPOCH, CEPH_OSD_CMPXATTR_OP_EQ, empty_bl);
- op.setxattr(RGW_DEDUP_ATTR_EPOCH, new_epoch_bl);
-
- ldpp_dout(dpp, 10) << __func__ << "::send EPOCH CLS" << dendl;
- ret = ctl_ioctx.operate(oid, &op);
- if (ret == 0) {
- ldpp_dout(dpp, 10) << __func__ << "::Epoch object was written" << dendl;
- }
- // TBD: must check for failure caused by an existing EPOCH xattr!
- // probably best to read attribute from epoch!
- else if (ret == -ECANCELED) {
- dedup_epoch_t epoch;
- ret = get_epoch(store, dpp, &epoch, __func__);
- if (ret == 0) {
- ldpp_dout(dpp, 10) << __func__ << "::Accept existing Epoch object" << dendl;
- }
- return ret;
- }
- else {
- ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ctl_ioctx.operate("
- << oid << "), err is " << cpp_strerror(-ret) << dendl;
- }
- return ret;
- }
-
- //---------------------------------------------------------------------------
- static int swap_epoch(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- const dedup_epoch_t *p_old_epoch,
- dedup_req_type_t dedup_type,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- dedup_epoch_t new_epoch = { p_old_epoch->serial + 1, dedup_type,
- ceph_clock_now(), num_work_shards, num_md5_shards};
- bufferlist old_epoch_bl, new_epoch_bl, err_bl;
- encode(*p_old_epoch, old_epoch_bl);
- encode(new_epoch, new_epoch_bl);
- librados::ObjectWriteOperation op;
- op.cmpxattr(RGW_DEDUP_ATTR_EPOCH, CEPH_OSD_CMPXATTR_OP_EQ, old_epoch_bl);
- op.setxattr(RGW_DEDUP_ATTR_EPOCH, new_epoch_bl);
-
- ldpp_dout(dpp, 10) << __func__ << "::send EPOCH CLS" << dendl;
- std::string oid(DEDUP_EPOCH_TOKEN);
- ret = ctl_ioctx.operate(oid, &op);
- if (ret != 0) {
- ldpp_dout(dpp, 5) << __func__ << "::ERR: failed ctl_ioctx.operate("
- << oid << "), err is " << cpp_strerror(-ret) << dendl;
- }
-
- return ret;
- }
-
- //---------------------------------------------------------------------------
- struct shard_progress_t {
- shard_progress_t() {
- // init an empty object
- this->progress_a = SP_NO_OBJECTS;
- this->progress_b = SP_NO_OBJECTS;
- this->completed = false;
-
- // set all timers to now
- this->creation_time = utime_t();
- this->completion_time = utime_t();
- this->update_time = utime_t();
-
- // owner and stats_bl are empty until set
- }
-
- shard_progress_t(uint64_t _progress_a,
- uint64_t _progress_b,
- bool _completed,
- const std::string &_owner,
- const bufferlist &_stats_bl) : owner(_owner), stats_bl(_stats_bl) {
- this->progress_a = _progress_a;
- this->progress_b = _progress_b;
- this->completed = _completed;
-
- utime_t now = ceph_clock_now();
- this->update_time = now;
-
- if (_progress_a == SP_NO_OBJECTS && _progress_b == SP_NO_OBJECTS) {
- this->creation_time = now;
- }
- if (_completed) {
- this->completion_time = now;
- }
- }
-
- bool is_completed() const {
- if (this->progress_b == SP_ALL_OBJECTS) {
- ceph_assert(this->completed);
- return true;
- }
- else {
- ceph_assert(!this->completed);
- return false;
- }
- }
-
- bool was_not_started() const {
- return (this->creation_time == this->update_time);
- }
-
- uint64_t progress_a;
- uint64_t progress_b;
- bool completed;
- utime_t update_time;
- utime_t creation_time;
- utime_t completion_time;
- std::string owner;
- bufferlist stats_bl;
- };
-
- //---------------------------------------------------------------------------
- std::ostream& operator<<(std::ostream &out, shard_progress_t& sp)
- {
- out << (sp.completed ? " + ::" : " - ::");
- out << sp.owner << "::[" << sp.progress_a << ", " << sp.progress_b << "]";
- out << "::creation: " << sp.creation_time;
- out << "::update: " << sp.update_time;
- out << "::completion: " << sp.completion_time;
- return out;
- }
-
- //---------------------------------------------------------------------------
- void encode(const shard_progress_t& sp, ceph::bufferlist& bl)
- {
- ENCODE_START(1, 1, bl);
- encode(sp.progress_a, bl);
- encode(sp.progress_b, bl);
- encode(sp.completed, bl);
- encode(sp.creation_time, bl);
- encode(sp.completion_time, bl);
- encode(sp.update_time, bl);
- encode(sp.owner, bl);
- encode(sp.stats_bl, bl);
- ENCODE_FINISH(bl);
- }
-
- //---------------------------------------------------------------------------
- void decode(shard_progress_t & sp, ceph::bufferlist::const_iterator& bl)
- {
- DECODE_START(1, bl);
- decode(sp.progress_a, bl);
- decode(sp.progress_b, bl);
- decode(sp.completed, bl);
- decode(sp.creation_time, bl);
- decode(sp.completion_time, bl);
- decode(sp.update_time, bl);
- decode(sp.owner, bl);
- decode(sp.stats_bl, bl);
- DECODE_FINISH(bl);
- }
-
- //==========================================================================
-
- //---------------------------------------------------------------------------
- void cluster::clear()
- {
- d_curr_md5_shard = 0;
- d_curr_worker_shard = 0;
-
- d_num_completed_workers = 0;
- d_num_completed_md5 = 0;
-
- memset(d_completed_workers, TOKEN_STATE_PENDING, sizeof(d_completed_workers));
- memset(d_completed_md5, TOKEN_STATE_PENDING, sizeof(d_completed_md5));
- }
-
-
- static constexpr auto COOKIE_LEN = 15;
- static constexpr auto CLUSTER_ID_LEN = 15;
- //---------------------------------------------------------------------------
- cluster::cluster(const DoutPrefixProvider *_dpp,
- CephContext *cct,
- rgw::sal::Driver* driver):
- dpp(_dpp),
- d_lock_cookie(gen_rand_alphanumeric(cct, COOKIE_LEN)),
- d_cluster_id (gen_rand_alphanumeric(cct, CLUSTER_ID_LEN))
- {
- clear();
- }
-
- //---------------------------------------------------------------------------
- int cluster::reset(rgw::sal::RadosStore *store,
- dedup_epoch_t *p_epoch,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards)
- {
- ldpp_dout(dpp, 10) << __func__ << "::REQ num_work_shards=" << num_work_shards
- << "::num_md5_shards=" << num_md5_shards << dendl;
- clear();
-
- while (true) {
- int ret = get_epoch(store, dpp, p_epoch, __func__);
- if (ret != 0) {
- return ret;
- }
- if (p_epoch->num_work_shards && p_epoch->num_md5_shards) {
- ldpp_dout(dpp, 10) << __func__ << "::ACC num_work_shards=" << p_epoch->num_work_shards
- << "::num_md5_shards=" << p_epoch->num_md5_shards << dendl;
- break;
- }
- else if (!num_work_shards && !num_md5_shards) {
- ldpp_dout(dpp, 10) << __func__ << "::Init flow, no need to wait" << dendl;
- break;
- }
- else {
- ret = swap_epoch(store, dpp, p_epoch,
- static_cast<dedup_req_type_t> (p_epoch->dedup_type),
- num_work_shards, num_md5_shards);
- }
- }
-
- d_epoch_time = p_epoch->time;
- // retry cleanup 3 times before declaring failure
- const unsigned RETRY_LIMIT = 3;
- int ret = 1;
- for (unsigned i = 0; i < RETRY_LIMIT && ret != 0; i++) {
- ret = cleanup_prev_run(store);
- }
- if (ret != 0) {
- return ret;
- }
-
- create_shard_tokens(store, p_epoch->num_work_shards, WORKER_SHARD_PREFIX);
- create_shard_tokens(store, p_epoch->num_md5_shards, MD5_SHARD_PREFIX);
-
- ret = verify_all_shard_tokens(store, p_epoch->num_work_shards,
- WORKER_SHARD_PREFIX);
- if (ret != 0) {
- return ret;
- }
- return verify_all_shard_tokens(store, p_epoch->num_md5_shards,
- MD5_SHARD_PREFIX);
- }
-
- //---------------------------------------------------------------------------
- int cluster::cleanup_prev_run(rgw::sal::RadosStore *store)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- int error_code = 0;
- constexpr uint32_t max = 100;
- std::string marker;
- bool truncated = false;
- rgw::AccessListFilter filter{};
- unsigned deleted_count = 0, skipped_count = 0;
- unsigned failed_count = 0, no_entry_count = 0;
- do {
- std::vector<std::string> oids;
- int ret = rgw_list_pool(dpp, ctl_ioctx, max, filter, marker, &oids, &truncated);
- if (ret == -ENOENT) {
- ldpp_dout(dpp, 10) << __func__ << "::rgw_list_pool() ret == -ENOENT"<< dendl;
- break;
- }
- else if (ret < 0) {
- ldpp_dout(dpp, 1) << "failed rgw_list_pool()! ret=" << ret
- << "::" << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- for (const std::string& oid : oids) {
- if (shard_token_oid::legal_oid_name(oid) == false) {
- ldpp_dout(dpp, 10) << __func__ << "::skipping " << oid << dendl;
- skipped_count++;
- continue;
- }
-
- uint64_t size;
- struct timespec tspec;
- ret = ctl_ioctx.stat2(oid, &size, &tspec);
- if (ret == -ENOENT) {
- ldpp_dout(dpp, 20) << __func__ << "::" << oid
- << " was removed by others" << dendl;
- no_entry_count++;
- continue;
- }
- else if (ret != 0) {
- ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( "
- << oid << " )" << dendl;
- error_code = ret;
- failed_count++;
- continue;
- }
- utime_t mtime(tspec);
- if (d_epoch_time < mtime) {
- ldpp_dout(dpp, 10) << __func__ << "::skipping new obj! "
- << "::EPOCH={" << d_epoch_time.tv.tv_sec << ":" << d_epoch_time.tv.tv_nsec << "} "
- << "::mtime={" << mtime.tv.tv_sec << ":" << mtime.tv.tv_nsec << "}" << dendl;
- skipped_count++;
- continue;
- }
- ldpp_dout(dpp, 10) << __func__ << "::removing object: " << oid << dendl;
- ret = ctl_ioctx.remove(oid);
- if (ret == 0) {
- deleted_count++;
- }
- else if (ret == -ENOENT) {
- ldpp_dout(dpp, 20) << __func__ << "::" << oid
- << " was removed by others" << dendl;
- no_entry_count++;
- continue;
- }
- else {
- error_code = ret;
- failed_count++;
- ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.remove( " << oid
- << " ), ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
- }
- }
- ldpp_dout(dpp, 10) << __func__ << "::oids.size()=" << oids.size()
- << "::deleted=" << deleted_count
- << "::failed=" << failed_count
- << "::no entry=" << no_entry_count
- << "::skipped=" << skipped_count << dendl;
- } while (truncated);
-
- return error_code;
- }
-
- //---------------------------------------------------------------------------
- int cluster::create_shard_tokens(rgw::sal::RadosStore *store,
- unsigned shards_count,
- const char *prefix)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- shard_token_oid sto(prefix);
- for (unsigned shard = 0; shard < shards_count; shard++) {
- sto.set_shard(shard);
- std::string oid(sto.get_buff(), sto.get_buff_size());
- ldpp_dout(dpp, 15) << __func__ << "::creating object: " << oid << dendl;
- bool exclusive = true;
- ret = ctl_ioctx.create(oid, exclusive);
- if (ret >= 0) {
- ldpp_dout(dpp, 15) << __func__ << "::oid=" << oid << " was created!" << dendl;
- }
- else if (ret == -EEXIST) {
- ldpp_dout(dpp, 15) << __func__ << "::failed ctl_ioctx.create("
- << oid << ") -EEXIST!" << dendl;
- }
- else {
- // TBD: can it happen legally ?
- ldpp_dout(dpp, 1) << __func__ << "::failed ctl_ioctx.create(" << oid
- << ") with: " << ret << "::" << cpp_strerror(-ret) << dendl;
- }
- }
-
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int cluster::verify_all_shard_tokens(rgw::sal::RadosStore *store,
- unsigned shards_count,
- const char *prefix)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- shard_token_oid sto(prefix);
- for (unsigned shard = 0; shard < shards_count; shard++) {
- sto.set_shard(shard);
- std::string oid(sto.get_buff(), sto.get_buff_size());
- ldpp_dout(dpp, 10) << __func__ << "::checking object: " << oid << dendl;
-
- uint64_t size;
- struct timespec tspec;
- ret = ctl_ioctx.stat2(oid, &size, &tspec);
- if (ret != 0) {
- ldpp_dout(dpp, 5) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )"
- << "::shards_count=" << shards_count << dendl;
- return ret;
- }
- }
-
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int cluster::update_shard_token_heartbeat(rgw::sal::RadosStore *store,
- unsigned shard,
- uint64_t count_a,
- uint64_t count_b,
- const char *prefix)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- shard_token_oid sto(prefix, shard);
- std::string oid(sto.get_buff(), sto.get_buff_size());
- bufferlist empty_bl;
- shard_progress_t sp(count_a, count_b, false, d_cluster_id, empty_bl);
- sp.creation_time = d_token_creation_time;
- bufferlist sp_bl;
- encode(sp, sp_bl);
- return ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
- }
-
- //---------------------------------------------------------------------------
- int cluster::mark_shard_token_completed(rgw::sal::RadosStore *store,
- unsigned shard,
- uint64_t obj_count,
- const char *prefix,
- const bufferlist &bl)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- shard_token_oid sto(prefix, shard);
- std::string oid(sto.get_buff(), sto.get_buff_size());
- ldpp_dout(dpp, 10) << __func__ << "::" << prefix << "::" << oid << dendl;
-
- shard_progress_t sp(obj_count, SP_ALL_OBJECTS, true, d_cluster_id, bl);
- sp.creation_time = d_token_creation_time;
- bufferlist sp_bl;
- encode(sp, sp_bl);
- ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
- if (ret == 0) {
- ldpp_dout(dpp, 10) << __func__ << "::Done ctl_ioctx.setxattr(" << oid << ")"
- << dendl;
- }
- else {
- ldpp_dout(dpp, 0) << __func__ << "::Failed ctl_ioctx.setxattr(" << oid
- << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
- }
-
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int32_t cluster::get_next_shard_token(rgw::sal::RadosStore *store,
- uint16_t start_shard,
- uint16_t max_shard,
- const char *prefix)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- // lock paramters:
- const utime_t lock_duration; // zero duration means lock doesn't expire
- const uint8_t lock_flags = 0; // no flags
- const std::string lock_tag; // no tag
-
- shard_token_oid sto(prefix);
- for (auto shard = start_shard; shard < max_shard; shard++) {
- sto.set_shard(shard);
- std::string oid(sto.get_buff(), sto.get_buff_size());
- ldpp_dout(dpp, 10) << __func__ << "::try garbbing " << oid << dendl;
- librados::ObjectWriteOperation op;
- op.assert_exists();
- rados::cls::lock::lock(&op, oid, ClsLockType::EXCLUSIVE, d_lock_cookie,
- lock_tag, "dedup_shard_token", lock_duration, lock_flags);
- ret = rgw_rados_operate(dpp, ctl_ioctx, oid, std::move(op), null_yield);
- if (ret == -EBUSY) {
- // someone else took this token -> move to the next one
- ldpp_dout(dpp, 10) << __func__ << "::Failed lock. " << oid <<
- " is owned by other rgw" << dendl;
- continue;
- }
- else if (ret == -ENOENT) {
- // token is deleted - processing will stop the next time we try to read from the queue
- ldpp_dout(dpp, 5) << __func__ << "::" << oid
- << " token doesn't exist, fail lock!" << dendl;
- continue;
- }
- else if (ret < 0) {
- // failed to lock for another reason, continue to process other queues
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: failed to lock token: " << oid
- << ":: ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
- //has_error = true;
- continue;
- }
- ldpp_dout(dpp, 10) << __func__ << "::successfully locked " << oid << dendl;
- bufferlist empty_bl;
- shard_progress_t sp(SP_NO_OBJECTS, SP_NO_OBJECTS, false, d_cluster_id, empty_bl);
- d_token_creation_time = sp.creation_time;
- bufferlist sp_bl;
- encode(sp, sp_bl);
- ret = ctl_ioctx.setxattr(oid, SHARD_PROGRESS_ATTR, sp_bl);
- if (ret == 0) {
- ldpp_dout(dpp, 10) << __func__ << "::SUCCESS!::" << oid << dendl;
- return shard;
- }
- }
-
- return NULL_SHARD;
- }
-
- //---------------------------------------------------------------------------
- work_shard_t cluster::get_next_work_shard_token(rgw::sal::RadosStore *store,
- work_shard_t num_work_shards)
- {
- int32_t shard = get_next_shard_token(store, d_curr_worker_shard,
- num_work_shards, WORKER_SHARD_PREFIX);
- if (shard >= 0 && shard < num_work_shards) {
- d_curr_worker_shard = shard + 1;
- return shard;
- }
- else {
- return NULL_WORK_SHARD;
- }
- }
-
- //---------------------------------------------------------------------------
- md5_shard_t cluster::get_next_md5_shard_token(rgw::sal::RadosStore *store,
- md5_shard_t num_md5_shards)
- {
- int32_t shard = get_next_shard_token(store, d_curr_md5_shard, num_md5_shards,
- MD5_SHARD_PREFIX);
- if (shard >= 0 && shard < num_md5_shards) {
- d_curr_md5_shard = shard + 1;
- return shard;
- }
- else {
- return NULL_MD5_SHARD;
- }
- }
-
- //---------------------------------------------------------------------------
- int cluster::all_shard_tokens_completed(rgw::sal::RadosStore *store,
- unsigned shards_count,
- const char *prefix,
- uint16_t *p_num_completed,
- uint8_t completed_arr[])
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- int err_code = 0;
- unsigned count = 0;
- shard_token_oid sto(prefix);
- for (unsigned shard = 0; shard < shards_count; shard++) {
- if (completed_arr[shard] == TOKEN_STATE_COMPLETED) {
- count++;
- continue;
- }
-
- sto.set_shard(shard);
- std::string oid(sto.get_buff(), sto.get_buff_size());
- ldpp_dout(dpp, 10) << __func__ << "::checking object: " << oid << dendl;
- bufferlist bl;
- ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
- if (unlikely(ret <= 0)) {
- if (ret != -ENODATA) {
- ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.getxattr() ret="
- << ret << "::" << cpp_strerror(-ret) << dendl;
- }
- completed_arr[shard] = TOKEN_STATE_CORRUPTED;
- // all failures to get valid token state return ENODATA
- err_code = -ENODATA;
- continue;
- }
-
- shard_progress_t sp;
- try {
- auto p = bl.cbegin();
- decode(sp, p);
- }
- catch (const buffer::error&) {
- ldpp_dout(dpp, 1) << __func__ << "::failed shard_progress_t decode!" << dendl;
- completed_arr[shard] = TOKEN_STATE_CORRUPTED;
- // all failures to get valid token state return ENODATA
- err_code = -ENODATA;
- continue;
- }
-
- if (sp.is_completed()) {
- utime_t duration = sp.completion_time - sp.creation_time;
- // mark token completed;
- (*p_num_completed)++;
- completed_arr[shard] = TOKEN_STATE_COMPLETED;
- ldpp_dout(dpp, 20) << __func__ << "::" << oid
- << "::completed! duration=" << duration << dendl;
- count++;
- }
- else if (sp.was_not_started()) {
- // token was not started yet
- // TBD:
- // If it is not locked we can process it (by why we skipped it)??
- // If locked, check when it was done and if timed-out
- ldpp_dout(dpp, 10) << __func__ << "::" << oid
- << "::was not started, skipping" << dendl;
- return -EAGAIN;
- }
- else {
- static const utime_t heartbeat_timeout(EPOCH_MAX_LOCK_DURATION_SEC, 0);
- utime_t time_elapsed = ceph_clock_now() - sp.update_time;
- if (time_elapsed > heartbeat_timeout) {
- // lock expired -> try and break lock
- ldpp_dout(dpp, 5) << __func__ << "::" << oid
- << "::expired lock, skipping:" << time_elapsed
- << "::" << sp << dendl;
- completed_arr[shard] = TOKEN_STATE_TIMED_OUT;
- err_code = -ETIME;
- continue;
- }
- else {
- return -EAGAIN;
- }
- }
- } // loop
-
- if (count < shards_count) {
- unsigned n = shards_count - count;
- ldpp_dout(dpp, 10) << __func__ << "::waiting for " << n << " tokens" << dendl;
- }
- return err_code;
- }
-
- //---------------------------------------------------------------------------
- static int collect_shard_stats(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- utime_t epoch_time,
- unsigned shards_count,
- const char *prefix,
- bufferlist bl_arr[],
- shard_progress_t *sp_arr)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- unsigned count = 0;
- cluster::shard_token_oid sto(prefix);
- for (unsigned shard = 0; shard < shards_count; shard++) {
- sto.set_shard(shard);
- std::string oid(sto.get_buff(), sto.get_buff_size());
- ldpp_dout(dpp, 20) << __func__ << "::checking object: " << oid << dendl;
-
- uint64_t size;
- struct timespec tspec;
- if (ctl_ioctx.stat2(oid, &size, &tspec) != 0) {
- ldpp_dout(dpp, 10) << __func__ << "::failed ctl_ioctx.stat( " << oid << " )"
- << "::shards_count=" << shards_count << dendl;
- continue;
- }
- utime_t mtime(tspec);
- if (epoch_time > mtime) {
- ldpp_dout(dpp, 10) << __func__ << "::skipping old obj! "
- << "::EPOCH={" << epoch_time.tv.tv_sec << ":" << epoch_time.tv.tv_nsec << "} "
- << "::mtime={" << mtime.tv.tv_sec << ":" << mtime.tv.tv_nsec << "}" << dendl;
- continue;
- }
-
- shard_progress_t sp;
- bufferlist bl;
- ret = ctl_ioctx.getxattr(oid, SHARD_PROGRESS_ATTR, bl);
- if (ret > 0) {
- try {
- auto p = bl.cbegin();
- decode(sp, p);
- sp_arr[shard] = sp;
- count++;
- }
- catch (const buffer::error&) {
- ldpp_dout(dpp, 10) << __func__ << "::(1)failed shard_progress_t decode!" << dendl;
- return -EINVAL;
- }
- }
- else if (ret != -ENODATA) {
- ldpp_dout(dpp, 10) << __func__ << "::" << oid << "::failed getxattr() ret="
- << ret << "::" << cpp_strerror(-ret) << dendl;
- continue;
- }
- bl_arr[shard] = sp.stats_bl;
- }
-
- if (count != shards_count) {
- ldpp_dout(dpp, 10) << __func__ << "::missing shards stats! we got "
- << count << " / " << shards_count << dendl;
- }
-
- return count;
- }
-
- struct member_time_t {
- utime_t start_time;
- utime_t end_time;
- utime_t aggregated_time;
- };
-
- //---------------------------------------------------------------------------
- static void collect_single_shard_stats(const DoutPrefixProvider *dpp,
- std::map<std::string, member_time_t> &owner_map,
- const shard_progress_t sp_arr[],
- unsigned shard,
- bool *p_show_time,
- const char *name)
- {
- const utime_t null_time;
- const shard_progress_t &sp = sp_arr[shard];
- if (sp.creation_time == null_time || sp.completion_time == null_time) {
- *p_show_time = false;
- return;
- }
-
- const std::string &owner = sp.owner;
- utime_t duration = sp.completion_time - sp.creation_time;
- if (owner_map.find(owner) != owner_map.end()) {
- owner_map[owner].aggregated_time += duration;
- owner_map[owner].end_time = sp.completion_time;
- }
- else {
- owner_map[owner].start_time = sp.creation_time;
- owner_map[owner].aggregated_time = duration;
- owner_map[owner].end_time = sp.completion_time;
- }
- ldpp_dout(dpp, 10) << __func__ << "::Got " << name
- << " stats for shard #" << shard << dendl;
- }
-
- //---------------------------------------------------------------------------
- static void show_incomplete_shards_fmt(bool has_incomplete_shards,
- unsigned num_shards,
- const shard_progress_t sp_arr[],
- Formatter *fmt)
-
- {
- if (!has_incomplete_shards) {
- return;
- }
- Formatter::ArraySection array_section{*fmt, "incomplete_shards"};
- for (unsigned shard = 0; shard < num_shards; shard++) {
- if (sp_arr[shard].is_completed() ) {
- continue;
- }
- Formatter::ObjectSection object_section{*fmt, "shard_progress"};
- fmt->dump_unsigned("shard_id", shard);
- fmt->dump_string("owner", sp_arr[shard].owner);
- fmt->dump_unsigned("progress_a", sp_arr[shard].progress_a);
- fmt->dump_unsigned("progress_b", sp_arr[shard].progress_b);
- fmt->dump_stream("last updated") << sp_arr[shard].update_time;
- }
- }
-
- //---------------------------------------------------------------------------
- static utime_t show_time_func_fmt(const utime_t &start_time,
- bool show_time,
- const std::map<std::string, member_time_t> &owner_map,
- Formatter *fmt)
- {
- member_time_t all_members_time;
- all_members_time.start_time = start_time;
- all_members_time.end_time = start_time;
- all_members_time.aggregated_time = utime_t();
-
- Formatter::ObjectSection section{*fmt, "time"};
- {
- Formatter::ArraySection array_section{*fmt, "per-shard time"};
- for (const auto& [owner, value] : owner_map) {
- uint32_t sec = value.end_time.tv.tv_sec - value.start_time.tv.tv_sec;
- fmt->dump_stream("member time")
- << owner << "::start time = [" << value.start_time.tv.tv_sec % 1000
- << ":" << value.start_time.tv.tv_nsec / (1000*1000) << "] "
- << "::aggregated time = " << value.aggregated_time.tv.tv_sec
- << "(" << sec << ") seconds";
- all_members_time.aggregated_time += value.aggregated_time;
- if (all_members_time.end_time < value.end_time) {
- all_members_time.end_time = value.end_time;
- }
- }
- }
-
- if (show_time) {
- uint32_t sec = all_members_time.end_time.tv.tv_sec - all_members_time.start_time.tv.tv_sec;
-
- Formatter::ObjectSection section{*fmt, "All shards time"};
- fmt->dump_stream("start time") << all_members_time.start_time;
- fmt->dump_stream("end time")
- << all_members_time.end_time << " (" << sec << " seconds total)";
- fmt->dump_unsigned("aggregated time (sec)", all_members_time.aggregated_time.tv.tv_sec);
- }
-
- return all_members_time.end_time;
- }
-
- //---------------------------------------------------------------------------
- static void show_dedup_ratio_estimate_fmt(const worker_stats_t &wrk_stats_sum,
- const md5_stats_t &md5_stats_sum,
- Formatter *fmt)
- {
- uint64_t s3_bytes_before = wrk_stats_sum.ingress_obj_bytes;
- uint64_t s3_dedup_bytes = md5_stats_sum.big_objs_stat.dedup_bytes_estimate;
- uint64_t s3_bytes_after = s3_bytes_before - s3_dedup_bytes;
- Formatter::ObjectSection section{*fmt, "dedup_ratio_estimate"};
- fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
- fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
- fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes_estimate);
-
- if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
- double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
- fmt->dump_float("dedup_ratio", dedup_ratio);
- }
- else {
- fmt->dump_float("dedup_ratio", 0);
- }
- }
-
- //---------------------------------------------------------------------------
- static void show_dedup_ratio_actual_fmt(const worker_stats_t &wrk_stats_sum,
- const md5_stats_t &md5_stats_sum,
- Formatter *fmt)
- {
- uint64_t s3_bytes_before = wrk_stats_sum.ingress_obj_bytes;
- uint64_t s3_dedup_bytes = (md5_stats_sum.deduped_objects_bytes +
- md5_stats_sum.shared_manifest_dedup_bytes);
- uint64_t s3_bytes_after = s3_bytes_before - s3_dedup_bytes;
-
- Formatter::ObjectSection section{*fmt, "dedup_ratio_actual"};
- fmt->dump_unsigned("s3_bytes_before", s3_bytes_before);
- fmt->dump_unsigned("s3_bytes_after", s3_bytes_after);
- fmt->dump_unsigned("dup_head_bytes", md5_stats_sum.dup_head_bytes);
- if (s3_bytes_before > s3_bytes_after && s3_bytes_after) {
- double dedup_ratio = (double)s3_bytes_before/s3_bytes_after;
- fmt->dump_float("dedup_ratio", dedup_ratio);
- }
- else {
- fmt->dump_float("dedup_ratio", 0);
- }
- }
-
- //---------------------------------------------------------------------------
- // command-line called from radosgw-admin.cc
- int cluster::collect_all_shard_stats(rgw::sal::RadosStore *store,
- Formatter *fmt,
- const DoutPrefixProvider *dpp)
- {
- dedup_epoch_t epoch;
- int ret = get_epoch(store, dpp, &epoch, nullptr);
- if (ret != 0) {
- return ret;
- }
-
- Formatter::ObjectSection section{*fmt, "DEDUP STAT COUNTERS"};
- work_shard_t num_work_shards = epoch.num_work_shards;
- md5_shard_t num_md5_shards = epoch.num_md5_shards;
-
- unsigned completed_work_shards_count = 0;
- unsigned completed_md5_shards_count = 0;
- utime_t md5_start_time;
- worker_stats_t wrk_stats_sum;
- {
- std::map<std::string, member_time_t> owner_map;
- bool show_time = true;
- bufferlist bl_arr[num_work_shards];
- shard_progress_t sp_arr[num_work_shards];
- int cnt = collect_shard_stats(store, dpp, epoch.time, num_work_shards,
- WORKER_SHARD_PREFIX, bl_arr, sp_arr);
- if (cnt != num_work_shards && 0) {
- std::cerr << ">>>Partial work shard stats recived " << cnt << " / "
- << num_work_shards << "\n" << std::endl;
- }
- bool has_incomplete_shards = false;
- for (unsigned shard = 0; shard < num_work_shards; shard++) {
- if (bl_arr[shard].length() == 0) {
- has_incomplete_shards = true;
- continue;
- }
- completed_work_shards_count++;
- worker_stats_t stats;
- try {
- auto p = bl_arr[shard].cbegin();
- decode(stats, p);
- wrk_stats_sum += stats;
- }catch (const buffer::error&) {
- // TBD: can we use std::cerr or should we use formatter ??
- std::cerr << __func__ << "::(2)failed worker_stats_t decode #" << shard << std::endl;
- continue;
- }
- collect_single_shard_stats(dpp, owner_map, sp_arr, shard, &show_time, "WORKER");
- }
- Formatter::ObjectSection worker_stats(*fmt, "worker_stats");
- wrk_stats_sum.dump(fmt);
- show_incomplete_shards_fmt(has_incomplete_shards, num_work_shards, sp_arr, fmt);
- md5_start_time = show_time_func_fmt(epoch.time, show_time, owner_map, fmt);
- }
-
- if (completed_work_shards_count == num_work_shards) {
- std::map<std::string, member_time_t> owner_map;
- bool show_time = true;
- md5_stats_t md5_stats_sum;
- bufferlist bl_arr[num_md5_shards];
- shard_progress_t sp_arr[num_md5_shards];
- int cnt = collect_shard_stats(store, dpp, epoch.time, num_md5_shards,
- MD5_SHARD_PREFIX, bl_arr, sp_arr);
- if (cnt != num_md5_shards && 0) {
- std::cerr << ">>>Partial MD5_SHARD stats recived " << cnt << " / "
- << num_md5_shards << "\n" << std::endl;
- }
- bool has_incomplete_shards = false;
- for (unsigned shard = 0; shard < num_md5_shards; shard++) {
- if (bl_arr[shard].length() == 0) {
- has_incomplete_shards = true;
- continue;
- }
- completed_md5_shards_count++;
- md5_stats_t stats;
- try {
- auto p = bl_arr[shard].cbegin();
- decode(stats, p);
- md5_stats_sum += stats;
- }catch (const buffer::error&) {
- // TBD: can we use std::cerr or should we use formatter ??
- std::cerr << __func__ << "::failed md5_stats_t decode #" << shard << std::endl;
- continue;
- }
- collect_single_shard_stats(dpp, owner_map, sp_arr, shard, &show_time, "MD5");
- }
- {
- Formatter::ObjectSection outer(*fmt, "md5_stats");
- md5_stats_sum.dump(fmt);
- show_incomplete_shards_fmt(has_incomplete_shards, num_md5_shards, sp_arr, fmt);
- show_time_func_fmt(md5_start_time, show_time, owner_map, fmt);
- }
- show_dedup_ratio_estimate_fmt(wrk_stats_sum, md5_stats_sum, fmt);
- show_dedup_ratio_actual_fmt(wrk_stats_sum, md5_stats_sum, fmt);
- }
-
- fmt->dump_bool("completed", (completed_md5_shards_count == num_md5_shards));
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int cluster::watch_reload(rgw::sal::RadosStore *store,
- const DoutPrefixProvider* dpp,
- uint64_t *p_watch_handle,
- librados::WatchCtx2 *ctx)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- const std::string & oid = DEDUP_WATCH_OBJ;
- // create the object to watch (object may already exist)
- bool exclusive = true;
- ret = ctl_ioctx.create(oid, exclusive);
- if (ret >= 0) {
- ldpp_dout(dpp, 10) << "dedup_bg::watch_reload():" << oid
- << " was created!" << dendl;
- }
- else if (ret == -EEXIST) {
- ldpp_dout(dpp, 5) << __func__ << "::"<< oid << " exists" << dendl;
- }
- else {
- ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed ctl_ioctx.create("
- << oid << ") ret=" << ret << "::" << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- ret = ctl_ioctx.watch2(oid, p_watch_handle, ctx);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "dedup_bg::watch_reload(): failed watch2() " << oid
- << ". error: " << cpp_strerror(-ret) << dendl;
- *p_watch_handle = 0;
- return ret;
- }
- ldpp_dout(dpp, 5) << "dedup_bg::watch_reload(): Started watching "
- << oid << "::watch_handle=" << *p_watch_handle << dendl;
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int cluster::unwatch_reload(rgw::sal::RadosStore *store,
- const DoutPrefixProvider* dpp,
- uint64_t watch_handle)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- ret = ctl_ioctx.unwatch2(watch_handle);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << "dedup_bg::unwatch_reload() failed unwatch2() "
- << DEDUP_WATCH_OBJ << "::" << cpp_strerror(-ret) << dendl;
- return ret;
- }
- return 0;
- }
-
- //---------------------------------------------------------------------------
- int cluster::ack_notify(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- const control_t *p_ctl,
- uint64_t notify_id,
- uint64_t cookie,
- int status)
- {
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- ldpp_dout(dpp, 5) << __func__ << "::status=" << status << dendl;
- bufferlist reply_bl;
- ceph::encode(status, reply_bl);
- encode(*p_ctl, reply_bl);
- ctl_ioctx.notify_ack(DEDUP_WATCH_OBJ, notify_id, cookie, reply_bl);
-
- return 0;
- }
-
- //---------------------------------------------------------------------------
- // command-line called from radosgw-admin.cc
- int cluster::dedup_control(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- urgent_msg_t urgent_msg)
- {
- ldpp_dout(dpp, 10) << __func__ << "::dedup_control req = "
- << get_urgent_msg_names(urgent_msg) << dendl;
- if (urgent_msg != URGENT_MSG_RESUME &&
- urgent_msg != URGENT_MSG_PASUE &&
- urgent_msg != URGENT_MSG_RESTART &&
- urgent_msg != URGENT_MSG_ABORT) {
- ldpp_dout(dpp, 1) << __func__ << "::illegal urgent_msg="<< urgent_msg << dendl;
- return -EINVAL;
- }
-
- librados::IoCtx ctl_ioctx;
- int ret = get_control_ioctx(store, dpp, ctl_ioctx);
- if (unlikely(ret != 0)) {
- return ret;
- }
-
- // 10 seconds timeout
- const uint64_t timeout_ms = 10*1000;
- bufferlist reply_bl, urgent_msg_bl;
- ceph::encode(urgent_msg, urgent_msg_bl);
- ret = rgw_rados_notify(dpp, ctl_ioctx, DEDUP_WATCH_OBJ, urgent_msg_bl,
- timeout_ms, &reply_bl, null_yield);
- if (ret < 0) {
- ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify("
- << DEDUP_WATCH_OBJ << ")::err="<<cpp_strerror(-ret) << dendl;
- return ret;
- }
- std::vector<librados::notify_ack_t> acks;
- std::vector<librados::notify_timeout_t> timeouts;
- ctl_ioctx.decode_notify_response(reply_bl, &acks, &timeouts);
- if (timeouts.size() > 0) {
- ldpp_dout(dpp, 1) << __func__ << "::failed rgw_rados_notify("
- << DEDUP_WATCH_OBJ << ")::timeout error" << dendl;
- return -EAGAIN;
- }
-
- for (auto& ack : acks) {
- try {
- ldpp_dout(dpp, 20) << __func__ << "::ACK: notifier_id=" << ack.notifier_id
- << "::cookie=" << ack.cookie << dendl;
- auto iter = ack.payload_bl.cbegin();
- ceph::decode(ret, iter);
- struct rgw::dedup::control_t ctl;
- decode(ctl, iter);
- ldpp_dout(dpp, 10) << __func__ << "::++ACK::ctl=" << ctl << "::ret=" << ret << dendl;
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::failed decoding notify acks" << dendl;
- return -EINVAL;
- }
- if (ret != 0) {
- ldpp_dout(dpp, 1) << __func__ << "::Bad notify ack, ret=" << ret
- << "::err=" << cpp_strerror(-ret) << dendl;
- return ret;
- }
- }
- ldpp_dout(dpp, 10) << __func__ << "::" << get_urgent_msg_names(urgent_msg)
- << " finished successfully!" << dendl;
- return 0;
- }
-
- //---------------------------------------------------------------------------
- // command-line called from radosgw-admin.cc
- int cluster::dedup_restart_scan(rgw::sal::RadosStore *store,
- dedup_req_type_t dedup_type,
- const DoutPrefixProvider *dpp)
- {
- ldpp_dout(dpp, 1) << __func__ << "::dedup_type = " << dedup_type << dendl;
-
- dedup_epoch_t old_epoch;
- // store the previous epoch for cmp-swap
- int ret = get_epoch(store, dpp, &old_epoch, __func__);
- if (ret != 0) {
- // generate an empty epoch with zero counters
- std::string cluster_id("NULL_CLUSTER_ID");
- ldpp_dout(dpp, 1) << __func__ << "::set empty EPOCH using cluster_id: "
- << cluster_id << dendl;
- set_epoch(store, cluster_id, dpp, 0, 0);
- ret = get_epoch(store, dpp, &old_epoch, __func__);
- if (ret) {
- return ret;
- }
- }
-
- // first abort all dedup work!
- ret = dedup_control(store, dpp, URGENT_MSG_ABORT);
- if (ret != 0) {
- return ret;
- }
-#if 0
- // then delete dedup-pool to ensure a clean start
- const rgw_pool& dedup_pool = store->svc()->zone->get_zone_params().dedup_pool;
- auto rados_handle = store->getRados()->get_rados_handle();
- ldpp_dout(dpp, 5) <<__func__ << "::delete pool: " << dedup_pool.name << dendl;
- rados_handle->pool_delete(dedup_pool.name.c_str());
-#endif
-
- ldpp_dout(dpp, 10) << __func__ << dedup_type << dendl;
-#ifdef FULL_DEDUP_SUPPORT
- ceph_assert(dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE ||
- dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL);
-#else
- ceph_assert(dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE);
-#endif
- ret = swap_epoch(store, dpp, &old_epoch, dedup_type, 0, 0);
- if (ret == 0) {
- ldpp_dout(dpp, 10) << __func__ << "::Epoch object was reset" << dendl;
- return dedup_control(store, dpp, URGENT_MSG_RESTART);
- }
- else {
- return ret;
- }
- }
-
- //---------------------------------------------------------------------------
- bool cluster::can_start_new_scan(rgw::sal::RadosStore *store)
- {
- ldpp_dout(dpp, 10) << __func__ << "::epoch=" << d_epoch_time << dendl;
- dedup_epoch_t new_epoch;
- if (get_epoch(store, dpp, &new_epoch, nullptr) != 0) {
- ldpp_dout(dpp, 1) << __func__ << "::No Epoch Object::"
- << "::scan can be restarted!\n\n\n" << dendl;
- // no epoch object exists -> we should start a new scan
- return true;
- }
-
- if (new_epoch.time <= d_epoch_time) {
- if (new_epoch.time == d_epoch_time) {
- ldpp_dout(dpp, 10) << __func__ << "::Epoch hasn't change - > Do not restart scan!!" << dendl;
- }
- else {
- ldpp_dout(dpp, 1) << __func__ << " ::Do not restart scan!\n epoch="
- << d_epoch_time << "\nnew_epoch="<< new_epoch.time <<dendl;
- }
- return false;
- }
- // allow members to join within a 30 sec limit
- utime_t limit = {30, 0};
- utime_t now = ceph_clock_now();
- ldpp_dout(dpp, 1) << __func__ << "\n::new_epoch=" << new_epoch.time
- << "\n::now =" << now << dendl;
- if ((now > new_epoch.time) && ((now - new_epoch.time) < limit)) {
- ldpp_dout(dpp, 1) << __func__ << "::Epoch is less than 30 seconds old!"
- << " Restart scan\n\n\n" << dendl;
- return true;
- }
- ldpp_dout(dpp, 1) << "\n::new_epoch - now = " << (new_epoch.time - now)
- << "\n::limit = " << limit << dendl;
-
- if (new_epoch.time > now) {
- ldpp_dout(dpp, 1) << ":new_epoch > now = TRUE " << dendl;
- }
- return false;
- }
-} // namespace rgw::dedup
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-#include "common/dout.h"
-#include "rgw_dedup_utils.h"
-#include "rgw_dedup_store.h"
-#include <string>
-
-namespace rgw::dedup {
- static constexpr const char* WORKER_SHARD_PREFIX = "WRK.SHRD.TK.";
- static constexpr const char* MD5_SHARD_PREFIX = "MD5.SHRD.TK.";
- struct control_t;
- struct dedup_epoch_t;
-
- class cluster{
- public:
- //==================================================================================
- class shard_token_oid {
- public:
- //---------------------------------------------------------------------------
- shard_token_oid(const char *prefix) {
- this->prefix_len = snprintf(this->buff, BUFF_SIZE, "%s", prefix);
- this->total_len = this->prefix_len;
- }
-
- //---------------------------------------------------------------------------
- shard_token_oid(const char *prefix, uint16_t shard) {
- this->prefix_len = snprintf(this->buff, BUFF_SIZE, "%s", prefix);
- set_shard(shard);
- }
-
- //---------------------------------------------------------------------------
- void set_shard(uint16_t shard) {
- int n = snprintf(this->buff + this->prefix_len, BUFF_SIZE, "%03x", shard);
- this->total_len = this->prefix_len + n;
- }
-
- //---------------------------------------------------------------------------
- static bool legal_oid_name(const std::string& oid) {
- return ((oid.length() <= BUFF_SIZE) &&
- (oid.starts_with(WORKER_SHARD_PREFIX)||oid.starts_with(MD5_SHARD_PREFIX)));
- }
- inline const char* get_buff() { return this->buff; }
- inline unsigned get_buff_size() { return this->total_len; }
- private:
- static const unsigned BUFF_SIZE = 15;
- unsigned total_len = 0;
- unsigned prefix_len = 0;
- char buff[BUFF_SIZE];
- };
-
- //==================================================================================
- cluster(const DoutPrefixProvider *_dpp,
- CephContext* cct,
- rgw::sal::Driver* driver);
- int reset(rgw::sal::RadosStore *store,
- struct dedup_epoch_t*,
- work_shard_t num_work_shards,
- md5_shard_t num_md5_shards);
-
- utime_t get_epoch_time() { return d_epoch_time; }
- work_shard_t get_next_work_shard_token(rgw::sal::RadosStore *store,
- work_shard_t num_work_shards);
- md5_shard_t get_next_md5_shard_token(rgw::sal::RadosStore *store,
- md5_shard_t num_md5_shards);
- bool can_start_new_scan(rgw::sal::RadosStore *store);
- static int collect_all_shard_stats(rgw::sal::RadosStore *store,
- Formatter *p_formatter,
- const DoutPrefixProvider *dpp);
- static int watch_reload(rgw::sal::RadosStore *store,
- const DoutPrefixProvider* dpp,
- uint64_t *p_watch_handle,
- librados::WatchCtx2 *ctx);
- static int unwatch_reload(rgw::sal::RadosStore *store,
- const DoutPrefixProvider* dpp,
- uint64_t watch_handle);
- static int ack_notify(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- const struct control_t *p_ctl,
- uint64_t notify_id,
- uint64_t cookie,
- int status);
- static int dedup_control(rgw::sal::RadosStore *store,
- const DoutPrefixProvider *dpp,
- urgent_msg_t urgent_msg);
- static int dedup_restart_scan(rgw::sal::RadosStore *store,
- dedup_req_type_t dedup_type,
- const DoutPrefixProvider *dpp);
-
- //---------------------------------------------------------------------------
- int mark_work_shard_token_completed(rgw::sal::RadosStore *store,
- work_shard_t work_shard,
- const worker_stats_t *p_stats)
- {
- ceph::bufferlist bl;
- encode(*p_stats, bl);
- d_num_completed_workers++;
- d_completed_workers[work_shard] = TOKEN_STATE_COMPLETED;
-
- return mark_shard_token_completed(store, work_shard, p_stats->ingress_obj,
- WORKER_SHARD_PREFIX, bl);
- }
-
- //---------------------------------------------------------------------------
- int mark_md5_shard_token_completed(rgw::sal::RadosStore *store,
- md5_shard_t md5_shard,
- const md5_stats_t *p_stats)
- {
- ceph::bufferlist bl;
- encode(*p_stats, bl);
- d_num_completed_md5++;
- d_completed_md5[md5_shard] = TOKEN_STATE_COMPLETED;
- return mark_shard_token_completed(store, md5_shard, p_stats->loaded_objects,
- MD5_SHARD_PREFIX, bl);
- }
-
- int update_shard_token_heartbeat(rgw::sal::RadosStore *store,
- unsigned shard,
- uint64_t count_a,
- uint64_t count_b,
- const char *prefix);
-
- //---------------------------------------------------------------------------
- int all_work_shard_tokens_completed(rgw::sal::RadosStore *store,
- work_shard_t num_work_shards)
- {
- return all_shard_tokens_completed(store, num_work_shards, WORKER_SHARD_PREFIX,
- &d_num_completed_workers, d_completed_workers);
- }
-
- //---------------------------------------------------------------------------
- int all_md5_shard_tokens_completed(rgw::sal::RadosStore *store,
- md5_shard_t num_md5_shards)
- {
- return all_shard_tokens_completed(store, num_md5_shards, MD5_SHARD_PREFIX,
- &d_num_completed_md5, d_completed_md5);
- }
-
- private:
- static constexpr unsigned TOKEN_STATE_PENDING = 0x00;
- static constexpr unsigned TOKEN_STATE_CORRUPTED = 0xCC;
- static constexpr unsigned TOKEN_STATE_TIMED_OUT = 0xDD;
- static constexpr unsigned TOKEN_STATE_COMPLETED = 0xFF;
-
- void clear();
- int all_shard_tokens_completed(rgw::sal::RadosStore *store,
- unsigned shards_count,
- const char *prefix,
- uint16_t *p_num_completed,
- uint8_t completed_arr[]);
- int cleanup_prev_run(rgw::sal::RadosStore *store);
- int32_t get_next_shard_token(rgw::sal::RadosStore *store,
- uint16_t start_shard,
- uint16_t max_count,
- const char *prefix);
- int create_shard_tokens(rgw::sal::RadosStore *store,
- unsigned shards_count,
- const char *prefix);
- int verify_all_shard_tokens(rgw::sal::RadosStore *store,
- unsigned shards_count,
- const char *prefix);
- int mark_shard_token_completed(rgw::sal::RadosStore *store,
- unsigned shard,
- uint64_t obj_count,
- const char *prefix,
- const bufferlist &bl);
-
- const DoutPrefixProvider *dpp;
- std::string d_lock_cookie;
- std::string d_cluster_id;
- md5_shard_t d_curr_md5_shard = 0;
- work_shard_t d_curr_worker_shard = 0;
- utime_t d_epoch_time;
- utime_t d_token_creation_time;
- uint8_t d_completed_workers[MAX_WORK_SHARD];
- uint8_t d_completed_md5[MAX_MD5_SHARD];
- uint16_t d_num_completed_workers = 0;
- uint16_t d_num_completed_md5 = 0;
- };
-
-} //namespace rgw::dedup
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-
-#include "common/Clock.h" // for ceph_clock_now()
-#include "common/dout.h"
-#include "rgw_dedup_utils.h"
-
-#include <string>
-
-namespace rgw::dedup {
- constexpr const char* RGW_DEDUP_ATTR_EPOCH = "rgw.dedup.attr.epoch";
- //===========================================================================
-
- struct dedup_epoch_t {
- uint32_t serial;
- dedup_req_type_t dedup_type;
- utime_t time;
- uint32_t num_work_shards = 0;
- uint32_t num_md5_shards = 0;
- };
-
- //---------------------------------------------------------------------------
- inline void encode(const dedup_epoch_t& o, ceph::bufferlist& bl)
- {
- ENCODE_START(1, 1, bl);
- encode(o.serial, bl);
- encode(static_cast<int32_t>(o.dedup_type), bl);
- encode(o.time, bl);
- encode(o.num_work_shards, bl);
- encode(o.num_md5_shards, bl);
- ENCODE_FINISH(bl);
- }
-
- //---------------------------------------------------------------------------
- inline void decode(dedup_epoch_t& o, ceph::bufferlist::const_iterator& bl)
- {
- DECODE_START(1, bl);
- decode(o.serial, bl);
- int32_t dedup_type;
- decode(dedup_type, bl);
- o.dedup_type = static_cast<dedup_req_type_t> (dedup_type);
- decode(o.time, bl);
- decode(o.num_work_shards, bl);
- decode(o.num_md5_shards, bl);
- DECODE_FINISH(bl);
- }
-
- //---------------------------------------------------------------------------
- inline std::ostream& operator<<(std::ostream &out, const dedup_epoch_t &ep)
- {
- utime_t elapsed = ceph_clock_now() - ep.time;
- out << "EPOCH::Time={" << ep.time.tv.tv_sec <<":"<< ep.time.tv.tv_nsec << "}::";
- out << "Elapsed={" << elapsed.tv.tv_sec <<":"<< elapsed.tv.tv_nsec << "}::";
- out << ep.dedup_type << "::serial=" << ep.serial;
- out << "::num_work_shards=" << ep.num_work_shards;
- out << "::num_md5_shards=" << ep.num_md5_shards;
- return out;
- }
-
-} //namespace rgw::dedup
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-#include "common/dout.h"
-#include <unordered_map>
-#include <cstring>
-#include <string>
-
-
-namespace rgw::dedup {
- class remapper_t
- {
- public:
- static inline constexpr uint8_t NULL_IDX = 0xFF;
- remapper_t(uint32_t max_entries) : d_max_entries(max_entries) {}
- uint8_t remap(const std::string &key,
- const DoutPrefixProvider* dpp,
- uint64_t *p_overflow_count) { // IN-OUT
- uint8_t idx;
-
- auto itr = d_map.find(key);
- if (itr != d_map.end()) {
- idx = itr->second;
- ldpp_dout(dpp, 20) << __func__ << "::Existing key: " << key
- << " is mapped to idx=" << (int)idx << dendl;
- }
- else if (d_num_entries < d_max_entries) {
- // assign it the next entry
- idx = d_num_entries++;
- d_map[key] = idx;
- ldpp_dout(dpp, 20) << __func__ << "::New key: " << key
- << " was mapped to idx=" << (int)idx << dendl;
- }
- else {
- (*p_overflow_count) ++;
- ldpp_dout(dpp, 10) << __func__ << "::ERR: Failed adding key: "
- << key << dendl;
- idx = NULL_IDX;
- }
-
- return idx;
- }
-
- private:
- uint32_t d_num_entries = 0;
- const uint32_t d_max_entries;
- std::unordered_map<std::string, uint8_t> d_map;
- };
-
-} //namespace rgw::dedup
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "include/rados/rados_types.hpp"
-#include "include/rados/buffer.h"
-#include "include/rados/librados.hpp"
-#include "svc_zone.h"
-#include "common/config.h"
-#include "common/Cond.h"
-#include "common/debug.h"
-#include "common/errno.h"
-#include "rgw_common.h"
-#include "include/denc.h"
-#include "rgw_sal.h"
-#include "driver/rados/rgw_sal_rados.h"
-#include "rgw_dedup_utils.h"
-#include "rgw_dedup.h"
-#include "rgw_dedup_store.h"
-#include "fmt/ranges.h"
-#include <span>
-
-namespace rgw::dedup {
-
- //---------------------------------------------------------------------------
- disk_record_t::disk_record_t(const rgw::sal::Bucket *p_bucket,
- const std::string &obj_name,
- const parsed_etag_t *p_parsed_etag,
- uint64_t obj_size,
- const std::string &storage_class)
- {
- this->s.rec_version = 0;
- this->s.flags = 0;
- this->s.num_parts = p_parsed_etag->num_parts;
- this->obj_name = obj_name;
- this->s.obj_name_len = this->obj_name.length();
- this->bucket_name = p_bucket->get_name();
- this->s.bucket_name_len = this->bucket_name.length();
-
- this->s.md5_high = p_parsed_etag->md5_high;
- this->s.md5_low = p_parsed_etag->md5_low;
- this->s.obj_bytes_size = obj_size;
- this->s.object_version = 0;
-
- this->bucket_id = p_bucket->get_bucket_id();
- this->s.bucket_id_len = this->bucket_id.length();
- this->tenant_name = p_bucket->get_tenant();
- this->s.tenant_name_len = this->tenant_name.length();
- this->stor_class = storage_class;
- this->s.stor_class_len = storage_class.length();
-
- this->s.ref_tag_len = 0;
- this->s.manifest_len = 0;
-
- this->s.shared_manifest = 0;
- memset(this->s.hash, 0, sizeof(this->s.hash));
- this->ref_tag = "";
- this->manifest_bl.clear();
- }
-
- //---------------------------------------------------------------------------
- disk_record_t::disk_record_t(const char *buff)
- {
- disk_record_t *p_rec = (disk_record_t*)buff;
- this->s.rec_version = p_rec->s.rec_version;
- // wrong version, bail out
- if (unlikely(p_rec->s.rec_version != 0)) {
- return;
- }
-
- this->s.flags = p_rec->s.flags;
- this->s.num_parts = CEPHTOH_16(p_rec->s.num_parts);
- this->s.obj_name_len = CEPHTOH_16(p_rec->s.obj_name_len);
- this->s.bucket_name_len = CEPHTOH_16(p_rec->s.bucket_name_len);
-
- this->s.md5_high = CEPHTOH_64(p_rec->s.md5_high);
- this->s.md5_low = CEPHTOH_64(p_rec->s.md5_low);
- this->s.obj_bytes_size = CEPHTOH_64(p_rec->s.obj_bytes_size);
- this->s.object_version = CEPHTOH_64(p_rec->s.object_version);
-
- this->s.bucket_id_len = CEPHTOH_16(p_rec->s.bucket_id_len);
- this->s.tenant_name_len = CEPHTOH_16(p_rec->s.tenant_name_len);
- this->s.stor_class_len = CEPHTOH_16(p_rec->s.stor_class_len);
- this->s.ref_tag_len = CEPHTOH_16(p_rec->s.ref_tag_len);
- this->s.manifest_len = CEPHTOH_16(p_rec->s.manifest_len);
-
- const char *p = buff + sizeof(this->s);
- this->obj_name = std::string(p, this->s.obj_name_len);
- p += p_rec->s.obj_name_len;
-
- this->bucket_name = std::string(p, this->s.bucket_name_len);
- p += p_rec->s.bucket_name_len;
-
- this->bucket_id = std::string(p, this->s.bucket_id_len);
- p += p_rec->s.bucket_id_len;
-
- this->tenant_name = std::string(p, this->s.tenant_name_len);
- p += p_rec->s.tenant_name_len;
-
- this->stor_class = std::string(p, this->s.stor_class_len);
- p += p_rec->s.stor_class_len;
-
- if (p_rec->s.flags.is_fastlane()) {
- // TBD:: remove asserts
- ceph_assert(this->s.ref_tag_len == 0);
- ceph_assert(this->s.manifest_len == 0);
- }
- else {
- this->s.shared_manifest = CEPHTOH_64(p_rec->s.shared_manifest);
- // BLAKE3 hash has 256 bit splitted into multiple 64bit units
- const unsigned units = (256 / (sizeof(uint64_t)*8));
- static_assert(units == 4);
- for (unsigned i = 0; i < units; i++) {
- this->s.hash[i] = CEPHTOH_64(p_rec->s.hash[i]);
- }
- this->ref_tag = std::string(p, this->s.ref_tag_len);
- p += p_rec->s.ref_tag_len;
-
- this->manifest_bl.append(p, this->s.manifest_len);
- }
- }
-
- //---------------------------------------------------------------------------
- size_t disk_record_t::serialize(char *buff) const
- {
- ceph_assert(this->s.rec_version == 0);
- disk_record_t *p_rec = (disk_record_t*)buff;
- p_rec->s.rec_version = 0;
- p_rec->s.flags = this->s.flags;
- p_rec->s.num_parts = HTOCEPH_16(this->s.num_parts);
- p_rec->s.obj_name_len = HTOCEPH_16(this->obj_name.length());
- p_rec->s.bucket_name_len = HTOCEPH_16(this->bucket_name.length());
-
- p_rec->s.md5_high = HTOCEPH_64(this->s.md5_high);
- p_rec->s.md5_low = HTOCEPH_64(this->s.md5_low);
- p_rec->s.obj_bytes_size = HTOCEPH_64(this->s.obj_bytes_size);
- p_rec->s.object_version = HTOCEPH_64(this->s.object_version);
-
- p_rec->s.bucket_id_len = HTOCEPH_16(this->bucket_id.length());
- p_rec->s.tenant_name_len = HTOCEPH_16(this->tenant_name.length());
- p_rec->s.stor_class_len = HTOCEPH_16(this->stor_class.length());
- p_rec->s.ref_tag_len = HTOCEPH_16(this->ref_tag.length());
- p_rec->s.manifest_len = HTOCEPH_16(this->manifest_bl.length());
- char *p = buff + sizeof(this->s);
- unsigned len = this->obj_name.length();
- std::memcpy(p, this->obj_name.data(), len);
- p += len;
-
- len = this->bucket_name.length();
- std::memcpy(p, this->bucket_name.data(), len);
- p += len;
-
- len = this->bucket_id.length();
- std::memcpy(p, this->bucket_id.data(), len);
- p += len;
-
- len = this->tenant_name.length();
- std::memcpy(p, this->tenant_name.data(), len);
- p += len;
-
- len = this->stor_class.length();
- std::memcpy(p, this->stor_class.data(), len);
- p += len;
-
- if (this->s.flags.is_fastlane()) {
- // TBD:: remove asserts
- ceph_assert(this->s.ref_tag_len == 0);
- ceph_assert(this->s.manifest_len == 0);
- }
- else {
- p_rec->s.shared_manifest = HTOCEPH_64(this->s.shared_manifest);
- // BLAKE3 hash has 256 bit splitted into multiple 64bit units
- const unsigned units = (256 / (sizeof(uint64_t)*8));
- static_assert(units == 4);
- for (unsigned i = 0; i < units; i++) {
- p_rec->s.hash[i] = HTOCEPH_64(this->s.hash[i]);
- }
- len = this->ref_tag.length();
- std::memcpy(p, this->ref_tag.data(), len);
- p += len;
-
- len = this->manifest_bl.length();
- const char *p_manifest = const_cast<disk_record_t*>(this)->manifest_bl.c_str();
- std::memcpy(p, p_manifest, len);
- p += len;
- }
- return (p - buff);
- }
-
- //---------------------------------------------------------------------------
- size_t disk_record_t::length() const
- {
- return (sizeof(this->s) +
- this->obj_name.length() +
- this->bucket_name.length() +
- this->bucket_id.length() +
- this->tenant_name.length() +
- this->stor_class.length() +
- this->ref_tag.length() +
- this->manifest_bl.length());
- }
-
- //---------------------------------------------------------------------------
- int disk_record_t::validate(const char *caller,
- const DoutPrefixProvider* dpp,
- disk_block_id_t block_id,
- record_id_t rec_id) const
- {
- // optimistic approach
- if (likely((this->s.rec_version == 0) && (this->length() <= MAX_REC_SIZE))) {
- ldpp_dout(dpp, 20) << __func__ << "::success" << dendl;
- return 0;
- }
-
- // wrong version
- if (this->s.rec_version != 0) {
- // TBD
- //p_stats->failed_wrong_ver++;
- ldpp_dout(dpp, 5) << __func__ << "::" << caller << "::ERR: Bad record version: "
- << this->s.rec_version
- << "::block_id=" << block_id
- << "::rec_id=" << rec_id
- << dendl;
- return -EPROTO; // Protocol error
- }
-
- // if arrived here record size is too large
- // TBD
- //p_stats->failed_rec_overflow++;
- ldpp_dout(dpp, 5) << __func__ << "::" << caller << "::ERR: record size too big: "
- << this->length()
- << "::block_id=" << block_id
- << "::rec_id=" << rec_id
- << dendl;
- return -EOVERFLOW; // maybe should use -E2BIG ??
- }
-
- //---------------------------------------------------------------------------
- std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec)
- {
- stream << rec.obj_name << "::" << rec.s.obj_name_len << "\n";
- stream << rec.bucket_name << "::" << rec.s.bucket_name_len << "\n";
- stream << rec.bucket_id << "::" << rec.s.bucket_id_len << "\n";
- stream << rec.tenant_name << "::" << rec.s.tenant_name_len << "\n";
- stream << rec.stor_class << "::" << rec.s.stor_class_len << "\n";
- stream << rec.ref_tag << "::" << rec.s.ref_tag_len << "\n";
- stream << "num_parts = " << rec.s.num_parts << "\n";
- stream << "obj_size = " << rec.s.obj_bytes_size/1024 <<" KiB" << "\n";
- stream << "MD5 = " << std::hex << rec.s.md5_high << rec.s.md5_low << "\n";
- stream << "HASH = ";
- // BLAKE3 hash has 256 bit splitted into multiple 64bit units
- const unsigned units = (256 / (sizeof(uint64_t)*8));
- static_assert(units == 4);
- for (unsigned i = 0; i < units; i++) {
- stream << rec.s.hash[i];
- }
- stream << "\n";
-
- if (rec.has_shared_manifest()) {
- stream << "Shared Manifest Object\n";
- }
- else {
- stream << "Dedicated Manifest Object\n";
- }
- stream << "Manifest len=" << rec.s.manifest_len << "\n";
- return stream;
- }
-
- //---------------------------------------------------------------------------
- void disk_block_t::init(work_shard_t worker_id, uint32_t seq_number)
- {
- disk_block_header_t *p_header = get_header();
- p_header->offset = sizeof(disk_block_header_t);
- p_header->rec_count = 0;
- p_header->block_id = disk_block_id_t(worker_id, seq_number);
- }
-
- //---------------------------------------------------------------------------
- int disk_block_header_t::verify(disk_block_id_t expected_block_id, const DoutPrefixProvider* dpp)
- {
- if (unlikely(offset != BLOCK_MAGIC && offset != LAST_BLOCK_MAGIC)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR::bad magic number (0x" << std::hex << offset << std::dec << ")" << dendl;
- return -EINVAL;
- }
-
- if (unlikely(rec_count > MAX_REC_IN_BLOCK) ) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR::rec_count=" << rec_count << " > MAX_REC_IN_BLOCK" << dendl;
- return -EINVAL;
- }
-
- if (unlikely(this->block_id != expected_block_id)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR::block_id=" << block_id
- << "!= expected_block_id=" << expected_block_id << dendl;
- return -EINVAL;
- }
-
- return 0;
- }
-
- //---------------------------------------------------------------------------
- record_id_t disk_block_t::add_record(const disk_record_t *p_rec,
- const DoutPrefixProvider *dpp)
- {
- disk_block_header_t *p_header = get_header();
- if (unlikely(p_header->rec_count >= MAX_REC_IN_BLOCK)) {
- ldpp_dout(dpp, 20) << __func__ << "::rec_count=" << p_header->rec_count
- << ", MAX_REC_IN_BLOCK=" << MAX_REC_IN_BLOCK << dendl;
- return MAX_REC_IN_BLOCK;
- }
-
- if ((DISK_BLOCK_SIZE - p_header->offset) >= p_rec->length()) {
- p_header->rec_offsets[p_header->rec_count] = p_header->offset;
- unsigned rec_id = p_header->rec_count;
- p_header->rec_count ++;
- p_rec->serialize(data+p_header->offset);
- p_header->offset += p_rec->length();
- return rec_id;
- }
- else {
- return MAX_REC_IN_BLOCK;
- }
- }
-
- //---------------------------------------------------------------------------
- void disk_block_t::close_block(const DoutPrefixProvider* dpp, bool has_more)
- {
- disk_block_header_t *p_header = get_header();
- ldpp_dout(dpp, 20) << __func__ << "::rec_count=" << p_header->rec_count
- << ", has_more=" << (has_more? "TRUE" : "FALSE") << dendl;
-
- memset(data + p_header->offset, 0, (DISK_BLOCK_SIZE - p_header->offset));
- if (has_more) {
- p_header->offset = HTOCEPH_16(BLOCK_MAGIC);
- }
- else {
- p_header->offset = HTOCEPH_16(LAST_BLOCK_MAGIC);
- }
- for (unsigned i = 0; i < p_header->rec_count; i++) {
- p_header->rec_offsets[i] = HTOCEPH_16(p_header->rec_offsets[i]);
- }
- p_header->rec_count = HTOCEPH_16(p_header->rec_count);
- p_header->block_id = HTOCEPH_32((uint32_t)p_header->block_id);
- // TBD: CRC
- }
-
- //---------------------------------------------------------------------------
- void disk_block_header_t::deserialize()
- {
- this->offset = CEPHTOH_16(this->offset);
- this->rec_count = CEPHTOH_16(this->rec_count);
- this->block_id = CEPHTOH_32((uint32_t)this->block_id);
- for (unsigned i = 0; i < this->rec_count; i++) {
- this->rec_offsets[i] = CEPHTOH_16(this->rec_offsets[i]);
- }
- }
-
- //---------------------------------------------------------------------------
- disk_block_seq_t::disk_block_seq_t(const DoutPrefixProvider* dpp_in,
- disk_block_t *p_arr_in,
- work_shard_t worker_id,
- md5_shard_t md5_shard,
- worker_stats_t *p_stats_in)
- {
- activate(dpp_in, p_arr_in, worker_id, md5_shard, p_stats_in);
- }
-
- //---------------------------------------------------------------------------
- void disk_block_seq_t::activate(const DoutPrefixProvider* dpp_in,
- disk_block_t *p_arr_in,
- work_shard_t worker_id,
- md5_shard_t md5_shard,
- worker_stats_t *p_stats_in)
- {
- dpp = dpp_in;
- p_arr = p_arr_in;
- d_worker_id = worker_id;
- d_md5_shard = md5_shard;
- p_stats = p_stats_in;
- p_curr_block = nullptr;
- d_seq_number = 0;
-
- memset(p_arr, 0, sizeof(disk_block_t));
- slab_reset();
- }
-
- //---------------------------------------------------------------------------
- [[maybe_unused]]static int print_manifest(const DoutPrefixProvider *dpp,
- RGWRados *rados,
- const bufferlist &manifest_bl)
- {
- RGWObjManifest manifest;
- try {
- auto bl_iter = manifest_bl.cbegin();
- decode(manifest, bl_iter);
- } catch (buffer::error& err) {
- ldpp_dout(dpp, 1) << __func__ << "::ERROR: unable to decode manifest" << dendl;
- return -EINVAL;
- }
-
- unsigned idx = 0;
- for (auto p = manifest.obj_begin(dpp); p != manifest.obj_end(dpp); ++p, ++idx) {
- rgw_raw_obj raw_obj = p.get_location().get_raw_obj(rados);
- ldpp_dout(dpp, 20) << idx << "] " << raw_obj.oid << dendl;
- }
- ldpp_dout(dpp, 20) << "==============================================" << dendl;
- return 0;
- }
-
- //---------------------------------------------------------------------------
- std::ostream& operator<<(std::ostream& out, const disk_block_id_t& block_id)
- {
- std::ios_base::fmtflags flags = out.flags();
- out << std::hex << "0x"
- << (uint32_t)block_id.get_work_shard_id() << "::"
- << (uint32_t)block_id.get_slab_id() << "::"
- << (uint32_t)block_id.get_block_offset();
-
- if (flags & std::ios::dec) {
- out << std::dec;
- }
- return out;
- }
-
- //---------------------------------------------------------------------------
- std::string disk_block_id_t::get_slab_name(md5_shard_t md5_shard) const
- {
- // SLAB.MD5_ID.WORKER_ID.SLAB_SEQ_ID
- const char *SLAB_NAME_FORMAT = "SLB.%03X.%02X.%04X";
- static constexpr uint32_t SLAB_NAME_SIZE = 16;
- char name_buf[SLAB_NAME_SIZE];
- slab_id_t slab_id = get_slab_id();
- work_shard_t work_id = get_work_shard_id();
- unsigned n = snprintf(name_buf, sizeof(name_buf), SLAB_NAME_FORMAT,
- md5_shard, work_id, slab_id);
- std::string oid(name_buf, n);
- return oid;
- }
-
- //---------------------------------------------------------------------------
- int load_record(librados::IoCtx &ioctx,
- const disk_record_t *p_tgt_rec,
- disk_record_t *p_src_rec, /* OUT */
- disk_block_id_t block_id,
- record_id_t rec_id,
- md5_shard_t md5_shard,
- const DoutPrefixProvider *dpp)
- {
- std::string oid(block_id.get_slab_name(md5_shard));
- int read_len = DISK_BLOCK_SIZE;
- static_assert(sizeof(disk_block_t) == DISK_BLOCK_SIZE);
- int byte_offset = block_id.get_block_offset() * DISK_BLOCK_SIZE;
- bufferlist bl;
- int ret = ioctx.read(oid, bl, read_len, byte_offset);
- if (unlikely(ret != read_len)) {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: failed to read block from " << oid
- << "::ret=" << ret << "::err=" << cpp_strerror(-ret)<<dendl;
- return ret;
- }
- else {
- ldpp_dout(dpp, 20) << __func__ << "::oid=" << oid << "::ret=" << ret
- << "::len=" << bl.length() << dendl;
- }
-
- const char *p = bl.c_str();
- disk_block_t *p_disk_block = (disk_block_t*)p;
- disk_block_header_t *p_header = p_disk_block->get_header();
- p_header->deserialize();
- ret = p_header->verify(block_id, dpp);
- if (ret != 0) {
- return ret;
- }
-
- unsigned offset = p_header->rec_offsets[rec_id];
- // We deserialize the record inside the CTOR
- disk_record_t rec(p + offset);
- ret = rec.validate(__func__, dpp, block_id, rec_id);
- if (unlikely(ret != 0)) {
- //p_stats->failed_rec_load++;
- return ret;
- }
-
- if (rec.s.md5_high == p_tgt_rec->s.md5_high &&
- rec.s.md5_low == p_tgt_rec->s.md5_low &&
- rec.s.num_parts == p_tgt_rec->s.num_parts &&
- rec.s.obj_bytes_size == p_tgt_rec->s.obj_bytes_size &&
- rec.stor_class == p_tgt_rec->stor_class) {
-
- *p_src_rec = rec;
- return 0;
- }
- else {
- ldpp_dout(dpp, 5) << __func__ << "::ERR: Bad record in block=" << block_id
- << ", rec_id=" << rec_id << dendl;
- return -EIO;
- }
-
- return 0;
- }
-
- //---------------------------------------------------------------------------
- [[maybe_unused]]static void
- copy_bl_multi_parts(const bufferlist &bl_in, bufferlist &bl_out,
- const DoutPrefixProvider* dpp)
- {
- const size_t MAX = 260*1024;
- char buff[MAX];
- std::srand(std::time({}));
-
- std::vector<int> vec;
- auto bl_itr = bl_in.cbegin();
- size_t len = bl_in.length();
- while (len) {
- const int random_value = std::rand();
- size_t req_len = std::min((random_value % MAX), len);
- if (len < MAX) {
- req_len = len;
- }
- vec.push_back(req_len);
- const char *p = get_next_data_ptr(bl_itr, buff, req_len, dpp);
- bufferptr ptr(p, req_len);
- bl_out.append(ptr);
- len -= req_len;
- }
- ldpp_dout(dpp, 20) << __func__ << "::req_len=" << vec << dendl;
- }
-
- //---------------------------------------------------------------------------
- int load_slab(librados::IoCtx &ioctx,
- bufferlist &bl_out,
- md5_shard_t md5_shard,
- work_shard_t worker_id,
- uint32_t seq_number,
- const DoutPrefixProvider* dpp)
- {
- disk_block_id_t block_id(worker_id, seq_number);
- std::string oid(block_id.get_slab_name(md5_shard));
- ldpp_dout(dpp, 20) << __func__ << "::worker_id=" << (uint32_t)worker_id
- << ", md5_shard=" << (uint32_t)md5_shard
- << ", seq_number=" << seq_number
- << ":: oid=" << oid << dendl;
-#ifndef DEBUG_FRAGMENTED_BUFFERLIST
- int ret = ioctx.read(oid, bl_out, 0, 0);
- if (ret > 0) {
- ldpp_dout(dpp, 20) << __func__ << "::oid=" << oid << ", len="
- << bl_out.length() << dendl;
- }
-#else
- // DEBUG MODE to test with fragmented bufferlist
- bufferlist bl_in;
- // read full object
- int ret = ioctx.read(oid, bl_in, 0, 0);
- if (ret > 0) {
- ldpp_dout(dpp, 20) << __func__ << "::oid=" << oid << ", len="
- << bl_in.length() << dendl;
- copy_bl_multi_parts(bl_in, bl_out, dpp);
- }
-#endif
- else {
- if (ret == 0) {
- // no error reported, but we read nothing which should never happen
- ldpp_dout(dpp, 1) << __func__ << "::ERR: Empty SLAB " << oid << dendl;
- ret = -ENODATA;
- }
- ldpp_dout(dpp, 5) << __func__ << "::ERR: failed to read " << oid
- << ", error is " << cpp_strerror(-ret) << dendl;
- }
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int store_slab(librados::IoCtx &ioctx,
- bufferlist &bl,
- md5_shard_t md5_shard,
- work_shard_t worker_id,
- uint32_t seq_number,
- const DoutPrefixProvider* dpp)
- {
- disk_block_id_t block_id(worker_id, seq_number);
- std::string oid(block_id.get_slab_name(md5_shard));
- ldpp_dout(dpp, 20) << __func__ << "::oid=" << oid << ", len="
- << bl.length() << dendl;
- ceph_assert(bl.length());
-
- int ret = ioctx.write_full(oid, bl);
- if (ret == (int)bl.length()) {
- ldpp_dout(dpp, 20) << __func__ << "::wrote " << bl.length() << " bytes to "
- << oid << dendl;
- }
- else {
- if (ret == 0) {
- // no error reported, but we wrote nothing which should never happen
- ldpp_dout(dpp, 5) << __func__ << "::ERR: No Data was written to " << oid
- << ", bl.length()=" << bl.length() << dendl;
- ret = -ENODATA;
- }
- ldpp_dout(dpp, 1) << "ERROR: failed to write " << oid
- << " with: " << cpp_strerror(-ret) << dendl;
- }
-
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int disk_block_seq_t::flush(librados::IoCtx &ioctx)
- {
- unsigned len = (p_curr_block + 1 - p_arr) * sizeof(disk_block_t);
- bufferlist bl = bufferlist::static_from_mem((char*)p_arr, len);
- int ret = store_slab(ioctx, bl, d_md5_shard, d_worker_id, d_seq_number, dpp);
- // Need to make sure the call to rgw_put_system_obj was fully synchronous
-
- // d_seq_number++ must be called **after** flush!!
- d_seq_number++;
- p_stats->egress_slabs++;
- slab_reset();
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int disk_block_seq_t::flush_disk_records(librados::IoCtx &ioctx)
- {
- ceph_assert(p_arr);
- ldpp_dout(dpp, 20) << __func__ << "::worker_id=" << (uint32_t)d_worker_id
- << ", md5_shard=" << (uint32_t)d_md5_shard << dendl;
-
- // we need to force flush at the end of a cycle even if there was no work done
- // it is used as a signal to worker in the next step
- if (p_curr_block == &p_arr[0] && p_curr_block->is_empty()) {
- ldpp_dout(dpp, 20) << __func__ << "::Empty buffers, generate terminating block" << dendl;
- }
- p_stats->egress_blocks++;
- p_curr_block->close_block(dpp, false);
-
- int ret = flush(ioctx);
- return ret;
- }
-
- //---------------------------------------------------------------------------
- int disk_block_seq_t::add_record(librados::IoCtx &ioctx,
- const disk_record_t *p_rec, // IN-OUT
- record_info_t *p_rec_info) // OUT-PARAM
- {
- disk_block_id_t null_block_id;
- int ret = p_rec->validate(__func__, dpp, null_block_id, MAX_REC_IN_BLOCK);
- if (unlikely(ret != 0)) {
- // TBD
- //p_stats->failed_rec_store++;
- return ret;
- }
-
- p_stats->egress_records ++;
- // first, try and add the record to the current open block
- p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp);
- if (p_rec_info->rec_id < MAX_REC_IN_BLOCK) {
- p_rec_info->block_id = p_curr_block->get_block_id();
- return 0;
- }
- else {
- // Not enough space left in current block, close it and open the next block
- ldpp_dout(dpp, 20) << __func__ << "::Block is full-> close and move to next" << dendl;
- p_stats->egress_blocks++;
- p_curr_block->close_block(dpp, true);
- }
-
- // Do we have more Blocks in the block-array ?
- if (p_curr_block < last_block()) {
- p_curr_block ++;
- d_seq_number ++;
- p_curr_block->init(d_worker_id, d_seq_number);
- p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp);
- }
- else {
- ldpp_dout(dpp, 20) << __func__ << "::calling flush()" << dendl;
- ret = flush(ioctx);
- p_rec_info->rec_id = p_curr_block->add_record(p_rec, dpp);
- }
-
- p_rec_info->block_id = p_curr_block->get_block_id();
- return ret;
- }
-
- //---------------------------------------------------------------------------
- disk_block_array_t::disk_block_array_t(const DoutPrefixProvider* dpp,
- uint8_t *raw_mem,
- uint64_t raw_mem_size,
- work_shard_t worker_id,
- worker_stats_t *p_stats,
- md5_shard_t num_md5_shards)
- {
- d_num_md5_shards = num_md5_shards;
- d_worker_id = worker_id;
- disk_block_t *p = (disk_block_t *)raw_mem;
- disk_block_t *p_end = (disk_block_t *)(raw_mem + raw_mem_size);
-
- for (unsigned md5_shard = 0; md5_shard < d_num_md5_shards; md5_shard++) {
- ldpp_dout(dpp, 20) << __func__ << "::p=" << p << "::p_end=" << p_end << dendl;
- if (p + DISK_BLOCK_COUNT <= p_end) {
- d_disk_arr[md5_shard].activate(dpp, p, d_worker_id, md5_shard, p_stats);
- p += DISK_BLOCK_COUNT;
- }
- else {
- ldpp_dout(dpp, 1) << __func__ << "::ERR: buffer overflow! "
- << "::md5_shard=" << md5_shard << "/" << d_num_md5_shards
- << "::raw_mem_size=" << raw_mem_size << dendl;
- ldpp_dout(dpp, 1) << __func__
- << "::sizeof(disk_block_t)=" << sizeof(disk_block_t)
- << "::DISK_BLOCK_COUNT=" << DISK_BLOCK_COUNT << dendl;
- ceph_abort();
- }
- }
- }
-
- //---------------------------------------------------------------------------
- void disk_block_array_t::flush_output_buffers(const DoutPrefixProvider* dpp,
- librados::IoCtx &ioctx)
- {
- for (md5_shard_t md5_shard = 0; md5_shard < d_num_md5_shards; md5_shard++) {
- ldpp_dout(dpp, 20) <<__func__ << "::flush buffers:: worker_id="
- << d_worker_id<< ", md5_shard=" << md5_shard << dendl;
- d_disk_arr[md5_shard].flush_disk_records(ioctx);
- }
- }
-} // namespace rgw::dedup
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-#include "common/dout.h"
-#include "rgw_common.h"
-#include "rgw_realm_reloader.h"
-#include <string>
-#include <unordered_map>
-#include <variant>
-#include <iostream>
-#include <ostream>
-#include <cstring>
-#include <string>
-#include "include/rados/rados_types.hpp"
-#include "include/rados/buffer.h"
-#include "include/rados/librados.hpp"
-#include "rgw_dedup_utils.h"
-#include "BLAKE3/c/blake3.h"
-
-namespace rgw::dedup {
- struct key_t;
-#define CEPHTOH_16 le16toh
-#define CEPHTOH_32 le32toh
-#define CEPHTOH_64 le64toh
-#define HTOCEPH_16 htole16
-#define HTOCEPH_32 htole32
-#define HTOCEPH_64 htole64
-
- static inline constexpr unsigned DISK_BLOCK_SIZE = 8*1024;
- // we use 16 bit offset
- static_assert(DISK_BLOCK_SIZE < 64*1024);
- static constexpr unsigned DISK_BLOCK_COUNT = 256;
- static_assert(DISK_BLOCK_COUNT <= (4*1024*1024/DISK_BLOCK_SIZE));
- static constexpr unsigned MAX_REC_IN_BLOCK = 32;
- // we use 8bit record indices
- static_assert(MAX_REC_IN_BLOCK < 0xFF);
- using slab_id_t = uint16_t;
- using block_offset_t = uint8_t;
- using record_id_t = uint8_t;
-
- // disk_block_id_t is a 32 bits concataion of shard_id, slab_id and block_off
- // ---8---- | -------16------- | ---8----
- // shard_id | slab_id | block_off
- struct __attribute__ ((packed)) disk_block_id_t
- {
- public:
- disk_block_id_t() {
- block_id = 0;
- }
-
- disk_block_id_t(work_shard_t shard_id, uint32_t seq_number) {
- ceph_assert((seq_number & SEQ_NUMBER_MASK) == seq_number);
- ceph_assert(shard_id <= MAX_WORK_SHARD);
- block_id = (uint32_t)shard_id << OBJ_SHARD_SHIFT | seq_number;
- }
-
- disk_block_id_t& operator =(const disk_block_id_t &other) {
- this->block_id = other.block_id;
- return *this;
- }
-
- inline disk_block_id_t& operator =(uint32_t val) {
- this->block_id = val;
- return *this;
- }
-
- inline bool operator ==(const disk_block_id_t &other) const {
- return (this->block_id == other.block_id);
- }
-
- inline explicit operator uint32_t() const {
- return this->block_id;
- }
-
- friend std::ostream& operator<<(std::ostream& os, const disk_block_id_t& block_id);
-
- std::string get_slab_name(md5_shard_t md5_shard) const;
-
- static inline slab_id_t seq_num_to_slab_id(uint32_t seq_number) {
- return (seq_number & SLAB_ID_MASK) >> SLAB_ID_SHIFT;
- }
-
- static inline uint32_t slab_id_to_seq_num(uint32_t slab_id) {
- return (slab_id << SLAB_ID_SHIFT);
- }
-
- inline block_offset_t get_block_offset() const {
- return get_block_offset(get_seq_num());
- }
-
- inline work_shard_t get_work_shard_id() const {
- return (block_id & OBJ_SHARD_MASK) >> OBJ_SHARD_SHIFT;
- }
-
- private:
- inline uint32_t get_seq_num() const {
- return (block_id & SEQ_NUMBER_MASK);
- }
-
- inline slab_id_t get_slab_id() const {
- return seq_num_to_slab_id(get_seq_num());
- }
-
- inline block_offset_t get_block_offset(uint32_t seq_number) const {
- return (seq_number & BLOCK_OFF_MASK);
- }
-
- static constexpr uint32_t OBJ_SHARD_SHIFT = 24;
- static constexpr uint32_t OBJ_SHARD_MASK = 0xFF000000;
-
- static constexpr uint32_t SEQ_NUMBER_SHIFT = 0;
- static constexpr uint32_t SEQ_NUMBER_MASK = 0x00FFFFFF;
-
- static constexpr uint32_t SLAB_ID_SHIFT = 8;
- static constexpr uint32_t SLAB_ID_MASK = 0x00FFFF00;
-
- static constexpr uint32_t BLOCK_OFF_SHIFT = 0;
- static constexpr uint32_t BLOCK_OFF_MASK = 0x000000FF;
-
- uint32_t block_id;
- };
-
- struct disk_record_t
- {
- disk_record_t(const char *buff);
- disk_record_t(const rgw::sal::Bucket *p_bucket,
- const std::string &obj_name,
- const parsed_etag_t *p_parsed_etag,
- uint64_t obj_size,
- const std::string &storage_class);
- disk_record_t() {}
- size_t serialize(char *buff) const;
- size_t length() const;
- int validate(const char *caller,
- const DoutPrefixProvider* dpp,
- disk_block_id_t block_id,
- record_id_t rec_id) const;
- inline bool has_shared_manifest() const { return s.flags.has_shared_manifest(); }
- inline void set_shared_manifest() { s.flags.set_shared_manifest(); }
-
- struct __attribute__ ((packed)) packed_rec_t
- {
- uint8_t rec_version; // allows changing record format
- dedup_flags_t flags; // 1 Byte flags
- uint16_t num_parts; // For multipart upload (AWS MAX-PART is 10,000)
- uint16_t obj_name_len;
- uint16_t bucket_name_len;
-
- uint64_t md5_high; // High Bytes of the Object Data MD5
- uint64_t md5_low; // Low Bytes of the Object Data MD5
- uint64_t obj_bytes_size;
- uint64_t object_version;
-
- uint16_t bucket_id_len;
- uint16_t tenant_name_len;
- uint16_t stor_class_len;
- uint16_t ref_tag_len;
-
- uint16_t manifest_len;
- uint8_t pad[6];
-
- uint64_t shared_manifest; // 64bit hash of the SRC object manifest
- uint64_t hash[4]; // 4 * 8 Bytes of BLAKE3
- }s;
- std::string obj_name;
- // TBD: find pool name making it easier to get ioctx
- std::string bucket_name;
- std::string bucket_id;
- std::string tenant_name;
- std::string ref_tag;
- std::string stor_class;
- bufferlist manifest_bl;
- };
- static_assert(BLAKE3_OUT_LEN == sizeof(disk_record_t::packed_rec_t::hash));
- std::ostream &operator<<(std::ostream &stream, const disk_record_t & rec);
-
- static constexpr unsigned BLOCK_MAGIC = 0xFACE;
- static constexpr unsigned LAST_BLOCK_MAGIC = 0xCAD7;
- struct __attribute__ ((packed)) disk_block_header_t {
- void deserialize();
- int verify(disk_block_id_t block_id, const DoutPrefixProvider* dpp);
- uint16_t offset;
- uint16_t rec_count;
- disk_block_id_t block_id;
- uint16_t rec_offsets[MAX_REC_IN_BLOCK];
- };
- static constexpr unsigned MAX_REC_SIZE = (DISK_BLOCK_SIZE - sizeof(disk_block_header_t));
-
- struct __attribute__ ((packed)) disk_block_t
- {
- const disk_block_header_t* get_header() const { return (disk_block_header_t*)data; }
- disk_block_header_t* get_header() { return (disk_block_header_t*)data; }
- bool is_empty() const { return (get_header()->rec_count == 0); }
-
- void init(work_shard_t worker_id, uint32_t seq_number);
- record_id_t add_record(const disk_record_t *p_rec, const DoutPrefixProvider *dpp);
- void close_block(const DoutPrefixProvider* dpp, bool has_more);
- disk_block_id_t get_block_id() {
- disk_block_header_t *p_header = get_header();
- return p_header->block_id;
- }
- char data[DISK_BLOCK_SIZE];
- };
-
- int load_record(librados::IoCtx &ioctx,
- const disk_record_t *p_tgt_rec,
- disk_record_t *p_src_rec, /* OUT */
- disk_block_id_t block_id,
- record_id_t rec_id,
- md5_shard_t md5_shard,
- const DoutPrefixProvider *dpp);
-
- int load_slab(librados::IoCtx &ioctx,
- bufferlist &bl,
- md5_shard_t md5_shard,
- work_shard_t worker_id,
- uint32_t seq_number,
- const DoutPrefixProvider* dpp);
-
- int store_slab(librados::IoCtx &ioctx,
- bufferlist &bl,
- md5_shard_t md5_shard,
- work_shard_t worker_id,
- uint32_t seq_number,
- const DoutPrefixProvider* dpp);
-
- class disk_block_array_t;
- class disk_block_seq_t
- {
- friend class disk_block_array_t;
- public:
- struct record_info_t {
- disk_block_id_t block_id;
- record_id_t rec_id;
- };
-
- disk_block_seq_t(const DoutPrefixProvider* dpp_in,
- disk_block_t *p_arr_in,
- work_shard_t worker_id,
- md5_shard_t md5_shard,
- worker_stats_t *p_stats_in);
- int flush_disk_records(librados::IoCtx &ioctx);
- md5_shard_t get_md5_shard() { return d_md5_shard; }
- int add_record(librados::IoCtx &ioctx,
- const disk_record_t *p_rec, // IN-OUT
- record_info_t *p_rec_info); // OUT-PARAM
-
- private:
- disk_block_seq_t() {;}
- void activate(const DoutPrefixProvider* _dpp,
- disk_block_t *_p_arr,
- work_shard_t worker_id,
- md5_shard_t md5_shard,
- worker_stats_t *p_stats);
- inline const disk_block_t* last_block() { return &p_arr[DISK_BLOCK_COUNT-1]; }
- int flush(librados::IoCtx &ioctx);
- void slab_reset() {
- p_curr_block = p_arr;
- p_curr_block->init(d_worker_id, d_seq_number);
- }
-
- disk_block_t *p_arr = nullptr;
- disk_block_t *p_curr_block = nullptr;
- worker_stats_t *p_stats = nullptr;
- const DoutPrefixProvider *dpp = nullptr;
- uint32_t d_seq_number = 0;
- work_shard_t d_worker_id = NULL_WORK_SHARD;
- md5_shard_t d_md5_shard = NULL_MD5_SHARD;
- };
-
- class disk_block_array_t
- {
- public:
- disk_block_array_t(const DoutPrefixProvider* _dpp,
- uint8_t *raw_mem,
- uint64_t raw_mem_size,
- work_shard_t worker_id,
- worker_stats_t *p_worker_stats,
- md5_shard_t num_md5_shards);
- void flush_output_buffers(const DoutPrefixProvider* dpp,
- librados::IoCtx &ioctx);
- disk_block_seq_t* get_shard_block_seq(uint64_t md5_low) {
- md5_shard_t md5_shard = md5_low % d_num_md5_shards;
- return d_disk_arr + md5_shard;
- }
-
- //private:
- disk_block_seq_t d_disk_arr[MAX_MD5_SHARD];
- work_shard_t d_worker_id;
- md5_shard_t d_num_md5_shards;
- };
-} //namespace rgw::dedup
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "rgw_dedup_table.h"
-#include "include/ceph_assert.h"
-#include <cstring>
-#include <iostream>
-
-namespace rgw::dedup {
-
- //---------------------------------------------------------------------------
- dedup_table_t::dedup_table_t(const DoutPrefixProvider* _dpp,
- uint32_t _head_object_size,
- uint8_t *p_slab,
- uint64_t slab_size)
- {
- dpp = _dpp;
- head_object_size = _head_object_size;
- memset(p_slab, 0, slab_size);
- hash_tab = (table_entry_t*)p_slab;
- entries_count = slab_size/sizeof(table_entry_t);
- values_count = 0;
- occupied_count = 0;
- }
-
- //---------------------------------------------------------------------------
- void dedup_table_t::remove_singletons_and_redistribute_keys()
- {
- for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) {
- if (!hash_tab[tab_idx].val.is_occupied()) {
- continue;
- }
-
- if (hash_tab[tab_idx].val.is_singleton()) {
- hash_tab[tab_idx].val.clear_flags();
- redistributed_clear++;
- continue;
- }
-
- const key_t &key = hash_tab[tab_idx].key;
- // This is an approximation only since size is stored in 4KB resolution
- uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
- if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
- hash_tab[tab_idx].val.clear_flags();
- redistributed_clear++;
- continue;
- }
-
- uint32_t key_idx = key.hash() % entries_count;
- if (key_idx != tab_idx) {
- uint64_t count = 1;
- redistributed_count++;
- uint32_t idx = key_idx;
- while (hash_tab[idx].val.is_occupied() &&
- !hash_tab[idx].val.is_singleton() &&
- (hash_tab[idx].key != key)) {
- count++;
- idx = (idx + 1) % entries_count;
- }
-
- if (idx != tab_idx) {
- if (hash_tab[idx].val.is_occupied() && hash_tab[idx].val.is_singleton() ) {
- redistributed_clear++;
- }
- if (idx == key_idx) {
- redistributed_perfect++;
- }
- hash_tab[idx] = hash_tab[tab_idx];
- hash_tab[tab_idx].val.clear_flags();
- }
- else {
- redistributed_loopback++;
- }
-
- redistributed_search_max = std::max(redistributed_search_max, count);
- redistributed_search_total += count;
- }
- else {
- redistributed_not_needed++;
- }
- }
- }
-
- //---------------------------------------------------------------------------
- uint32_t dedup_table_t::find_entry(const key_t *p_key) const
- {
- uint32_t idx = p_key->hash() % entries_count;
-
- // search until we either find the key, or find an empty slot.
- while (hash_tab[idx].val.is_occupied() && (hash_tab[idx].key != *p_key)) {
- idx = (idx + 1) % entries_count;
- }
- return idx;
- }
-
- //---------------------------------------------------------------------------
- int dedup_table_t::add_entry(key_t *p_key,
- disk_block_id_t block_id,
- record_id_t rec_id,
- bool shared_manifest)
- {
- value_t new_val(block_id, rec_id, shared_manifest);
- uint32_t idx = find_entry(p_key);
- value_t &val = hash_tab[idx].val;
- if (!val.is_occupied()) {
- if (occupied_count < entries_count) {
- occupied_count++;
- }
- else {
- return -EOVERFLOW;
- }
-
- hash_tab[idx].key = *p_key;
- hash_tab[idx].val = new_val;
- ldpp_dout(dpp, 20) << __func__ << "::add new entry" << dendl;
- ceph_assert(val.count == 1);
- }
- else {
- ceph_assert(hash_tab[idx].key == *p_key);
- val.count ++;
- if (!val.has_shared_manifest() && shared_manifest) {
- // replace value!
- ldpp_dout(dpp, 20) << __func__ << "::Replace with shared_manifest::["
- << val.block_idx << "/" << (int)val.rec_id << "] -> ["
- << block_id << "/" << (int)rec_id << "]" << dendl;
- new_val.count = val.count;
- hash_tab[idx].val = new_val;
- }
- ceph_assert(val.count > 1);
- }
- values_count++;
- ldpp_dout(dpp, 20) << __func__ << "::COUNT="<< val.count << dendl;
- return 0;
- }
-
- //---------------------------------------------------------------------------
- void dedup_table_t::update_entry(key_t *p_key,
- disk_block_id_t block_id,
- record_id_t rec_id,
- bool shared_manifest)
- {
- uint32_t idx = find_entry(p_key);
- ceph_assert(hash_tab[idx].key == *p_key);
- value_t &val = hash_tab[idx].val;
- ceph_assert(val.is_occupied());
- // we only update non-singletons since we purge singletons after the first pass
- ceph_assert(val.count > 1);
-
- // need to overwrite the block_idx/rec_id from the first pass
- // unless already set with shared_manifest with the correct block-id/rec-id
- // We only set the shared_manifest flag on the second pass where we
- // got valid block-id/rec-id
- if (!val.has_shared_manifest()) {
- // replace value!
- value_t new_val(block_id, rec_id, shared_manifest);
- new_val.count = val.count;
- hash_tab[idx].val = new_val;
- ldpp_dout(dpp, 20) << __func__ << "::Replaced table entry::["
- << val.block_idx << "/" << (int)val.rec_id << "] -> ["
- << block_id << "/" << (int)rec_id << "]" << dendl;
- }
- }
-
- //---------------------------------------------------------------------------
- int dedup_table_t::set_shared_manifest_src_mode(const key_t *p_key,
- disk_block_id_t block_id,
- record_id_t rec_id)
- {
- uint32_t idx = find_entry(p_key);
- value_t &val = hash_tab[idx].val;
- if (val.is_occupied()) {
- if (val.block_idx == block_id && val.rec_id == rec_id) {
- val.set_shared_manifest_src();
- return 0;
- }
- }
-
- return -ENOENT;
- }
-
- //---------------------------------------------------------------------------
- int dedup_table_t::get_val(const key_t *p_key, struct value_t *p_val /*OUT*/)
- {
- uint32_t idx = find_entry(p_key);
- const value_t &val = hash_tab[idx].val;
- if (!val.is_occupied()) {
- return -ENOENT;
- }
-
- *p_val = val;
- return 0;
- }
-
- //---------------------------------------------------------------------------
- void dedup_table_t::count_duplicates(dedup_stats_t *p_small_objs,
- dedup_stats_t *p_big_objs,
- uint64_t *p_duplicate_head_bytes)
- {
- for (uint32_t tab_idx = 0; tab_idx < entries_count; tab_idx++) {
- if (!hash_tab[tab_idx].val.is_occupied()) {
- continue;
- }
-
- const key_t &key = hash_tab[tab_idx].key;
- // This is an approximation only since size is stored in 4KB resolution
- uint64_t byte_size_approx = disk_blocks_to_byte_size(key.size_4k_units);
- uint32_t duplicate_count = (hash_tab[tab_idx].val.count -1);
-
- // skip small single part objects which we can't dedup
- if (!key.multipart_object() && (byte_size_approx <= head_object_size)) {
- if (hash_tab[tab_idx].val.is_singleton()) {
- p_small_objs->singleton_count++;
- }
- else {
- p_small_objs->duplicate_count += duplicate_count;
- p_small_objs->unique_count ++;
- p_small_objs->dedup_bytes_estimate += (duplicate_count * byte_size_approx);
- }
- continue;
- }
-
- if (hash_tab[tab_idx].val.is_singleton()) {
- p_big_objs->singleton_count++;
- }
- else {
- ceph_assert(hash_tab[tab_idx].val.count > 1);
- uint64_t dup_bytes_approx = calc_deduped_bytes(head_object_size,
- key.num_parts,
- byte_size_approx);
- p_big_objs->dedup_bytes_estimate += (duplicate_count * dup_bytes_approx);
- p_big_objs->duplicate_count += duplicate_count;
- p_big_objs->unique_count ++;
-
- if (!key.multipart_object()) {
- // single part objects duplicate the head object when dedup is used
- uint64_t dup_head_bytes = duplicate_count * head_object_size;
- *p_duplicate_head_bytes += dup_head_bytes;
- }
- }
- }
- }
-
-} // namespace rgw::dedup
-
-#if 0
-#include <climits>
-#include <cstdlib>
-#include <iostream>
-#include <cmath>
-#include <iomanip>
-#include <random>
-
-//---------------------------------------------------------------------------
-int main()
-{
- static constexpr unsigned MAX_ENTRIES = 1024;
- rgw::dedup::key_t *key_tab = new rgw::dedup::key_t[MAX_ENTRIES];
- if (!key_tab) {
- std::cerr << "faild alloc!" << std::endl;
- return 1;
- }
- rgw::dedup::key_t *p_key = key_tab;
- //rgw::dedup::dedup_table_t tab(MAX_ENTRIES + MAX_ENTRIES/5);
- rgw::dedup::dedup_table_t tab(MAX_ENTRIES);
-
- std::cout << "sizeof(key)=" << sizeof(rgw::dedup::key_t) << std::endl;
- // Seed with a real random value, if available
- std::random_device r;
- // Choose a random mean between 1 ULLONG_MAX
- std::default_random_engine e1(r());
- std::uniform_int_distribution<uint64_t> uniform_dist(1, std::numeric_limits<uint64_t>::max());
-
- for (unsigned i = 0; i < MAX_ENTRIES; i++) {
- uint64_t md5_high = uniform_dist(e1);
- uint64_t md5_low = uniform_dist(e1);
- uint32_t size_4k_units = std::rand();
- uint16_t num_parts = std::rand();
- //std::cout << std::hex << md5_high << "::" << md5_low << "::" << block_id << std::endl;
- rgw::dedup::key_t key(md5_high, md5_low, size_4k_units, num_parts);
- *p_key = key;
- p_key++;
- }
- work_shard_t work_shard = 3;
- for (unsigned i = 0; i < MAX_ENTRIES; i++) {
- disk_block_id_t block_id(worker_id, std::rand());
- tab.add_entry(key_tab+i, block_id, 0, false, false);
- }
- double avg = (double)total / MAX_ENTRIES;
- std::cout << "Insert::num entries=" << MAX_ENTRIES << ", total=" << total
- << ", avg=" << avg << ", max=" << max << std::endl;
- std::cout << "==========================================\n";
-
- total = 0;
- max = 0;
- for (unsigned i = 0; i < MAX_ENTRIES; i++) {
- tab.find_entry(key_tab+i);
- }
- avg = (double)total / MAX_ENTRIES;
- std::cout << "Find::num entries=" << MAX_ENTRIES << ", total=" << total
- << ", avg=" << avg << ", max=" << max << std::endl;
- std::cout << "==========================================\n";
- tab.remove_singletons_and_redistribute_keys();
- tab.print_redistribute_stats();
- tab.stat_counters_reset();
- std::cout << "==========================================\n";
- total = 0;
- max = 0;
- uint32_t cnt = 0;
- for (unsigned i = 0; i < MAX_ENTRIES; i++) {
- rgw::dedup::key_t *p_key = key_tab+i;
- tab.find_entry(p_key);
- cnt++;
-#if 0
- if (p_key->md5_high % 5 == 0) {
- tab.find_entry(p_key);
- cnt++;
- }
-#endif
- }
- avg = (double)total / cnt;
- std::cout << "num entries=" << cnt << ", total=" << total
- << ", avg=" << avg << ", max=" << max << std::endl;
-}
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-#include <cstdint>
-#include <cstddef>
-#include <iterator>
-#include "common/dout.h"
-#include "rgw_dedup_store.h"
-namespace rgw::dedup {
-
- // 24 Bytes key
- struct key_t {
- key_t() { ;}
- key_t(uint64_t _md5_high,
- uint64_t _md5_low,
- uint32_t _size_4k_units,
- uint16_t _num_parts,
- uint8_t _stor_class_idx) {
- md5_high = _md5_high;
- md5_low = _md5_low;
- size_4k_units = _size_4k_units;
- num_parts = _num_parts;
- stor_class_idx = _stor_class_idx;
- pad8 = 0;
- }
-
- bool operator==(const struct key_t& other) const {
- return (memcmp(this, &other, sizeof(other)) == 0);
- }
-
- bool operator!=(const struct key_t& other) const {
- return !operator==(other);
- }
-
- uint64_t hash() const {
- // The MD5 is already a hashing function so no need for another hash
- return this->md5_low;
- }
-
- bool multipart_object() const {
- return num_parts > 0;
- }
-
- uint64_t md5_high; // High Bytes of the Object Data MD5
- uint64_t md5_low; // Low Bytes of the Object Data MD5
- uint32_t size_4k_units; // Object size in 4KB units max out at 16TB (AWS MAX-SIZE is 5TB)
- uint16_t num_parts; // How many parts were used in multipart upload (AWS MAX-PART is 10,000)
- uint8_t stor_class_idx;// storage class id
- uint8_t pad8;
- } __attribute__((__packed__));
- static_assert(sizeof(key_t) == 24);
-
- class dedup_table_t {
- public:
- // 8 Bytes Value
- struct value_t {
- value_t() {
- this->block_idx = 0xFFFFFFFF;
- this->count = 0;
- this->rec_id = 0xFF;
- this->flags.clear();
- }
-
- value_t(disk_block_id_t block_id, record_id_t rec_id, bool shared_manifest) {
- this->block_idx = block_id;
- this->count = 1;
- this->rec_id = rec_id;
- this->flags.clear();
- this->flags.set_occupied();
- if (shared_manifest) {
- flags.set_shared_manifest();
- }
- }
-
- inline void clear_flags() { flags.clear(); }
- inline bool has_shared_manifest() const {return flags.has_shared_manifest(); }
- inline void set_shared_manifest_src() { this->flags.set_shared_manifest(); }
- inline bool is_singleton() const { return (count == 1); }
- inline bool is_occupied() const { return flags.is_occupied(); }
- inline void set_occupied() { this->flags.set_occupied(); }
- inline void clear_occupied() { this->flags.clear_occupied(); }
-
- disk_block_id_t block_idx; // 32 bits
- uint16_t count; // 16 bits
- record_id_t rec_id; // 8 bits
- dedup_flags_t flags; // 8 bits
- } __attribute__((__packed__));
- static_assert(sizeof(value_t) == 8);
-
- dedup_table_t(const DoutPrefixProvider* _dpp,
- uint32_t _head_object_size,
- uint8_t *p_slab,
- uint64_t slab_size);
- int add_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id,
- bool shared_manifest);
- void update_entry(key_t *p_key, disk_block_id_t block_id, record_id_t rec_id,
- bool shared_manifest);
-
- int get_val(const key_t *p_key, struct value_t *p_val /*OUT*/);
-
- int set_shared_manifest_src_mode(const key_t *p_key,
- disk_block_id_t block_id,
- record_id_t rec_id);
-
- void count_duplicates(dedup_stats_t *p_small_objs_stat,
- dedup_stats_t *p_big_objs_stat,
- uint64_t *p_duplicate_head_bytes);
-
- void remove_singletons_and_redistribute_keys();
- private:
- // 32 Bytes unified entries
- struct table_entry_t {
- key_t key;
- value_t val;
- } __attribute__((__packed__));
- static_assert(sizeof(table_entry_t) == 32);
-
- uint32_t find_entry(const key_t *p_key) const;
- uint32_t values_count = 0;
- uint32_t entries_count = 0;
- uint32_t occupied_count = 0;
- uint32_t head_object_size = (4ULL * 1024 * 1024);
- table_entry_t *hash_tab = nullptr;
-
- // stat counters
- uint64_t redistributed_count = 0;
- uint64_t redistributed_search_total = 0;
- uint64_t redistributed_search_max = 0;
- uint64_t redistributed_loopback = 0;
- uint64_t redistributed_perfect = 0;
- uint64_t redistributed_clear = 0;
- uint64_t redistributed_not_needed = 0;
- const DoutPrefixProvider* dpp;
- };
-
-} //namespace rgw::dedup
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "rgw_dedup_utils.h"
-#include "common/ceph_crypto.h"
-
-namespace rgw::dedup {
- //---------------------------------------------------------------------------
- std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type)
- {
- if (dedup_type == dedup_req_type_t::DEDUP_TYPE_NONE) {
- out << "DEDUP_TYPE_NONE";
- }
- else if (dedup_type == dedup_req_type_t::DEDUP_TYPE_ESTIMATE) {
- out << "DEDUP_TYPE_ESTIMATE";
- }
- else if (dedup_type == dedup_req_type_t::DEDUP_TYPE_FULL) {
- out << "DEDUP_TYPE_FULL";
- }
- else {
- out << "\n*** unexpected dedup_type ***\n";
- }
-
- return out;
- }
-
- //---------------------------------------------------------------------------
- dedup_stats_t& dedup_stats_t::operator+=(const dedup_stats_t& other)
- {
- this->singleton_count += other.singleton_count;
- this->unique_count += other.unique_count;
- this->duplicate_count += other.duplicate_count;
- this->dedup_bytes_estimate += other.dedup_bytes_estimate;
- return *this;
- }
-
- //---------------------------------------------------------------------------
- std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats)
- {
- out << "::singleton_count=" << stats.singleton_count
- << "::unique_count=" << stats.unique_count
- << "::duplicate_count=" << stats.duplicate_count
- << "::duplicated_bytes=" << stats.dedup_bytes_estimate;
- return out;
- }
-
- //---------------------------------------------------------------------------
- void encode(const dedup_stats_t& ds, ceph::bufferlist& bl)
- {
- ENCODE_START(1, 1, bl);
- encode(ds.singleton_count, bl);
- encode(ds.unique_count, bl);
- encode(ds.duplicate_count, bl);
- encode(ds.dedup_bytes_estimate, bl);
- ENCODE_FINISH(bl);
- }
-
- //---------------------------------------------------------------------------
- void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl)
- {
- DECODE_START(1, bl);
- decode(ds.singleton_count, bl);
- decode(ds.unique_count, bl);
- decode(ds.duplicate_count, bl);
- decode(ds.dedup_bytes_estimate, bl);
- DECODE_FINISH(bl);
- }
-
- // convert a hex-string to a 64bit integer (max 16 hex digits)
- //---------------------------------------------------------------------------
- bool hex2int(const char *p, const char *p_end, uint64_t *p_val)
- {
- if (p_end - p <= (int)(sizeof(uint64_t) * 2)) {
- uint64_t val = 0;
- while (p < p_end) {
- // get current character then increment
- uint8_t byte = *p++;
- // transform hex character to the 4bit equivalent number, using the ASCII table indexes
- if (byte >= '0' && byte <= '9') {
- byte = byte - '0';
- }
- else if (byte >= 'a' && byte <='f') {
- byte = byte - 'a' + 10;
- }
- else if (byte >= 'A' && byte <='F') {
- byte = byte - 'A' + 10;
- }
- else {
- // terminate on the first non hex char
- return false;
- }
- // shift 4 to make space for new digit, and add the 4 bits of the new digit
- val = (val << 4) | (byte & 0xF);
- }
- *p_val = val;
- return true;
- }
- else {
- return false;
- }
- }
-
- //---------------------------------------------------------------------------
- bool dec2int(const char *p, const char* p_end, uint16_t *p_val)
- {
- uint16_t val = 0;
- while (p < p_end) {
- uint8_t byte = *p++;
- if (byte >= '0' && byte <= '9') {
- val = val * 10 + (byte - '0');
- }
- else {
- // terminate on the first non hex char
- return false;
- }
- }
- *p_val = val;
- return true;
- }
-
- // 16Bytes MD5 takes 32 chars
- const unsigned MD5_LENGTH = 32;
-
- //---------------------------------------------------------------------------
- static bool get_num_parts(const std::string & etag, uint16_t *p_num_parts)
- {
- // Amazon S3 multipart upload Maximum number = 10,000
- const unsigned MAX_PARTS = 10000;
- if (etag.length() <= MD5_LENGTH) {
- // i.e. no multipart
- *p_num_parts = 0;
- return true;
- }
-
- // Amazon S3 multipart upload Maximum number = 10,000 (5 decimal digits)
- // We need 1 extra byte for the '-' delimiter and 1 extra byte for '"' at the end
- // 7 Bytes should suffice, but we roundup to 8 Bytes
- const unsigned MAX_PART_LEN = 8;
- if (unlikely(etag.length() > MD5_LENGTH + MAX_PART_LEN)) {
- // illegal ETAG
- return false;
- }
-
- std::string::size_type n = etag.find('-', etag.length() - MAX_PART_LEN);
- if (n != std::string::npos) {
- char buff[MAX_PART_LEN];
- // again, 1 extra byte for the '-' delimiter
- unsigned copy_size = etag.length() - (n + 1);
- if (copy_size <= MAX_PART_LEN) {
- unsigned nbytes = etag.copy(buff, copy_size, n+1);
- uint16_t num_parts;
- const unsigned MAX_UINT16_DIGITS = 5; // 65536
- if (nbytes <= MAX_UINT16_DIGITS) {
- if (dec2int(buff, buff+nbytes, &num_parts) && num_parts <= MAX_PARTS) {
- *p_num_parts = num_parts;
- return true;
- } // else, not all digits are legal
- } // else, more than 5 digits
- } // else, copy len too large
- } // else, '-' delimiter was not found
-
- // illegal number of parts
- return false;
- }
-
- //---------------------------------------------------------------------------
- bool parse_etag_string(const std::string& etag, parsed_etag_t *parsed_etag)
- {
- char buff[MD5_LENGTH*2];
- uint16_t num_parts = 0;
- if (get_num_parts(etag, &num_parts)) {
- etag.copy(buff, MD5_LENGTH, 0);
- uint64_t high, low;
- if (hex2int(buff, buff+16, &high)) {
- if (hex2int(buff+16, buff+32, &low)) {
- parsed_etag->md5_high = high; // High Bytes of the Object Data MD5
- parsed_etag->md5_low = low; // Low Bytes of the Object Data MD5
- parsed_etag->num_parts = num_parts; // How many parts were used in multipart upload
- return true;
- }
- }
- }
-
- // an illegal etag string
- return false;
- }
-
- //---------------------------------------------------------------------------
- void etag_to_bufferlist(uint64_t md5_high, uint64_t md5_low, uint16_t num_parts,
- ceph::bufferlist *bl)
- {
- char buff[64];
- int n = snprintf(buff, sizeof(buff), "%016lx%016lx", md5_high, md5_low);
- if (num_parts >= 1) {
- n += snprintf(buff + n, sizeof(buff) - n, "-%u", num_parts);
- }
- bl->append(buff, n);
- }
-
- //---------------------------------------------------------------------------
- const char* get_next_data_ptr(bufferlist::const_iterator &bl_itr,
- char data_buff[],
- size_t len,
- const DoutPrefixProvider* dpp)
- {
- const char *p = nullptr;
- size_t n = bl_itr.get_ptr_and_advance(len, &p);
- if (n == len) {
- // we got a zero-copy raw pointer to contiguous data on the buffer-list
- return p;
- }
-
- std::vector<int> vec;
- // otherwise - copy the data to the @data_buff
- char *p_buff = data_buff;
- do {
- vec.push_back(n);
- std::memcpy(p_buff, p, n);
- p_buff += n;
- len -= n;
- if (len > 0) {
- n = bl_itr.get_ptr_and_advance(len, &p);
- }
- } while (len > 0);
-
- ldpp_dout(dpp, 20) << __func__ << "::vec=" << vec << dendl;
- return data_buff;
- }
-
- static const char* s_urgent_msg_names[] = {
- "URGENT_MSG_NONE",
- "URGENT_MSG_ABORT",
- "URGENT_MSG_PASUE",
- "URGENT_MSG_RESUME",
- "URGENT_MSG_RESTART",
- "URGENT_MSG_INVALID"
- };
-
- //---------------------------------------------------------------------------
- const char* get_urgent_msg_names(int msg)
- {
- if (msg <= URGENT_MSG_INVALID && msg >= URGENT_MSG_NONE) {
- return s_urgent_msg_names[msg];
- }
- else {
- return s_urgent_msg_names[URGENT_MSG_INVALID];
- }
- }
-
- //---------------------------------------------------------------------------
- worker_stats_t& worker_stats_t::operator+=(const worker_stats_t& other)
- {
- this->ingress_obj += other.ingress_obj;
- this->ingress_obj_bytes += other.ingress_obj_bytes;
- this->egress_records += other.egress_records;
- this->egress_blocks += other.egress_blocks;
- this->egress_slabs += other.egress_slabs;
- this->single_part_objs += other.single_part_objs;
- this->multipart_objs += other.multipart_objs;
- this->small_multipart_obj += other.small_multipart_obj;
- this->default_storage_class_objs += other.default_storage_class_objs;
- this->default_storage_class_objs_bytes += other.default_storage_class_objs_bytes;
- this->non_default_storage_class_objs += other.non_default_storage_class_objs;
- this->non_default_storage_class_objs_bytes += other.non_default_storage_class_objs_bytes;
- this->ingress_corrupted_etag += other.ingress_corrupted_etag;
- this->ingress_skip_too_small_bytes += other.ingress_skip_too_small_bytes;
- this->ingress_skip_too_small += other.ingress_skip_too_small;
- this->ingress_skip_too_small_64KB_bytes += other.ingress_skip_too_small_64KB_bytes;
- this->ingress_skip_too_small_64KB += other.ingress_skip_too_small_64KB;
-
- return *this;
- }
- //---------------------------------------------------------------------------
- void worker_stats_t::dump(Formatter *f) const
- {
- // main section
- {
- Formatter::ObjectSection main(*f, "main");
-
- f->dump_unsigned("Ingress Objs count", this->ingress_obj);
- f->dump_unsigned("Accum byte size Ingress Objs", this->ingress_obj_bytes);
- f->dump_unsigned("Egress Records count", this->egress_records);
- f->dump_unsigned("Egress Blocks count", this->egress_blocks);
- f->dump_unsigned("Egress Slabs count", this->egress_slabs);
- f->dump_unsigned("Single part obj count", this->single_part_objs);
- f->dump_unsigned("Multipart obj count", this->multipart_objs);
- if (this->small_multipart_obj) {
- f->dump_unsigned("Small Multipart obj count", this->small_multipart_obj);
- }
- }
-
- {
- Formatter::ObjectSection notify(*f, "notify");
-
- if(this->non_default_storage_class_objs) {
- f->dump_unsigned("non default storage class objs",
- this->non_default_storage_class_objs);
- f->dump_unsigned("non default storage class objs bytes",
- this->non_default_storage_class_objs_bytes);
- }
- else {
- ceph_assert(this->default_storage_class_objs == this->ingress_obj);
- ceph_assert(this->default_storage_class_objs_bytes == this->ingress_obj_bytes);
- }
- }
-
- {
- Formatter::ObjectSection skipped(*f, "skipped");
- if(this->ingress_skip_too_small) {
- f->dump_unsigned("Ingress skip: too small objs",
- this->ingress_skip_too_small);
- f->dump_unsigned("Ingress skip: too small bytes",
- this->ingress_skip_too_small_bytes);
-
- if(this->ingress_skip_too_small_64KB) {
- f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Obj",
- this->ingress_skip_too_small_64KB);
- f->dump_unsigned("Ingress skip: 64KB<=size<=4MB Bytes",
- this->ingress_skip_too_small_64KB_bytes);
- }
- }
- }
-
- {
- Formatter::ObjectSection failed(*f, "failed");
- if(this->ingress_corrupted_etag) {
- f->dump_unsigned("Corrupted ETAG", this->ingress_corrupted_etag);
- }
- }
- }
-
- //---------------------------------------------------------------------------
- std::ostream& operator<<(std::ostream &out, const worker_stats_t &s)
- {
- JSONFormatter formatter(false);
- s.dump(&formatter);
- std::stringstream sstream;
- formatter.flush(sstream);
- out << sstream.str();
- return out;
- }
-
- //---------------------------------------------------------------------------
- void encode(const worker_stats_t& w, ceph::bufferlist& bl)
- {
- ENCODE_START(1, 1, bl);
- encode(w.ingress_obj, bl);
- encode(w.ingress_obj_bytes, bl);
- encode(w.egress_records, bl);
- encode(w.egress_blocks, bl);
- encode(w.egress_slabs, bl);
-
- encode(w.single_part_objs, bl);
- encode(w.multipart_objs, bl);
- encode(w.small_multipart_obj, bl);
-
- encode(w.default_storage_class_objs, bl);
- encode(w.default_storage_class_objs_bytes, bl);
- encode(w.non_default_storage_class_objs, bl);
- encode(w.non_default_storage_class_objs_bytes, bl);
-
- encode(w.ingress_corrupted_etag, bl);
-
- encode(w.ingress_skip_too_small_bytes, bl);
- encode(w.ingress_skip_too_small, bl);
-
- encode(w.ingress_skip_too_small_64KB_bytes, bl);
- encode(w.ingress_skip_too_small_64KB, bl);
-
- encode(w.duration, bl);
- ENCODE_FINISH(bl);
- }
-
- //---------------------------------------------------------------------------
- void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl)
- {
- DECODE_START(1, bl);
- decode(w.ingress_obj, bl);
- decode(w.ingress_obj_bytes, bl);
- decode(w.egress_records, bl);
- decode(w.egress_blocks, bl);
- decode(w.egress_slabs, bl);
- decode(w.single_part_objs, bl);
- decode(w.multipart_objs, bl);
- decode(w.small_multipart_obj, bl);
- decode(w.default_storage_class_objs, bl);
- decode(w.default_storage_class_objs_bytes, bl);
- decode(w.non_default_storage_class_objs, bl);
- decode(w.non_default_storage_class_objs_bytes, bl);
- decode(w.ingress_corrupted_etag, bl);
- decode(w.ingress_skip_too_small_bytes, bl);
- decode(w.ingress_skip_too_small, bl);
- decode(w.ingress_skip_too_small_64KB_bytes, bl);
- decode(w.ingress_skip_too_small_64KB, bl);
-
- decode(w.duration, bl);
- DECODE_FINISH(bl);
- }
-
- //---------------------------------------------------------------------------
- md5_stats_t& md5_stats_t::operator+=(const md5_stats_t& other)
- {
- this->small_objs_stat += other.small_objs_stat;
- this->big_objs_stat += other.big_objs_stat;
- this->ingress_failed_load_bucket += other.ingress_failed_load_bucket;
- this->ingress_failed_get_object += other.ingress_failed_get_object;
- this->ingress_failed_get_obj_attrs += other.ingress_failed_get_obj_attrs;
- this->ingress_corrupted_etag += other.ingress_corrupted_etag;
- this->ingress_corrupted_obj_attrs += other.ingress_corrupted_obj_attrs;
- this->ingress_skip_encrypted += other.ingress_skip_encrypted;
- this->ingress_skip_encrypted_bytes += other.ingress_skip_encrypted_bytes;
- this->ingress_skip_compressed += other.ingress_skip_compressed;
- this->ingress_skip_compressed_bytes += other.ingress_skip_compressed_bytes;
- this->ingress_skip_changed_objs += other.ingress_skip_changed_objs;
- this->shared_manifest_dedup_bytes += other.shared_manifest_dedup_bytes;
-
- this->skipped_shared_manifest += other.skipped_shared_manifest;
- this->skipped_purged_small += other.skipped_purged_small;
- this->skipped_singleton += other.skipped_singleton;
- this->skipped_singleton_bytes += other.skipped_singleton_bytes;
- this->skipped_source_record += other.skipped_source_record;
- this->duplicate_records += other.duplicate_records;
- this->size_mismatch += other.size_mismatch;
- this->hash_mismatch += other.hash_mismatch;
- this->failed_src_load += other.failed_src_load;
- this->failed_rec_load += other.failed_rec_load;
- this->failed_block_load += other.failed_block_load;
-
- this->valid_hash_attrs += other.valid_hash_attrs;
- this->invalid_hash_attrs += other.invalid_hash_attrs;
- this->set_hash_attrs += other.set_hash_attrs;
- this->skip_hash_cmp += other.skip_hash_cmp;
-
- this->set_shared_manifest_src += other.set_shared_manifest_src;
- this->loaded_objects += other.loaded_objects;
- this->processed_objects += other.processed_objects;
- this->dup_head_bytes_estimate += other.dup_head_bytes_estimate;
- this->deduped_objects += other.deduped_objects;
- this->deduped_objects_bytes += other.deduped_objects_bytes;
- this->dup_head_bytes += other.dup_head_bytes;
-
- this->failed_dedup += other.failed_dedup;
- this->failed_table_load += other.failed_table_load;
- this->failed_map_overflow += other.failed_map_overflow;
- return *this;
- }
-
- //---------------------------------------------------------------------------
- std::ostream& operator<<(std::ostream &out, const md5_stats_t &s)
- {
- JSONFormatter formatter(false);
- s.dump(&formatter);
- std::stringstream sstream;
- formatter.flush(sstream);
- out << sstream.str();
- return out;
- }
-
- //---------------------------------------------------------------------------
- void md5_stats_t::dump(Formatter *f) const
- {
- // main section
- {
- Formatter::ObjectSection main(*f, "main");
-
- f->dump_unsigned("Total processed objects", this->processed_objects);
- f->dump_unsigned("Loaded objects", this->loaded_objects);
- f->dump_unsigned("Set Shared-Manifest SRC", this->set_shared_manifest_src);
- f->dump_unsigned("Deduped Obj (this cycle)", this->deduped_objects);
- f->dump_unsigned("Deduped Bytes(this cycle)", this->deduped_objects_bytes);
- f->dump_unsigned("Dup head bytes (not dedup)", this->dup_head_bytes);
- f->dump_unsigned("Already Deduped bytes (prev cycles)",
- this->shared_manifest_dedup_bytes);
-
- const dedup_stats_t &ds = this->big_objs_stat;
- f->dump_unsigned("Singleton Obj", ds.singleton_count);
- f->dump_unsigned("Unique Obj", ds.unique_count);
- f->dump_unsigned("Duplicate Obj", ds.duplicate_count);
- f->dump_unsigned("Dedup Bytes Estimate", ds.dedup_bytes_estimate);
- }
-
- // Potential Dedup Section:
- // What could be gained by allowing dedup for smaller objects (64KB-4MB)
- // Space wasted because of duplicated head-object (4MB)
- {
- Formatter::ObjectSection potential(*f, "Potential Dedup");
- const dedup_stats_t &ds = this->small_objs_stat;
- f->dump_unsigned("Singleton Obj (64KB-4MB)", ds.singleton_count);
- f->dump_unsigned("Unique Obj (64KB-4MB)", ds.unique_count);
- f->dump_unsigned("Duplicate Obj (64KB-4MB)", ds.duplicate_count);
- f->dump_unsigned("Dedup Bytes Estimate (64KB-4MB)", ds.dedup_bytes_estimate);
- f->dump_unsigned("Duplicated Head Bytes Estimate",
- this->dup_head_bytes_estimate);
- f->dump_unsigned("Duplicated Head Bytes", this->dup_head_bytes);
- }
-
- {
- Formatter::ObjectSection notify(*f, "notify");
- if (this->failed_table_load) {
- f->dump_unsigned("Failed Table Load", this->failed_table_load);
- }
- if (this->failed_map_overflow) {
- f->dump_unsigned("Failed Remap Overflow", this->failed_map_overflow);
- }
-
- f->dump_unsigned("Valid HASH attrs", this->valid_hash_attrs);
- f->dump_unsigned("Invalid HASH attrs", this->invalid_hash_attrs);
-
- if (this->set_hash_attrs) {
- f->dump_unsigned("Set HASH", this->set_hash_attrs);
- }
-
- if (this->skip_hash_cmp) {
- f->dump_unsigned("Can't run HASH compare", this->skip_hash_cmp);
- }
- }
-
- {
- Formatter::ObjectSection skipped(*f, "skipped");
- f->dump_unsigned("Skipped shared_manifest", this->skipped_shared_manifest);
- f->dump_unsigned("Skipped purged small objs", this->skipped_purged_small);
- f->dump_unsigned("Skipped singleton objs", this->skipped_singleton);
- if (this->skipped_singleton) {
- f->dump_unsigned("Skipped singleton Bytes", this->skipped_singleton_bytes);
- }
- f->dump_unsigned("Skipped source record", this->skipped_source_record);
-
- if (this->ingress_skip_encrypted) {
- f->dump_unsigned("Skipped Encrypted objs", this->ingress_skip_encrypted);
- f->dump_unsigned("Skipped Encrypted Bytes",this->ingress_skip_encrypted_bytes);
- }
- if (this->ingress_skip_compressed) {
- f->dump_unsigned("Skipped Compressed objs", this->ingress_skip_compressed);
- f->dump_unsigned("Skipped Compressed Bytes", this->ingress_skip_compressed_bytes);
- }
- if (this->ingress_skip_changed_objs) {
- f->dump_unsigned("Skipped Changed Object", this->ingress_skip_changed_objs);
- }
- }
-
- {
- Formatter::ObjectSection sys_failures(*f, "system failures");
- if (this->ingress_failed_load_bucket) {
- f->dump_unsigned("Failed load_bucket()", this->ingress_failed_load_bucket);
- }
- if (this->ingress_failed_get_object) {
- f->dump_unsigned("Failed get_object()", this->ingress_failed_get_object);
- }
- if (this->ingress_failed_get_obj_attrs) {
- f->dump_unsigned("Failed get_obj_attrs", this->ingress_failed_get_obj_attrs);
- }
- if (this->ingress_corrupted_etag) {
- f->dump_unsigned("Corrupted ETAG", this->ingress_corrupted_etag);
- }
- if (this->ingress_corrupted_obj_attrs) {
- f->dump_unsigned("Corrupted obj attributes", this->ingress_corrupted_obj_attrs);
- }
- if (this->failed_src_load) {
- f->dump_unsigned("Failed SRC-Load ", this->failed_src_load);
- }
- if (this->failed_rec_load) {
- f->dump_unsigned("Failed Record-Load ", this->failed_rec_load);
- }
- if (this->failed_block_load) {
- f->dump_unsigned("Failed Block-Load ", this->failed_block_load);
- }
- if (this->failed_dedup) {
- f->dump_unsigned("Failed Dedup", this->failed_dedup);
- }
- }
-
- {
- Formatter::ObjectSection logical_failures(*f, "logical failures");
- if (this->hash_mismatch) {
- f->dump_unsigned("HASH mismatch", this->hash_mismatch);
- }
- if (this->duplicate_records) {
- f->dump_unsigned("Duplicate SRC/TGT", this->duplicate_records);
- }
- if (this->size_mismatch) {
- f->dump_unsigned("Size mismatch SRC/TGT", this->size_mismatch);
- }
- }
- }
-
- //---------------------------------------------------------------------------
- void encode(const md5_stats_t& m, ceph::bufferlist& bl)
- {
- ENCODE_START(1, 1, bl);
-
- encode(m.small_objs_stat, bl);
- encode(m.big_objs_stat, bl);
- encode(m.ingress_failed_load_bucket, bl);
- encode(m.ingress_failed_get_object, bl);
- encode(m.ingress_failed_get_obj_attrs, bl);
- encode(m.ingress_corrupted_etag, bl);
- encode(m.ingress_corrupted_obj_attrs, bl);
- encode(m.ingress_skip_encrypted, bl);
- encode(m.ingress_skip_encrypted_bytes, bl);
- encode(m.ingress_skip_compressed, bl);
- encode(m.ingress_skip_compressed_bytes, bl);
- encode(m.ingress_skip_changed_objs, bl);
- encode(m.shared_manifest_dedup_bytes, bl);
-
- encode(m.skipped_shared_manifest, bl);
- encode(m.skipped_purged_small, bl);
- encode(m.skipped_singleton, bl);
- encode(m.skipped_singleton_bytes, bl);
- encode(m.skipped_source_record, bl);
- encode(m.duplicate_records, bl);
- encode(m.size_mismatch, bl);
- encode(m.hash_mismatch, bl);
- encode(m.failed_src_load, bl);
- encode(m.failed_rec_load, bl);
- encode(m.failed_block_load, bl);
-
- encode(m.valid_hash_attrs, bl);
- encode(m.invalid_hash_attrs, bl);
- encode(m.set_hash_attrs, bl);
- encode(m.skip_hash_cmp, bl);
- encode(m.set_shared_manifest_src, bl);
-
- encode(m.loaded_objects, bl);
- encode(m.processed_objects, bl);
- encode(m.dup_head_bytes_estimate, bl);
- encode(m.deduped_objects, bl);
- encode(m.deduped_objects_bytes, bl);
- encode(m.dup_head_bytes, bl);
- encode(m.failed_dedup, bl);
- encode(m.failed_table_load, bl);
- encode(m.failed_map_overflow, bl);
-
- encode(m.duration, bl);
- ENCODE_FINISH(bl);
- }
-
- //---------------------------------------------------------------------------
- void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl)
- {
- DECODE_START(1, bl);
- decode(m.small_objs_stat, bl);
- decode(m.big_objs_stat, bl);
- decode(m.ingress_failed_load_bucket, bl);
- decode(m.ingress_failed_get_object, bl);
- decode(m.ingress_failed_get_obj_attrs, bl);
- decode(m.ingress_corrupted_etag, bl);
- decode(m.ingress_corrupted_obj_attrs, bl);
- decode(m.ingress_skip_encrypted, bl);
- decode(m.ingress_skip_encrypted_bytes, bl);
- decode(m.ingress_skip_compressed, bl);
- decode(m.ingress_skip_compressed_bytes, bl);
- decode(m.ingress_skip_changed_objs, bl);
- decode(m.shared_manifest_dedup_bytes, bl);
-
- decode(m.skipped_shared_manifest, bl);
- decode(m.skipped_purged_small, bl);
- decode(m.skipped_singleton, bl);
- decode(m.skipped_singleton_bytes, bl);
- decode(m.skipped_source_record, bl);
- decode(m.duplicate_records, bl);
- decode(m.size_mismatch, bl);
- decode(m.hash_mismatch, bl);
- decode(m.failed_src_load, bl);
- decode(m.failed_rec_load, bl);
- decode(m.failed_block_load, bl);
-
- decode(m.valid_hash_attrs, bl);
- decode(m.invalid_hash_attrs, bl);
- decode(m.set_hash_attrs, bl);
- decode(m.skip_hash_cmp, bl);
- decode(m.set_shared_manifest_src, bl);
-
- decode(m.loaded_objects, bl);
- decode(m.processed_objects, bl);
- decode(m.dup_head_bytes_estimate, bl);
- decode(m.deduped_objects, bl);
- decode(m.deduped_objects_bytes, bl);
- decode(m.dup_head_bytes, bl);
- decode(m.failed_dedup, bl);
- decode(m.failed_table_load, bl);
- decode(m.failed_map_overflow, bl);
-
- decode(m.duration, bl);
- DECODE_FINISH(bl);
- }
-} //namespace rgw::dedup
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2;
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Author: Gabriel BenHanokh <gbenhano@redhat.com>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#pragma once
-#include <string>
-#include "include/rados/buffer.h"
-#include "include/encoding.h"
-#include "common/Formatter.h"
-#include "common/ceph_json.h"
-#include <time.h>
-#include "include/utime.h"
-#include "include/encoding.h"
-#include "common/dout.h"
-
-#define FULL_DEDUP_SUPPORT
-namespace rgw::dedup {
- using work_shard_t = uint16_t;
- using md5_shard_t = uint16_t;
-
- // settings to help debug small systems
- const work_shard_t MIN_WORK_SHARD = 2;
- const md5_shard_t MIN_MD5_SHARD = 4;
-
- // Those are the correct values for production system
- const work_shard_t MAX_WORK_SHARD = 255;
- const md5_shard_t MAX_MD5_SHARD = 512;
-
- const work_shard_t NULL_WORK_SHARD = 0xFFFF;
- const md5_shard_t NULL_MD5_SHARD = 0xFFFF;
- const unsigned NULL_SHARD = 0xFFFF;
-
- // work_shard is an 8 bits int with 255 legal values for the first iteration
- // and one value (0xFF) reserved for second iteration
- const unsigned WORK_SHARD_HARD_LIMIT = 0x0FF;
- // md5_shard_t is a 12 bits int with 4096 possible values
- const unsigned MD5_SHARD_HARD_LIMIT = 0xFFF;
-
- static_assert(MAX_WORK_SHARD < NULL_WORK_SHARD);
- static_assert(MAX_WORK_SHARD < NULL_SHARD);
- static_assert(MAX_WORK_SHARD <= WORK_SHARD_HARD_LIMIT);
- static_assert(MAX_MD5_SHARD < NULL_MD5_SHARD);
- static_assert(MAX_MD5_SHARD < NULL_SHARD);
- static_assert(MAX_MD5_SHARD <= MD5_SHARD_HARD_LIMIT);
-
- //---------------------------------------------------------------------------
- enum dedup_req_type_t {
- DEDUP_TYPE_NONE = 0,
- DEDUP_TYPE_ESTIMATE = 1,
- DEDUP_TYPE_FULL = 2
- };
-
- std::ostream& operator<<(std::ostream &out, const dedup_req_type_t& dedup_type);
- struct __attribute__ ((packed)) dedup_flags_t {
- private:
- static constexpr uint8_t RGW_DEDUP_FLAG_HASH_CALCULATED = 0x01; // REC
- static constexpr uint8_t RGW_DEDUP_FLAG_SHARED_MANIFEST = 0x02; // REC + TAB
- static constexpr uint8_t RGW_DEDUP_FLAG_OCCUPIED = 0x04; // TAB
- static constexpr uint8_t RGW_DEDUP_FLAG_FASTLANE = 0x08; // REC
-
- public:
- dedup_flags_t() : flags(0) {}
- dedup_flags_t(uint8_t _flags) : flags(_flags) {}
- inline void clear() { this->flags = 0; }
- inline bool hash_calculated() const { return ((flags & RGW_DEDUP_FLAG_HASH_CALCULATED) != 0); }
- inline void set_hash_calculated() { flags |= RGW_DEDUP_FLAG_HASH_CALCULATED; }
- inline bool has_shared_manifest() const { return ((flags & RGW_DEDUP_FLAG_SHARED_MANIFEST) != 0); }
- inline void set_shared_manifest() { flags |= RGW_DEDUP_FLAG_SHARED_MANIFEST; }
- inline bool is_occupied() const {return ((this->flags & RGW_DEDUP_FLAG_OCCUPIED) != 0); }
- inline void set_occupied() {this->flags |= RGW_DEDUP_FLAG_OCCUPIED; }
- inline void clear_occupied() { this->flags &= ~RGW_DEDUP_FLAG_OCCUPIED; }
- inline bool is_fastlane() const { return ((flags & RGW_DEDUP_FLAG_FASTLANE) != 0); }
- inline void set_fastlane() { flags |= RGW_DEDUP_FLAG_FASTLANE; }
- private:
- uint8_t flags;
- };
-
- struct dedup_stats_t {
- dedup_stats_t& operator+=(const dedup_stats_t& other);
-
- uint64_t singleton_count = 0;
- uint64_t unique_count = 0;
- uint64_t duplicate_count = 0;
- uint64_t dedup_bytes_estimate = 0;
- };
-
- std::ostream& operator<<(std::ostream &out, const dedup_stats_t& stats);
- void encode(const dedup_stats_t& ds, ceph::bufferlist& bl);
- void decode(dedup_stats_t& ds, ceph::bufferlist::const_iterator& bl);
-
- struct worker_stats_t {
- worker_stats_t& operator +=(const worker_stats_t& other);
- void dump(Formatter *f) const;
-
- uint64_t ingress_obj = 0;
- uint64_t ingress_obj_bytes = 0;
- uint64_t egress_records = 0;
- uint64_t egress_blocks = 0;
- uint64_t egress_slabs = 0;
-
- uint64_t single_part_objs = 0;
- uint64_t multipart_objs = 0;
- uint64_t small_multipart_obj = 0;
-
- uint64_t default_storage_class_objs = 0;
- uint64_t default_storage_class_objs_bytes = 0;
-
- uint64_t non_default_storage_class_objs = 0;
- uint64_t non_default_storage_class_objs_bytes = 0;
-
- uint64_t ingress_corrupted_etag = 0;
-
- uint64_t ingress_skip_too_small_bytes = 0;
- uint64_t ingress_skip_too_small = 0;
-
- uint64_t ingress_skip_too_small_64KB_bytes = 0;
- uint64_t ingress_skip_too_small_64KB = 0;
-
- utime_t duration = {0, 0};
- };
- std::ostream& operator<<(std::ostream &out, const worker_stats_t &s);
- void encode(const worker_stats_t& w, ceph::bufferlist& bl);
- void decode(worker_stats_t& w, ceph::bufferlist::const_iterator& bl);
-
-
- struct md5_stats_t {
- md5_stats_t& operator +=(const md5_stats_t& other);
- void dump(Formatter *f) const;
-
- dedup_stats_t small_objs_stat;
- dedup_stats_t big_objs_stat;
- uint64_t ingress_failed_load_bucket = 0;
- uint64_t ingress_failed_get_object = 0;
- uint64_t ingress_failed_get_obj_attrs = 0;
- uint64_t ingress_corrupted_etag = 0;
- uint64_t ingress_corrupted_obj_attrs = 0;
- uint64_t ingress_skip_encrypted = 0;
- uint64_t ingress_skip_encrypted_bytes = 0;
- uint64_t ingress_skip_compressed = 0;
- uint64_t ingress_skip_compressed_bytes = 0;
- uint64_t ingress_skip_changed_objs = 0;
-
- uint64_t shared_manifest_dedup_bytes = 0;
- uint64_t skipped_shared_manifest = 0;
- uint64_t skipped_purged_small = 0;
- uint64_t skipped_singleton = 0;
- uint64_t skipped_singleton_bytes = 0;
- uint64_t skipped_source_record = 0;
- uint64_t duplicate_records = 0;
- uint64_t size_mismatch = 0;
- uint64_t hash_mismatch = 0;
- uint64_t failed_src_load = 0;
- uint64_t failed_rec_load = 0;
- uint64_t failed_block_load = 0;
-
- uint64_t valid_hash_attrs = 0;
- uint64_t invalid_hash_attrs = 0;
- uint64_t set_hash_attrs = 0;
- uint64_t skip_hash_cmp = 0;
-
- uint64_t set_shared_manifest_src = 0;
- uint64_t loaded_objects = 0;
- uint64_t processed_objects = 0;
- // counter is using on-disk size affected by block-size
- uint64_t dup_head_bytes_estimate = 0; //duplicate_head_bytes
- uint64_t deduped_objects = 0;
- // counter is using s3 byte size disregarding the on-disk size affected by block-size
- uint64_t deduped_objects_bytes = 0;
- uint64_t dup_head_bytes = 0;
- uint64_t failed_dedup = 0;
- uint64_t failed_table_load = 0;
- uint64_t failed_map_overflow = 0;
- utime_t duration = {0, 0};
- };
- std::ostream &operator<<(std::ostream &out, const md5_stats_t &s);
- void encode(const md5_stats_t& m, ceph::bufferlist& bl);
- void decode(md5_stats_t& m, ceph::bufferlist::const_iterator& bl);
-
- struct parsed_etag_t {
- uint64_t md5_high; // High Bytes of the Object Data MD5
- uint64_t md5_low; // Low Bytes of the Object Data MD5
- uint16_t num_parts; // How many parts were used in multipart upload
- // Setting num_parts to zero when multipart is not used
- };
-
-#define DIV_UP(a, b) ( ((a)+(b-1)) / b)
- // CEPH min allocation unit on disk is 4KB
- // TBD: take from config
- static constexpr uint64_t DISK_ALLOC_SIZE = 4*1024;
- // 16 bytes hexstring -> 8 Byte uint64_t
- static inline constexpr unsigned HEX_UNIT_SIZE = 16;
-
- //---------------------------------------------------------------------------
- static inline uint64_t byte_size_to_disk_blocks(uint64_t byte_size) {
- return DIV_UP(byte_size, DISK_ALLOC_SIZE);
- }
-
- //---------------------------------------------------------------------------
- static inline uint64_t disk_blocks_to_byte_size(uint64_t disk_blocks) {
- return disk_blocks * DISK_ALLOC_SIZE;
- }
-
- //---------------------------------------------------------------------------
- // ceph store full blocks so need to round up and multiply by block_size
- static inline uint64_t calc_on_disk_byte_size(uint64_t byte_size) {
- uint64_t size_4k_units = byte_size_to_disk_blocks(byte_size);
- return disk_blocks_to_byte_size(size_4k_units);
- }
-
- enum urgent_msg_t {
- URGENT_MSG_NONE = 0,
- URGENT_MSG_ABORT = 1,
- URGENT_MSG_PASUE = 2,
- URGENT_MSG_RESUME = 3,
- URGENT_MSG_RESTART = 4,
- URGENT_MSG_INVALID = 5
- };
-
- const char* get_urgent_msg_names(int msg);
- bool hex2int(const char *p, const char *p_end, uint64_t *p_val);
- bool parse_etag_string(const std::string& etag, parsed_etag_t *parsed_etag);
- void etag_to_bufferlist(uint64_t md5_high, uint64_t md5_low, uint16_t num_parts,
- ceph::bufferlist *bl);
- const char* get_next_data_ptr(bufferlist::const_iterator &bl_itr,
- char data_buff[],
- size_t len,
- const DoutPrefixProvider* dpp);
-
- //---------------------------------------------------------------------------
- static inline void build_oid(const std::string &bucket_id,
- const std::string &obj_name,
- std::string *oid)
- {
- *oid = bucket_id + "_" + obj_name;
- }
-
- //---------------------------------------------------------------------------
- static inline uint64_t calc_deduped_bytes(uint64_t head_obj_size,
- uint16_t num_parts,
- uint64_t size_bytes)
- {
- if (num_parts > 0) {
- // multipart objects with an empty head i.e. we achive full dedup
- return size_bytes;
- }
- else {
- // reduce the head size
- if (size_bytes > head_obj_size) {
- return size_bytes - head_obj_size;
- }
- else {
- return 0;
- }
- }
- }
-
-} //namespace rgw::dedup
}
main.init_lua();
+#ifdef WITH_RADOSGW_RADOS
main.init_dedup();
+#endif
return 0;
} /* RGWLib::init() */
main.init_opslog();
main.init_tracepoints();
main.init_lua();
+#ifdef WITH_RADOSGW_RADOS
main.init_dedup();
+#endif
r = main.init_frontends2(nullptr /* RGWLib */);
if (r != 0) {
derr << "ERROR: initialize frontend fail, r = " << r << dendl;